1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 
117 #include <linux/uaccess.h>
118 
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/net_namespace.h>
123 #include <net/request_sock.h>
124 #include <net/sock.h>
125 #include <linux/net_tstamp.h>
126 #include <net/xfrm.h>
127 #include <linux/ipsec.h>
128 #include <net/cls_cgroup.h>
129 #include <net/netprio_cgroup.h>
130 #include <linux/sock_diag.h>
131 
132 #include <linux/filter.h>
133 #include <net/sock_reuseport.h>
134 #include <net/bpf_sk_storage.h>
135 
136 #include <trace/events/sock.h>
137 #include <trace/hooks/net.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 static void sock_inuse_add(struct net *net, int val);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had the capability @cap when
154  * the socket was created and if the current process has the capability
155  * @cap in the user namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had the capability @cap when
171  * the socket was created and if the current process has the capability
172  * @cap in all user namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had the capability @cap when the
186  * socket was created and if the current process has the capability @cap over
187  * the network namespace the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
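
/*
 * Example (illustrative, not taken from this file): a protocol that wants
 * to gate a privileged socket option on both the socket opener's and the
 * current task's privileges could use the helpers above roughly like this;
 * the option name MYPROTO_SO_PRIV is hypothetical.
 *
 *	static int myproto_setsockopt(struct sock *sk, int optname, int val)
 *	{
 *		if (optname == MYPROTO_SO_PRIV &&
 *		    !sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 *
 * sk_capable() performs the same test against &init_user_ns, and
 * sk_ns_capable() against an explicitly supplied user namespace.
 */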
194 
195 /*
196  * Each address family might have different locking rules, so we have
197  * one slock key per address family and separate keys for internal and
198  * userspace sockets.
199  */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (we pre-construct these
207  * strings at build time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 
211 #define _sock_locks(x)						  \
212   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
213   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
214   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
215   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
216   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
217   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
218   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
219   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
220   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
221   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
222   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
223   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
224   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
225   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
226   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
227   x "AF_MAX"
228 
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 	_sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 	_sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 	_sock_locks("clock-")
237 };
238 
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 	_sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 	_sock_locks("rlock-")
250 };
251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("wlock-")
253 };
254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 	_sock_locks("elock-")
256 };
257 
258 /*
259  * sk_callback_lock and sk queues locking rules are per-address-family,
260  * so split the lock classes by using a per-AF key:
261  */
262 static struct lock_class_key af_callback_keys[AF_MAX];
263 static struct lock_class_key af_rlock_keys[AF_MAX];
264 static struct lock_class_key af_wlock_keys[AF_MAX];
265 static struct lock_class_key af_elock_keys[AF_MAX];
266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275 
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
279 
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281 
282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
284 
285 /**
286  * sk_set_memalloc - sets %SOCK_MEMALLOC
287  * @sk: socket to set it on
288  *
289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290  * It's the responsibility of the admin to adjust min_free_kbytes
291  * to meet the requirements.
292  */
293 void sk_set_memalloc(struct sock *sk)
294 {
295 	sock_set_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation |= __GFP_MEMALLOC;
297 	static_branch_inc(&memalloc_socks_key);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
300 
301 void sk_clear_memalloc(struct sock *sk)
302 {
303 	sock_reset_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation &= ~__GFP_MEMALLOC;
305 	static_branch_dec(&memalloc_socks_key);
306 
307 	/*
308 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 	 * it has rmem allocations due to the last swapfile being deactivated
311 	 * but there is a risk that the socket is unusable due to exceeding
312 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 	 */
314 	sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
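
/*
 * Example (illustrative sketch): a network block device used as a swap
 * backend might mark its transport socket with sk_set_memalloc() so that
 * writeout under memory pressure can still make progress, and clear the
 * flag again when swap is disabled on it. The structure and function
 * names below are hypothetical.
 *
 *	static int myswap_enable(struct myswap_dev *dev)
 *	{
 *		sk_set_memalloc(dev->sock->sk);
 *		return 0;
 *	}
 *
 *	static void myswap_disable(struct myswap_dev *dev)
 *	{
 *		sk_clear_memalloc(dev->sock->sk);
 *	}
 */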
317 
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned int noreclaim_flag;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	noreclaim_flag = memalloc_noreclaim_save();
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	memalloc_noreclaim_restore(noreclaim_flag);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 	struct __kernel_sock_timeval tv;
337 	int size;
338 
339 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
340 		tv.tv_sec = 0;
341 		tv.tv_usec = 0;
342 	} else {
343 		tv.tv_sec = timeo / HZ;
344 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
345 	}
346 
347 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
348 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
349 		*(struct old_timeval32 *)optval = tv32;
350 		return sizeof(tv32);
351 	}
352 
353 	if (old_timeval) {
354 		struct __kernel_old_timeval old_tv;
355 		old_tv.tv_sec = tv.tv_sec;
356 		old_tv.tv_usec = tv.tv_usec;
357 		*(struct __kernel_old_timeval *)optval = old_tv;
358 		size = sizeof(old_tv);
359 	} else {
360 		*(struct __kernel_sock_timeval *)optval = tv;
361 		size = sizeof(tv);
362 	}
363 
364 	return size;
365 }
366 
367 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
368 {
369 	struct __kernel_sock_timeval tv;
370 
371 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
372 		struct old_timeval32 tv32;
373 
374 		if (optlen < sizeof(tv32))
375 			return -EINVAL;
376 
377 		if (copy_from_user(&tv32, optval, sizeof(tv32)))
378 			return -EFAULT;
379 		tv.tv_sec = tv32.tv_sec;
380 		tv.tv_usec = tv32.tv_usec;
381 	} else if (old_timeval) {
382 		struct __kernel_old_timeval old_tv;
383 
384 		if (optlen < sizeof(old_tv))
385 			return -EINVAL;
386 		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
387 			return -EFAULT;
388 		tv.tv_sec = old_tv.tv_sec;
389 		tv.tv_usec = old_tv.tv_usec;
390 	} else {
391 		if (optlen < sizeof(tv))
392 			return -EINVAL;
393 		if (copy_from_user(&tv, optval, sizeof(tv)))
394 			return -EFAULT;
395 	}
396 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
397 		return -EDOM;
398 
399 	if (tv.tv_sec < 0) {
400 		static int warned __read_mostly;
401 
402 		*timeo_p = 0;
403 		if (warned < 10 && net_ratelimit()) {
404 			warned++;
405 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
406 				__func__, current->comm, task_pid_nr(current));
407 		}
408 		return 0;
409 	}
410 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
411 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
412 		return 0;
413 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
414 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
415 	return 0;
416 }
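
/*
 * Worked example for the conversion above (values are illustrative): with
 * HZ == 250, a user-supplied timeout of { .tv_sec = 1, .tv_usec = 500000 }
 * becomes
 *
 *	*timeo_p = 1 * 250 + DIV_ROUND_UP(500000, 1000000 / 250)
 *		 = 250 + DIV_ROUND_UP(500000, 4000)
 *		 = 250 + 125 = 375 jiffies
 *
 * while { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT (block forever) and a
 * negative tv_sec is clamped to a zero timeout.
 */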
417 
418 static void sock_warn_obsolete_bsdism(const char *name)
419 {
420 	static int warned;
421 	static char warncomm[TASK_COMM_LEN];
422 	if (strcmp(warncomm, current->comm) && warned < 5) {
423 		strcpy(warncomm,  current->comm);
424 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
425 			warncomm, name);
426 		warned++;
427 	}
428 }
429 
430 static bool sock_needs_netstamp(const struct sock *sk)
431 {
432 	switch (sk->sk_family) {
433 	case AF_UNSPEC:
434 	case AF_UNIX:
435 		return false;
436 	default:
437 		return true;
438 	}
439 }
440 
441 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
442 {
443 	if (sk->sk_flags & flags) {
444 		sk->sk_flags &= ~flags;
445 		if (sock_needs_netstamp(sk) &&
446 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
447 			net_disable_timestamp();
448 	}
449 }
450 
451 
452 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
453 {
454 	unsigned long flags;
455 	struct sk_buff_head *list = &sk->sk_receive_queue;
456 
457 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
458 		atomic_inc(&sk->sk_drops);
459 		trace_sock_rcvqueue_full(sk, skb);
460 		return -ENOMEM;
461 	}
462 
463 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
464 		atomic_inc(&sk->sk_drops);
465 		return -ENOBUFS;
466 	}
467 
468 	skb->dev = NULL;
469 	skb_set_owner_r(skb, sk);
470 
471 	/* We escape from the RCU-protected region, make sure we don't leak
472 	 * a non-refcounted dst
473 	 */
474 	skb_dst_force(skb);
475 
476 	spin_lock_irqsave(&list->lock, flags);
477 	sock_skb_set_dropcount(sk, skb);
478 	__skb_queue_tail(list, skb);
479 	spin_unlock_irqrestore(&list->lock, flags);
480 
481 	if (!sock_flag(sk, SOCK_DEAD))
482 		sk->sk_data_ready(sk);
483 	return 0;
484 }
485 EXPORT_SYMBOL(__sock_queue_rcv_skb);
486 
487 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
488 {
489 	int err;
490 
491 	err = sk_filter(sk, skb);
492 	if (err)
493 		return err;
494 
495 	return __sock_queue_rcv_skb(sk, skb);
496 }
497 EXPORT_SYMBOL(sock_queue_rcv_skb);
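
/*
 * Example (illustrative): a datagram protocol's input path typically hands
 * an skb to the owning socket with sock_queue_rcv_skb() and frees it on
 * failure, e.g. when the receive buffer is full (-ENOMEM) or memory
 * accounting fails (-ENOBUFS). The function name below is hypothetical.
 *
 *	static int myproto_rcv_skb(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0) {
 *			kfree_skb(skb);
 *			return err;
 *		}
 *		return 0;
 *	}
 */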
498 
499 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
500 		     const int nested, unsigned int trim_cap, bool refcounted)
501 {
502 	int rc = NET_RX_SUCCESS;
503 
504 	if (sk_filter_trim_cap(sk, skb, trim_cap))
505 		goto discard_and_relse;
506 
507 	skb->dev = NULL;
508 
509 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
510 		atomic_inc(&sk->sk_drops);
511 		goto discard_and_relse;
512 	}
513 	if (nested)
514 		bh_lock_sock_nested(sk);
515 	else
516 		bh_lock_sock(sk);
517 	if (!sock_owned_by_user(sk)) {
518 		/*
519 		 * trylock + unlock semantics:
520 		 */
521 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
522 
523 		rc = sk_backlog_rcv(sk, skb);
524 
525 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
526 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
527 		bh_unlock_sock(sk);
528 		atomic_inc(&sk->sk_drops);
529 		goto discard_and_relse;
530 	}
531 
532 	bh_unlock_sock(sk);
533 out:
534 	if (refcounted)
535 		sock_put(sk);
536 	return rc;
537 discard_and_relse:
538 	kfree_skb(skb);
539 	goto out;
540 }
541 EXPORT_SYMBOL(__sk_receive_skb);
542 
543 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
544 {
545 	struct dst_entry *dst = __sk_dst_get(sk);
546 
547 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
548 		sk_tx_queue_clear(sk);
549 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
550 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
551 		dst_release(dst);
552 		return NULL;
553 	}
554 
555 	return dst;
556 }
557 EXPORT_SYMBOL(__sk_dst_check);
558 
559 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
560 {
561 	struct dst_entry *dst = sk_dst_get(sk);
562 
563 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
564 		sk_dst_reset(sk);
565 		dst_release(dst);
566 		return NULL;
567 	}
568 
569 	return dst;
570 }
571 EXPORT_SYMBOL(sk_dst_check);
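
/*
 * Example (illustrative sketch): a transmit path can validate a cached
 * route with sk_dst_check() and fall back to a fresh lookup when the dst
 * has been obsoleted. The cookie is protocol specific (0 where unused);
 * the helper names below are hypothetical.
 *
 *	static struct dst_entry *myproto_get_dst(struct sock *sk)
 *	{
 *		struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *		if (!dst)
 *			dst = myproto_route_output(sk);
 *		return dst;
 *	}
 */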
572 
573 static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
574 {
575 	int ret = -ENOPROTOOPT;
576 #ifdef CONFIG_NETDEVICES
577 	struct net *net = sock_net(sk);
578 
579 	/* Sorry... */
580 	ret = -EPERM;
581 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
582 		goto out;
583 
584 	ret = -EINVAL;
585 	if (ifindex < 0)
586 		goto out;
587 
588 	sk->sk_bound_dev_if = ifindex;
589 	if (sk->sk_prot->rehash)
590 		sk->sk_prot->rehash(sk);
591 	sk_dst_reset(sk);
592 
593 	ret = 0;
594 
595 out:
596 #endif
597 
598 	return ret;
599 }
600 
601 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
602 				int optlen)
603 {
604 	int ret = -ENOPROTOOPT;
605 #ifdef CONFIG_NETDEVICES
606 	struct net *net = sock_net(sk);
607 	char devname[IFNAMSIZ];
608 	int index;
609 
610 	ret = -EINVAL;
611 	if (optlen < 0)
612 		goto out;
613 
614 	/* Bind this socket to a particular device like "eth0",
615 	 * as specified in the passed interface name. If the
616 	 * name is "" or the option length is zero the socket
617 	 * is not bound.
618 	 */
619 	if (optlen > IFNAMSIZ - 1)
620 		optlen = IFNAMSIZ - 1;
621 	memset(devname, 0, sizeof(devname));
622 
623 	ret = -EFAULT;
624 	if (copy_from_user(devname, optval, optlen))
625 		goto out;
626 
627 	index = 0;
628 	if (devname[0] != '\0') {
629 		struct net_device *dev;
630 
631 		rcu_read_lock();
632 		dev = dev_get_by_name_rcu(net, devname);
633 		if (dev)
634 			index = dev->ifindex;
635 		rcu_read_unlock();
636 		ret = -ENODEV;
637 		if (!dev)
638 			goto out;
639 	}
640 
641 	lock_sock(sk);
642 	ret = sock_setbindtodevice_locked(sk, index);
643 	release_sock(sk);
644 
645 out:
646 #endif
647 
648 	return ret;
649 }
650 
651 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
652 				int __user *optlen, int len)
653 {
654 	int ret = -ENOPROTOOPT;
655 #ifdef CONFIG_NETDEVICES
656 	struct net *net = sock_net(sk);
657 	char devname[IFNAMSIZ];
658 
659 	if (sk->sk_bound_dev_if == 0) {
660 		len = 0;
661 		goto zero;
662 	}
663 
664 	ret = -EINVAL;
665 	if (len < IFNAMSIZ)
666 		goto out;
667 
668 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
669 	if (ret)
670 		goto out;
671 
672 	len = strlen(devname) + 1;
673 
674 	ret = -EFAULT;
675 	if (copy_to_user(optval, devname, len))
676 		goto out;
677 
678 zero:
679 	ret = -EFAULT;
680 	if (put_user(len, optlen))
681 		goto out;
682 
683 	ret = 0;
684 
685 out:
686 #endif
687 
688 	return ret;
689 }
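
/*
 * Example (user space, illustrative): the two handlers above back the
 * SO_BINDTODEVICE socket option. Setting it requires CAP_NET_RAW; an
 * empty name removes the binding.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int bind_to_eth0(int fd)
 *	{
 *		const char ifname[] = "eth0";
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *				  ifname, strlen(ifname));
 *	}
 *
 * Reading the option back with getsockopt() needs a buffer of at least
 * IFNAMSIZ bytes, as enforced by sock_getbindtodevice() above.
 */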
690 
691 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
692 {
693 	if (valbool)
694 		sock_set_flag(sk, bit);
695 	else
696 		sock_reset_flag(sk, bit);
697 }
698 
699 bool sk_mc_loop(struct sock *sk)
700 {
701 	if (dev_recursion_level())
702 		return false;
703 	if (!sk)
704 		return true;
705 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
706 	switch (READ_ONCE(sk->sk_family)) {
707 	case AF_INET:
708 		return inet_sk(sk)->mc_loop;
709 #if IS_ENABLED(CONFIG_IPV6)
710 	case AF_INET6:
711 		return inet6_sk(sk)->mc_loop;
712 #endif
713 	}
714 	WARN_ON_ONCE(1);
715 	return true;
716 }
717 EXPORT_SYMBOL(sk_mc_loop);
718 
719 /*
720  *	This is meant for all protocols to use and covers goings on
721  *	at the socket level. Everything here is generic.
722  */
723 
724 int sock_setsockopt(struct socket *sock, int level, int optname,
725 		    char __user *optval, unsigned int optlen)
726 {
727 	struct sock_txtime sk_txtime;
728 	struct sock *sk = sock->sk;
729 	int val;
730 	int valbool;
731 	struct linger ling;
732 	int ret = 0;
733 
734 	/*
735 	 *	Options without arguments
736 	 */
737 
738 	if (optname == SO_BINDTODEVICE)
739 		return sock_setbindtodevice(sk, optval, optlen);
740 
741 	if (optlen < sizeof(int))
742 		return -EINVAL;
743 
744 	if (get_user(val, (int __user *)optval))
745 		return -EFAULT;
746 
747 	valbool = val ? 1 : 0;
748 
749 	lock_sock(sk);
750 
751 	switch (optname) {
752 	case SO_DEBUG:
753 		if (val && !capable(CAP_NET_ADMIN))
754 			ret = -EACCES;
755 		else
756 			sock_valbool_flag(sk, SOCK_DBG, valbool);
757 		break;
758 	case SO_REUSEADDR:
759 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
760 		break;
761 	case SO_REUSEPORT:
762 		sk->sk_reuseport = valbool;
763 		break;
764 	case SO_TYPE:
765 	case SO_PROTOCOL:
766 	case SO_DOMAIN:
767 	case SO_ERROR:
768 		ret = -ENOPROTOOPT;
769 		break;
770 	case SO_DONTROUTE:
771 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
772 		sk_dst_reset(sk);
773 		break;
774 	case SO_BROADCAST:
775 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
776 		break;
777 	case SO_SNDBUF:
778 		/* Don't error on this; BSD doesn't, and if you think
779 		 * about it this is right. Otherwise apps have to
780 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
781 		 * are treated in BSD as hints
782 		 */
783 		val = min_t(u32, val, sysctl_wmem_max);
784 set_sndbuf:
785 		/* Ensure val * 2 fits into an int, to prevent max_t()
786 		 * from treating it as a negative value.
787 		 */
788 		val = min_t(int, val, INT_MAX / 2);
789 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
790 		WRITE_ONCE(sk->sk_sndbuf,
791 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
792 		/* Wake up sending tasks if we upped the value. */
793 		sk->sk_write_space(sk);
794 		break;
795 
796 	case SO_SNDBUFFORCE:
797 		if (!capable(CAP_NET_ADMIN)) {
798 			ret = -EPERM;
799 			break;
800 		}
801 
802 		/* No negative values (to prevent underflow, as val will be
803 		 * multiplied by 2).
804 		 */
805 		if (val < 0)
806 			val = 0;
807 		goto set_sndbuf;
808 
809 	case SO_RCVBUF:
810 		/* Don't error on this; BSD doesn't, and if you think
811 		 * about it this is right. Otherwise apps have to
812 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
813 		 * are treated in BSD as hints
814 		 */
815 		val = min_t(u32, val, sysctl_rmem_max);
816 set_rcvbuf:
817 		/* Ensure val * 2 fits into an int, to prevent max_t()
818 		 * from treating it as a negative value.
819 		 */
820 		val = min_t(int, val, INT_MAX / 2);
821 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
822 		/*
823 		 * We double it on the way in to account for
824 		 * "struct sk_buff" etc. overhead.   Applications
825 		 * assume that the SO_RCVBUF setting they make will
826 		 * allow that much actual data to be received on that
827 		 * socket.
828 		 *
829 		 * Applications are unaware that "struct sk_buff" and
830 		 * other overheads allocate from the receive buffer
831 		 * during socket buffer allocation.
832 		 *
833 		 * And after considering the possible alternatives,
834 		 * returning the value we actually used in getsockopt
835 		 * is the most desirable behavior.
836 		 */
837 		WRITE_ONCE(sk->sk_rcvbuf,
838 			   max_t(int, val * 2, SOCK_MIN_RCVBUF));
839 		break;
840 
841 	case SO_RCVBUFFORCE:
842 		if (!capable(CAP_NET_ADMIN)) {
843 			ret = -EPERM;
844 			break;
845 		}
846 
847 		/* No negative values (to prevent underflow, as val will be
848 		 * multiplied by 2).
849 		 */
850 		if (val < 0)
851 			val = 0;
852 		goto set_rcvbuf;
853 
854 	case SO_KEEPALIVE:
855 		if (sk->sk_prot->keepalive)
856 			sk->sk_prot->keepalive(sk, valbool);
857 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
858 		break;
859 
860 	case SO_OOBINLINE:
861 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
862 		break;
863 
864 	case SO_NO_CHECK:
865 		sk->sk_no_check_tx = valbool;
866 		break;
867 
868 	case SO_PRIORITY:
869 		if ((val >= 0 && val <= 6) ||
870 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
871 			sk->sk_priority = val;
872 		else
873 			ret = -EPERM;
874 		break;
875 
876 	case SO_LINGER:
877 		if (optlen < sizeof(ling)) {
878 			ret = -EINVAL;	/* 1003.1g */
879 			break;
880 		}
881 		if (copy_from_user(&ling, optval, sizeof(ling))) {
882 			ret = -EFAULT;
883 			break;
884 		}
885 		if (!ling.l_onoff)
886 			sock_reset_flag(sk, SOCK_LINGER);
887 		else {
888 #if (BITS_PER_LONG == 32)
889 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
890 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
891 			else
892 #endif
893 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
894 			sock_set_flag(sk, SOCK_LINGER);
895 		}
896 		break;
897 
898 	case SO_BSDCOMPAT:
899 		sock_warn_obsolete_bsdism("setsockopt");
900 		break;
901 
902 	case SO_PASSCRED:
903 		if (valbool)
904 			set_bit(SOCK_PASSCRED, &sock->flags);
905 		else
906 			clear_bit(SOCK_PASSCRED, &sock->flags);
907 		break;
908 
909 	case SO_TIMESTAMP_OLD:
910 	case SO_TIMESTAMP_NEW:
911 	case SO_TIMESTAMPNS_OLD:
912 	case SO_TIMESTAMPNS_NEW:
913 		if (valbool)  {
914 			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
915 				sock_set_flag(sk, SOCK_TSTAMP_NEW);
916 			else
917 				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
918 
919 			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
920 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
921 			else
922 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
923 			sock_set_flag(sk, SOCK_RCVTSTAMP);
924 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
925 		} else {
926 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
927 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
928 		}
929 		break;
930 
931 	case SO_TIMESTAMPING_NEW:
932 	case SO_TIMESTAMPING_OLD:
933 		if (val & ~SOF_TIMESTAMPING_MASK) {
934 			ret = -EINVAL;
935 			break;
936 		}
937 
938 		if (val & SOF_TIMESTAMPING_OPT_ID &&
939 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
940 			if (sk->sk_protocol == IPPROTO_TCP &&
941 			    sk->sk_type == SOCK_STREAM) {
942 				if ((1 << sk->sk_state) &
943 				    (TCPF_CLOSE | TCPF_LISTEN)) {
944 					ret = -EINVAL;
945 					break;
946 				}
947 				sk->sk_tskey = tcp_sk(sk)->snd_una;
948 			} else {
949 				sk->sk_tskey = 0;
950 			}
951 		}
952 
953 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
954 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
955 			ret = -EINVAL;
956 			break;
957 		}
958 
959 		sk->sk_tsflags = val;
960 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
961 
962 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
963 			sock_enable_timestamp(sk,
964 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
965 		else
966 			sock_disable_timestamp(sk,
967 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
968 		break;
969 
970 	case SO_RCVLOWAT:
971 		if (val < 0)
972 			val = INT_MAX;
973 		if (sock->ops->set_rcvlowat)
974 			ret = sock->ops->set_rcvlowat(sk, val);
975 		else
976 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
977 		break;
978 
979 	case SO_RCVTIMEO_OLD:
980 	case SO_RCVTIMEO_NEW:
981 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
982 		break;
983 
984 	case SO_SNDTIMEO_OLD:
985 	case SO_SNDTIMEO_NEW:
986 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
987 		break;
988 
989 	case SO_ATTACH_FILTER:
990 		ret = -EINVAL;
991 		if (optlen == sizeof(struct sock_fprog)) {
992 			struct sock_fprog fprog;
993 
994 			ret = -EFAULT;
995 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
996 				break;
997 
998 			ret = sk_attach_filter(&fprog, sk);
999 		}
1000 		break;
1001 
1002 	case SO_ATTACH_BPF:
1003 		ret = -EINVAL;
1004 		if (optlen == sizeof(u32)) {
1005 			u32 ufd;
1006 
1007 			ret = -EFAULT;
1008 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
1009 				break;
1010 
1011 			ret = sk_attach_bpf(ufd, sk);
1012 		}
1013 		break;
1014 
1015 	case SO_ATTACH_REUSEPORT_CBPF:
1016 		ret = -EINVAL;
1017 		if (optlen == sizeof(struct sock_fprog)) {
1018 			struct sock_fprog fprog;
1019 
1020 			ret = -EFAULT;
1021 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
1022 				break;
1023 
1024 			ret = sk_reuseport_attach_filter(&fprog, sk);
1025 		}
1026 		break;
1027 
1028 	case SO_ATTACH_REUSEPORT_EBPF:
1029 		ret = -EINVAL;
1030 		if (optlen == sizeof(u32)) {
1031 			u32 ufd;
1032 
1033 			ret = -EFAULT;
1034 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
1035 				break;
1036 
1037 			ret = sk_reuseport_attach_bpf(ufd, sk);
1038 		}
1039 		break;
1040 
1041 	case SO_DETACH_REUSEPORT_BPF:
1042 		ret = reuseport_detach_prog(sk);
1043 		break;
1044 
1045 	case SO_DETACH_FILTER:
1046 		ret = sk_detach_filter(sk);
1047 		break;
1048 
1049 	case SO_LOCK_FILTER:
1050 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1051 			ret = -EPERM;
1052 		else
1053 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1054 		break;
1055 
1056 	case SO_PASSSEC:
1057 		if (valbool)
1058 			set_bit(SOCK_PASSSEC, &sock->flags);
1059 		else
1060 			clear_bit(SOCK_PASSSEC, &sock->flags);
1061 		break;
1062 	case SO_MARK:
1063 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1064 			ret = -EPERM;
1065 		} else if (val != sk->sk_mark) {
1066 			sk->sk_mark = val;
1067 			sk_dst_reset(sk);
1068 		}
1069 		break;
1070 
1071 	case SO_RXQ_OVFL:
1072 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1073 		break;
1074 
1075 	case SO_WIFI_STATUS:
1076 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1077 		break;
1078 
1079 	case SO_PEEK_OFF:
1080 		if (sock->ops->set_peek_off)
1081 			ret = sock->ops->set_peek_off(sk, val);
1082 		else
1083 			ret = -EOPNOTSUPP;
1084 		break;
1085 
1086 	case SO_NOFCS:
1087 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1088 		break;
1089 
1090 	case SO_SELECT_ERR_QUEUE:
1091 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1092 		break;
1093 
1094 #ifdef CONFIG_NET_RX_BUSY_POLL
1095 	case SO_BUSY_POLL:
1096 		/* allow unprivileged users to decrease the value */
1097 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1098 			ret = -EPERM;
1099 		else {
1100 			if (val < 0)
1101 				ret = -EINVAL;
1102 			else
1103 				WRITE_ONCE(sk->sk_ll_usec, val);
1104 		}
1105 		break;
1106 #endif
1107 
1108 	case SO_MAX_PACING_RATE:
1109 		{
1110 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1111 
1112 		if (sizeof(ulval) != sizeof(val) &&
1113 		    optlen >= sizeof(ulval) &&
1114 		    get_user(ulval, (unsigned long __user *)optval)) {
1115 			ret = -EFAULT;
1116 			break;
1117 		}
1118 		if (ulval != ~0UL)
1119 			cmpxchg(&sk->sk_pacing_status,
1120 				SK_PACING_NONE,
1121 				SK_PACING_NEEDED);
1122 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1123 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1124 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1125 		break;
1126 		}
1127 	case SO_INCOMING_CPU:
1128 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1129 		break;
1130 
1131 	case SO_CNX_ADVICE:
1132 		if (val == 1)
1133 			dst_negative_advice(sk);
1134 		break;
1135 
1136 	case SO_ZEROCOPY:
1137 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1138 			if (!((sk->sk_type == SOCK_STREAM &&
1139 			       sk->sk_protocol == IPPROTO_TCP) ||
1140 			      (sk->sk_type == SOCK_DGRAM &&
1141 			       sk->sk_protocol == IPPROTO_UDP)))
1142 				ret = -ENOTSUPP;
1143 		} else if (sk->sk_family != PF_RDS) {
1144 			ret = -ENOTSUPP;
1145 		}
1146 		if (!ret) {
1147 			if (val < 0 || val > 1)
1148 				ret = -EINVAL;
1149 			else
1150 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1151 		}
1152 		break;
1153 
1154 	case SO_TXTIME:
1155 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1156 			ret = -EPERM;
1157 		} else if (optlen != sizeof(struct sock_txtime)) {
1158 			ret = -EINVAL;
1159 		} else if (copy_from_user(&sk_txtime, optval,
1160 			   sizeof(struct sock_txtime))) {
1161 			ret = -EFAULT;
1162 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1163 			ret = -EINVAL;
1164 		} else {
1165 			sock_valbool_flag(sk, SOCK_TXTIME, true);
1166 			sk->sk_clockid = sk_txtime.clockid;
1167 			sk->sk_txtime_deadline_mode =
1168 				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1169 			sk->sk_txtime_report_errors =
1170 				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1171 		}
1172 		break;
1173 
1174 	case SO_BINDTOIFINDEX:
1175 		ret = sock_setbindtodevice_locked(sk, val);
1176 		break;
1177 
1178 	default:
1179 		ret = -ENOPROTOOPT;
1180 		break;
1181 	}
1182 	release_sock(sk);
1183 	return ret;
1184 }
1185 EXPORT_SYMBOL(sock_setsockopt);
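
/*
 * Example (user space, illustrative) for two of the options handled above:
 * SO_SNDBUF values are doubled by the kernel to account for bookkeeping
 * overhead, so reading the value back returns roughly twice what was set,
 * and SO_RCVTIMEO takes a struct timeval.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	static int tune_socket(int fd)
 *	{
 *		int sndbuf = 65536;
 *		struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
 *			       &sndbuf, sizeof(sndbuf)))
 *			return -1;
 *		return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
 *				  &tv, sizeof(tv));
 *	}
 */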
1186 
1187 static const struct cred *sk_get_peer_cred(struct sock *sk)
1188 {
1189 	const struct cred *cred;
1190 
1191 	spin_lock(&sk->sk_peer_lock);
1192 	cred = get_cred(sk->sk_peer_cred);
1193 	spin_unlock(&sk->sk_peer_lock);
1194 
1195 	return cred;
1196 }
1197 
1198 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1199 			  struct ucred *ucred)
1200 {
1201 	ucred->pid = pid_vnr(pid);
1202 	ucred->uid = ucred->gid = -1;
1203 	if (cred) {
1204 		struct user_namespace *current_ns = current_user_ns();
1205 
1206 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1207 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1208 	}
1209 }
1210 
1211 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1212 {
1213 	struct user_namespace *user_ns = current_user_ns();
1214 	int i;
1215 
1216 	for (i = 0; i < src->ngroups; i++)
1217 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1218 			return -EFAULT;
1219 
1220 	return 0;
1221 }
1222 
1223 int sock_getsockopt(struct socket *sock, int level, int optname,
1224 		    char __user *optval, int __user *optlen)
1225 {
1226 	struct sock *sk = sock->sk;
1227 
1228 	union {
1229 		int val;
1230 		u64 val64;
1231 		unsigned long ulval;
1232 		struct linger ling;
1233 		struct old_timeval32 tm32;
1234 		struct __kernel_old_timeval tm;
1235 		struct  __kernel_sock_timeval stm;
1236 		struct sock_txtime txtime;
1237 	} v;
1238 
1239 	int lv = sizeof(int);
1240 	int len;
1241 
1242 	if (get_user(len, optlen))
1243 		return -EFAULT;
1244 	if (len < 0)
1245 		return -EINVAL;
1246 
1247 	memset(&v, 0, sizeof(v));
1248 
1249 	switch (optname) {
1250 	case SO_DEBUG:
1251 		v.val = sock_flag(sk, SOCK_DBG);
1252 		break;
1253 
1254 	case SO_DONTROUTE:
1255 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1256 		break;
1257 
1258 	case SO_BROADCAST:
1259 		v.val = sock_flag(sk, SOCK_BROADCAST);
1260 		break;
1261 
1262 	case SO_SNDBUF:
1263 		v.val = READ_ONCE(sk->sk_sndbuf);
1264 		break;
1265 
1266 	case SO_RCVBUF:
1267 		v.val = READ_ONCE(sk->sk_rcvbuf);
1268 		break;
1269 
1270 	case SO_REUSEADDR:
1271 		v.val = sk->sk_reuse;
1272 		break;
1273 
1274 	case SO_REUSEPORT:
1275 		v.val = sk->sk_reuseport;
1276 		break;
1277 
1278 	case SO_KEEPALIVE:
1279 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1280 		break;
1281 
1282 	case SO_TYPE:
1283 		v.val = sk->sk_type;
1284 		break;
1285 
1286 	case SO_PROTOCOL:
1287 		v.val = sk->sk_protocol;
1288 		break;
1289 
1290 	case SO_DOMAIN:
1291 		v.val = sk->sk_family;
1292 		break;
1293 
1294 	case SO_ERROR:
1295 		v.val = -sock_error(sk);
1296 		if (v.val == 0)
1297 			v.val = xchg(&sk->sk_err_soft, 0);
1298 		break;
1299 
1300 	case SO_OOBINLINE:
1301 		v.val = sock_flag(sk, SOCK_URGINLINE);
1302 		break;
1303 
1304 	case SO_NO_CHECK:
1305 		v.val = sk->sk_no_check_tx;
1306 		break;
1307 
1308 	case SO_PRIORITY:
1309 		v.val = sk->sk_priority;
1310 		break;
1311 
1312 	case SO_LINGER:
1313 		lv		= sizeof(v.ling);
1314 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1315 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1316 		break;
1317 
1318 	case SO_BSDCOMPAT:
1319 		sock_warn_obsolete_bsdism("getsockopt");
1320 		break;
1321 
1322 	case SO_TIMESTAMP_OLD:
1323 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1324 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1325 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1326 		break;
1327 
1328 	case SO_TIMESTAMPNS_OLD:
1329 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1330 		break;
1331 
1332 	case SO_TIMESTAMP_NEW:
1333 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1334 		break;
1335 
1336 	case SO_TIMESTAMPNS_NEW:
1337 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1338 		break;
1339 
1340 	case SO_TIMESTAMPING_OLD:
1341 		v.val = sk->sk_tsflags;
1342 		break;
1343 
1344 	case SO_RCVTIMEO_OLD:
1345 	case SO_RCVTIMEO_NEW:
1346 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1347 		break;
1348 
1349 	case SO_SNDTIMEO_OLD:
1350 	case SO_SNDTIMEO_NEW:
1351 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1352 		break;
1353 
1354 	case SO_RCVLOWAT:
1355 		v.val = READ_ONCE(sk->sk_rcvlowat);
1356 		break;
1357 
1358 	case SO_SNDLOWAT:
1359 		v.val = 1;
1360 		break;
1361 
1362 	case SO_PASSCRED:
1363 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1364 		break;
1365 
1366 	case SO_PEERCRED:
1367 	{
1368 		struct ucred peercred;
1369 		if (len > sizeof(peercred))
1370 			len = sizeof(peercred);
1371 
1372 		spin_lock(&sk->sk_peer_lock);
1373 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1374 		spin_unlock(&sk->sk_peer_lock);
1375 
1376 		if (copy_to_user(optval, &peercred, len))
1377 			return -EFAULT;
1378 		goto lenout;
1379 	}
1380 
1381 	case SO_PEERGROUPS:
1382 	{
1383 		const struct cred *cred;
1384 		int ret, n;
1385 
1386 		cred = sk_get_peer_cred(sk);
1387 		if (!cred)
1388 			return -ENODATA;
1389 
1390 		n = cred->group_info->ngroups;
1391 		if (len < n * sizeof(gid_t)) {
1392 			len = n * sizeof(gid_t);
1393 			put_cred(cred);
1394 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1395 		}
1396 		len = n * sizeof(gid_t);
1397 
1398 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1399 		put_cred(cred);
1400 		if (ret)
1401 			return ret;
1402 		goto lenout;
1403 	}
1404 
1405 	case SO_PEERNAME:
1406 	{
1407 		char address[128];
1408 
1409 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1410 		if (lv < 0)
1411 			return -ENOTCONN;
1412 		if (lv < len)
1413 			return -EINVAL;
1414 		if (copy_to_user(optval, address, len))
1415 			return -EFAULT;
1416 		goto lenout;
1417 	}
1418 
1419 	/* Dubious BSD thing... Probably nobody even uses it, but
1420 	 * the UNIX standard wants it for whatever reason... -DaveM
1421 	 */
1422 	case SO_ACCEPTCONN:
1423 		v.val = sk->sk_state == TCP_LISTEN;
1424 		break;
1425 
1426 	case SO_PASSSEC:
1427 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1428 		break;
1429 
1430 	case SO_PEERSEC:
1431 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1432 
1433 	case SO_MARK:
1434 		v.val = sk->sk_mark;
1435 		break;
1436 
1437 	case SO_RXQ_OVFL:
1438 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1439 		break;
1440 
1441 	case SO_WIFI_STATUS:
1442 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1443 		break;
1444 
1445 	case SO_PEEK_OFF:
1446 		if (!sock->ops->set_peek_off)
1447 			return -EOPNOTSUPP;
1448 
1449 		v.val = READ_ONCE(sk->sk_peek_off);
1450 		break;
1451 	case SO_NOFCS:
1452 		v.val = sock_flag(sk, SOCK_NOFCS);
1453 		break;
1454 
1455 	case SO_BINDTODEVICE:
1456 		return sock_getbindtodevice(sk, optval, optlen, len);
1457 
1458 	case SO_GET_FILTER:
1459 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1460 		if (len < 0)
1461 			return len;
1462 
1463 		goto lenout;
1464 
1465 	case SO_LOCK_FILTER:
1466 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1467 		break;
1468 
1469 	case SO_BPF_EXTENSIONS:
1470 		v.val = bpf_tell_extensions();
1471 		break;
1472 
1473 	case SO_SELECT_ERR_QUEUE:
1474 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1475 		break;
1476 
1477 #ifdef CONFIG_NET_RX_BUSY_POLL
1478 	case SO_BUSY_POLL:
1479 		v.val = READ_ONCE(sk->sk_ll_usec);
1480 		break;
1481 #endif
1482 
1483 	case SO_MAX_PACING_RATE:
1484 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1485 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1486 			lv = sizeof(v.ulval);
1487 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1488 		} else {
1489 			/* 32bit version */
1490 			v.val = min_t(unsigned long, ~0U,
1491 				      READ_ONCE(sk->sk_max_pacing_rate));
1492 		}
1493 		break;
1494 
1495 	case SO_INCOMING_CPU:
1496 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1497 		break;
1498 
1499 	case SO_MEMINFO:
1500 	{
1501 		u32 meminfo[SK_MEMINFO_VARS];
1502 
1503 		sk_get_meminfo(sk, meminfo);
1504 
1505 		len = min_t(unsigned int, len, sizeof(meminfo));
1506 		if (copy_to_user(optval, &meminfo, len))
1507 			return -EFAULT;
1508 
1509 		goto lenout;
1510 	}
1511 
1512 #ifdef CONFIG_NET_RX_BUSY_POLL
1513 	case SO_INCOMING_NAPI_ID:
1514 		v.val = READ_ONCE(sk->sk_napi_id);
1515 
1516 		/* aggregate non-NAPI IDs down to 0 */
1517 		if (v.val < MIN_NAPI_ID)
1518 			v.val = 0;
1519 
1520 		break;
1521 #endif
1522 
1523 	case SO_COOKIE:
1524 		lv = sizeof(u64);
1525 		if (len < lv)
1526 			return -EINVAL;
1527 		v.val64 = sock_gen_cookie(sk);
1528 		break;
1529 
1530 	case SO_ZEROCOPY:
1531 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1532 		break;
1533 
1534 	case SO_TXTIME:
1535 		lv = sizeof(v.txtime);
1536 		v.txtime.clockid = sk->sk_clockid;
1537 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1538 				  SOF_TXTIME_DEADLINE_MODE : 0;
1539 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1540 				  SOF_TXTIME_REPORT_ERRORS : 0;
1541 		break;
1542 
1543 	case SO_BINDTOIFINDEX:
1544 		v.val = sk->sk_bound_dev_if;
1545 		break;
1546 
1547 	default:
1548 		/* We implement the SO_SNDLOWAT etc to not be settable
1549 		 * (1003.1g 7).
1550 		 */
1551 		return -ENOPROTOOPT;
1552 	}
1553 
1554 	if (len > lv)
1555 		len = lv;
1556 	if (copy_to_user(optval, &v, len))
1557 		return -EFAULT;
1558 lenout:
1559 	if (put_user(len, optlen))
1560 		return -EFAULT;
1561 	return 0;
1562 }
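
/*
 * Example (user space, illustrative): SO_PEERCRED, handled above, is
 * commonly used on AF_UNIX sockets to identify the connected peer. The
 * pid/uid/gid are translated into the caller's namespaces by
 * cred_to_ucred().
 *
 *	#define _GNU_SOURCE
 *	#include <sys/socket.h>
 *
 *	static int get_peer_pid(int fd, pid_t *pid)
 *	{
 *		struct ucred cred;
 *		socklen_t len = sizeof(cred);
 *
 *		if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len))
 *			return -1;
 *		*pid = cred.pid;
 *		return 0;
 *	}
 */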
1563 
1564 /*
1565  * Initialize an sk_lock.
1566  *
1567  * (We also register the sk_lock with the lock validator.)
1568  */
1569 static inline void sock_lock_init(struct sock *sk)
1570 {
1571 	if (sk->sk_kern_sock)
1572 		sock_lock_init_class_and_name(
1573 			sk,
1574 			af_family_kern_slock_key_strings[sk->sk_family],
1575 			af_family_kern_slock_keys + sk->sk_family,
1576 			af_family_kern_key_strings[sk->sk_family],
1577 			af_family_kern_keys + sk->sk_family);
1578 	else
1579 		sock_lock_init_class_and_name(
1580 			sk,
1581 			af_family_slock_key_strings[sk->sk_family],
1582 			af_family_slock_keys + sk->sk_family,
1583 			af_family_key_strings[sk->sk_family],
1584 			af_family_keys + sk->sk_family);
1585 }
1586 
1587 /*
1588  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1589  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1590  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1591  */
1592 static void sock_copy(struct sock *nsk, const struct sock *osk)
1593 {
1594 #ifdef CONFIG_SECURITY_NETWORK
1595 	void *sptr = nsk->sk_security;
1596 #endif
1597 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1598 
1599 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1600 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1601 
1602 #ifdef CONFIG_SECURITY_NETWORK
1603 	nsk->sk_security = sptr;
1604 	security_sk_clone(osk, nsk);
1605 #endif
1606 }
1607 
1608 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1609 		int family)
1610 {
1611 	struct sock *sk;
1612 	struct kmem_cache *slab;
1613 
1614 	slab = prot->slab;
1615 	if (slab != NULL) {
1616 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1617 		if (!sk)
1618 			return sk;
1619 		if (want_init_on_alloc(priority))
1620 			sk_prot_clear_nulls(sk, prot->obj_size);
1621 	} else
1622 		sk = kmalloc(prot->obj_size, priority);
1623 
1624 	if (sk != NULL) {
1625 		if (security_sk_alloc(sk, family, priority))
1626 			goto out_free;
1627 
1628 		trace_android_rvh_sk_alloc(sk);
1629 
1630 		if (!try_module_get(prot->owner))
1631 			goto out_free_sec;
1632 		sk_tx_queue_clear(sk);
1633 	}
1634 
1635 	return sk;
1636 
1637 out_free_sec:
1638 	security_sk_free(sk);
1639 	trace_android_rvh_sk_free(sk);
1640 out_free:
1641 	if (slab != NULL)
1642 		kmem_cache_free(slab, sk);
1643 	else
1644 		kfree(sk);
1645 	return NULL;
1646 }
1647 
1648 static void sk_prot_free(struct proto *prot, struct sock *sk)
1649 {
1650 	struct kmem_cache *slab;
1651 	struct module *owner;
1652 
1653 	owner = prot->owner;
1654 	slab = prot->slab;
1655 
1656 	cgroup_sk_free(&sk->sk_cgrp_data);
1657 	mem_cgroup_sk_free(sk);
1658 	security_sk_free(sk);
1659 	trace_android_rvh_sk_free(sk);
1660 	if (slab != NULL)
1661 		kmem_cache_free(slab, sk);
1662 	else
1663 		kfree(sk);
1664 	module_put(owner);
1665 }
1666 
1667 /**
1668  *	sk_alloc - All socket objects are allocated here
1669  *	@net: the applicable net namespace
1670  *	@family: protocol family
1671  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1672  *	@prot: struct proto associated with this new sock instance
1673  *	@kern: is this to be a kernel socket?
1674  */
1675 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1676 		      struct proto *prot, int kern)
1677 {
1678 	struct sock *sk;
1679 
1680 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1681 	if (sk) {
1682 		sk->sk_family = family;
1683 		/*
1684 		 * See comment in struct sock definition to understand
1685 		 * why we need sk_prot_creator -acme
1686 		 */
1687 		sk->sk_prot = sk->sk_prot_creator = prot;
1688 		sk->sk_kern_sock = kern;
1689 		sock_lock_init(sk);
1690 		sk->sk_net_refcnt = kern ? 0 : 1;
1691 		if (likely(sk->sk_net_refcnt)) {
1692 			get_net(net);
1693 			sock_inuse_add(net, 1);
1694 		}
1695 
1696 		sock_net_set(sk, net);
1697 		refcount_set(&sk->sk_wmem_alloc, 1);
1698 
1699 		mem_cgroup_sk_alloc(sk);
1700 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1701 		sock_update_classid(&sk->sk_cgrp_data);
1702 		sock_update_netprioidx(&sk->sk_cgrp_data);
1703 		sk_tx_queue_clear(sk);
1704 	}
1705 
1706 	return sk;
1707 }
1708 EXPORT_SYMBOL(sk_alloc);
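
/*
 * Example (illustrative sketch): an address family's ->create() handler is
 * the usual caller of sk_alloc(), pairing it with sock_init_data(). The
 * PF_MYPROTO constant and myproto_* names below are hypothetical.
 *
 *	static int myproto_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &myproto_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */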
1709 
1710 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1711  * grace period. This is the case for UDP sockets and TCP listeners.
1712  */
1713 static void __sk_destruct(struct rcu_head *head)
1714 {
1715 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1716 	struct sk_filter *filter;
1717 
1718 	if (sk->sk_destruct)
1719 		sk->sk_destruct(sk);
1720 
1721 	filter = rcu_dereference_check(sk->sk_filter,
1722 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1723 	if (filter) {
1724 		sk_filter_uncharge(sk, filter);
1725 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1726 	}
1727 
1728 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1729 
1730 #ifdef CONFIG_BPF_SYSCALL
1731 	bpf_sk_storage_free(sk);
1732 #endif
1733 
1734 	if (atomic_read(&sk->sk_omem_alloc))
1735 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1736 			 __func__, atomic_read(&sk->sk_omem_alloc));
1737 
1738 	if (sk->sk_frag.page) {
1739 		put_page(sk->sk_frag.page);
1740 		sk->sk_frag.page = NULL;
1741 	}
1742 
1743 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1744 	put_cred(sk->sk_peer_cred);
1745 	put_pid(sk->sk_peer_pid);
1746 
1747 	if (likely(sk->sk_net_refcnt))
1748 		put_net(sock_net(sk));
1749 	sk_prot_free(sk->sk_prot_creator, sk);
1750 }
1751 
1752 void sk_destruct(struct sock *sk)
1753 {
1754 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1755 
1756 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1757 		reuseport_detach_sock(sk);
1758 		use_call_rcu = true;
1759 	}
1760 
1761 	if (use_call_rcu)
1762 		call_rcu(&sk->sk_rcu, __sk_destruct);
1763 	else
1764 		__sk_destruct(&sk->sk_rcu);
1765 }
1766 
1767 static void __sk_free(struct sock *sk)
1768 {
1769 	if (likely(sk->sk_net_refcnt))
1770 		sock_inuse_add(sock_net(sk), -1);
1771 
1772 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1773 		sock_diag_broadcast_destroy(sk);
1774 	else
1775 		sk_destruct(sk);
1776 }
1777 
1778 void sk_free(struct sock *sk)
1779 {
1780 	/*
1781 	 * We subtract one from sk_wmem_alloc so we can tell whether
1782 	 * some packets are still in some tx queue.
1783 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later
1784 	 */
1785 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1786 		__sk_free(sk);
1787 }
1788 EXPORT_SYMBOL(sk_free);
1789 
1790 static void sk_init_common(struct sock *sk)
1791 {
1792 	skb_queue_head_init(&sk->sk_receive_queue);
1793 	skb_queue_head_init(&sk->sk_write_queue);
1794 	skb_queue_head_init(&sk->sk_error_queue);
1795 
1796 	rwlock_init(&sk->sk_callback_lock);
1797 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1798 			af_rlock_keys + sk->sk_family,
1799 			af_family_rlock_key_strings[sk->sk_family]);
1800 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1801 			af_wlock_keys + sk->sk_family,
1802 			af_family_wlock_key_strings[sk->sk_family]);
1803 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1804 			af_elock_keys + sk->sk_family,
1805 			af_family_elock_key_strings[sk->sk_family]);
1806 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1807 			af_callback_keys + sk->sk_family,
1808 			af_family_clock_key_strings[sk->sk_family]);
1809 }
1810 
1811 /**
1812  *	sk_clone_lock - clone a socket, and lock its clone
1813  *	@sk: the socket to clone
1814  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1815  *
1816  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1817  */
1818 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1819 {
1820 	struct sock *newsk;
1821 	bool is_charged = true;
1822 
1823 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1824 	if (newsk != NULL) {
1825 		struct sk_filter *filter;
1826 
1827 		sock_copy(newsk, sk);
1828 
1829 		newsk->sk_prot_creator = sk->sk_prot;
1830 
1831 		/* SANITY */
1832 		if (likely(newsk->sk_net_refcnt))
1833 			get_net(sock_net(newsk));
1834 		sk_node_init(&newsk->sk_node);
1835 		sock_lock_init(newsk);
1836 		bh_lock_sock(newsk);
1837 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1838 		newsk->sk_backlog.len = 0;
1839 
1840 		atomic_set(&newsk->sk_rmem_alloc, 0);
1841 		/*
1842 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1843 		 */
1844 		refcount_set(&newsk->sk_wmem_alloc, 1);
1845 		atomic_set(&newsk->sk_omem_alloc, 0);
1846 		sk_init_common(newsk);
1847 
1848 		newsk->sk_dst_cache	= NULL;
1849 		newsk->sk_dst_pending_confirm = 0;
1850 		newsk->sk_wmem_queued	= 0;
1851 		newsk->sk_forward_alloc = 0;
1852 		atomic_set(&newsk->sk_drops, 0);
1853 		newsk->sk_send_head	= NULL;
1854 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1855 		atomic_set(&newsk->sk_zckey, 0);
1856 
1857 		sock_reset_flag(newsk, SOCK_DONE);
1858 
1859 		/* sk->sk_memcg will be populated at accept() time */
1860 		newsk->sk_memcg = NULL;
1861 
1862 		cgroup_sk_clone(&newsk->sk_cgrp_data);
1863 
1864 		rcu_read_lock();
1865 		filter = rcu_dereference(sk->sk_filter);
1866 		if (filter != NULL)
1867 			/* though it's an empty new sock, the charging may fail
1868 			 * if sysctl_optmem_max was changed between creation of
1869 			 * original socket and cloning
1870 			 */
1871 			is_charged = sk_filter_charge(newsk, filter);
1872 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1873 		rcu_read_unlock();
1874 
1875 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1876 			/* We need to make sure that we don't uncharge the new
1877 			 * socket if we couldn't charge it in the first place
1878 			 * as otherwise we uncharge the parent's filter.
1879 			 */
1880 			if (!is_charged)
1881 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1882 			sk_free_unlock_clone(newsk);
1883 			newsk = NULL;
1884 			goto out;
1885 		}
1886 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1887 
1888 		if (bpf_sk_storage_clone(sk, newsk)) {
1889 			sk_free_unlock_clone(newsk);
1890 			newsk = NULL;
1891 			goto out;
1892 		}
1893 
1894 		newsk->sk_err	   = 0;
1895 		newsk->sk_err_soft = 0;
1896 		newsk->sk_priority = 0;
1897 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1898 		if (likely(newsk->sk_net_refcnt))
1899 			sock_inuse_add(sock_net(newsk), 1);
1900 
1901 		/*
1902 		 * Before updating sk_refcnt, we must commit prior changes to memory
1903 		 * (Documentation/RCU/rculist_nulls.txt for details)
1904 		 */
1905 		smp_wmb();
1906 		refcount_set(&newsk->sk_refcnt, 2);
1907 
1908 		/*
1909 		 * Increment the counter in the same struct proto as the master
1910 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1911 		 * is the same as sk->sk_prot->socks, as this field was copied
1912 		 * with memcpy).
1913 		 *
1914 		 * This _changes_ the previous behaviour, where
1915 		 * tcp_create_openreq_child always was incrementing the
1916 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1917 		 * to be taken into account in all callers. -acme
1918 		 */
1919 		sk_refcnt_debug_inc(newsk);
1920 		sk_set_socket(newsk, NULL);
1921 		sk_tx_queue_clear(newsk);
1922 		RCU_INIT_POINTER(newsk->sk_wq, NULL);
1923 
1924 		if (newsk->sk_prot->sockets_allocated)
1925 			sk_sockets_allocated_inc(newsk);
1926 
1927 		if (sock_needs_netstamp(sk) &&
1928 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1929 			net_enable_timestamp();
1930 	}
1931 out:
1932 	return newsk;
1933 }
1934 EXPORT_SYMBOL_GPL(sk_clone_lock);
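
/*
 * Illustrative sketch, not part of this file: per the kernel-doc above,
 * callers of sk_clone_lock() receive the clone locked with bh_lock_sock()
 * and must unlock it themselves on every path. example_init_failed() is a
 * hypothetical helper standing in for protocol-specific setup.
 */
#if 0
static struct sock *example_clone(struct sock *parent)
{
	/* GFP_ATOMIC: this path typically runs from softirq context */
	struct sock *newsk = sk_clone_lock(parent, GFP_ATOMIC);

	if (!newsk)
		return NULL;

	if (example_init_failed(newsk)) {
		/* drops the lock and frees the half-initialized clone */
		sk_free_unlock_clone(newsk);
		return NULL;
	}

	bh_unlock_sock(newsk);
	return newsk;
}
#endif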
1935 
1936 void sk_free_unlock_clone(struct sock *sk)
1937 {
1938 	/* It is still a raw copy of the parent, so invalidate
1939 	 * the destructor and do a plain sk_free() */
1940 	sk->sk_destruct = NULL;
1941 	bh_unlock_sock(sk);
1942 	sk_free(sk);
1943 }
1944 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1945 
1946 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1947 {
1948 	u32 max_segs = 1;
1949 
1950 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1951 	if (sk->sk_route_caps & NETIF_F_GSO)
1952 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1953 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1954 	if (sk_can_gso(sk)) {
1955 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1956 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1957 		} else {
1958 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1959 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1960 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1961 		}
1962 	}
1963 	sk->sk_gso_max_segs = max_segs;
1964 	sk_dst_set(sk, dst);
1965 }
1966 EXPORT_SYMBOL_GPL(sk_setup_caps);
1967 
1968 /*
1969  *	Simple resource managers for sockets.
1970  */
1971 
1972 
1973 /*
1974  * Write buffer destructor automatically called from kfree_skb.
1975  */
1976 void sock_wfree(struct sk_buff *skb)
1977 {
1978 	struct sock *sk = skb->sk;
1979 	unsigned int len = skb->truesize;
1980 
1981 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1982 		/*
1983 		 * Keep a reference on sk_wmem_alloc, this will be released
1984 		 * after sk_write_space() call
1985 		 */
1986 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1987 		sk->sk_write_space(sk);
1988 		len = 1;
1989 	}
1990 	/*
1991 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1992 	 * could not do because of in-flight packets
1993 	 */
1994 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1995 		__sk_free(sk);
1996 }
1997 EXPORT_SYMBOL(sock_wfree);
1998 
1999 /* This variant of sock_wfree() is used by TCP,
2000  * since it sets SOCK_USE_WRITE_QUEUE.
2001  */
2002 void __sock_wfree(struct sk_buff *skb)
2003 {
2004 	struct sock *sk = skb->sk;
2005 
2006 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2007 		__sk_free(sk);
2008 }
2009 
2010 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2011 {
2012 	skb_orphan(skb);
2013 	skb->sk = sk;
2014 #ifdef CONFIG_INET
2015 	if (unlikely(!sk_fullsock(sk))) {
2016 		skb->destructor = sock_edemux;
2017 		sock_hold(sk);
2018 		return;
2019 	}
2020 #endif
2021 	skb->destructor = sock_wfree;
2022 	skb_set_hash_from_sk(skb, sk);
2023 	/*
2024 	 * We used to take a refcount on sk, but the following operation
2025 	 * is enough to guarantee sk_free() won't free this sock until
2026 	 * all in-flight packets are completed
2027 	 */
2028 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2029 }
2030 EXPORT_SYMBOL(skb_set_owner_w);
2031 
2032 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2033 {
2034 #ifdef CONFIG_TLS_DEVICE
2035 	/* Drivers depend on in-order delivery for crypto offload,
2036 	 * partial orphan breaks out-of-order-OK logic.
2037 	 */
2038 	if (skb->decrypted)
2039 		return false;
2040 #endif
2041 	return (skb->destructor == sock_wfree ||
2042 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2043 }
2044 
2045 /* This helper is used by netem, as it can hold packets in its
2046  * delay queue. We want to allow the owner socket to send more
2047  * packets, as if they were already TX completed by a typical driver.
2048  * But we also want to keep skb->sk set because some packet schedulers
2049  * rely on it (sch_fq for example).
2050  */
2051 void skb_orphan_partial(struct sk_buff *skb)
2052 {
2053 	if (skb_is_tcp_pure_ack(skb))
2054 		return;
2055 
2056 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2057 		return;
2058 
2059 	skb_orphan(skb);
2060 }
2061 EXPORT_SYMBOL(skb_orphan_partial);
2062 
2063 /*
2064  * Read buffer destructor automatically called from kfree_skb.
2065  */
2066 void sock_rfree(struct sk_buff *skb)
2067 {
2068 	struct sock *sk = skb->sk;
2069 	unsigned int len = skb->truesize;
2070 
2071 	atomic_sub(len, &sk->sk_rmem_alloc);
2072 	sk_mem_uncharge(sk, len);
2073 }
2074 EXPORT_SYMBOL(sock_rfree);
2075 
2076 /*
2077  * Buffer destructor for skbs that are not used directly in read or write
2078  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2079  */
2080 void sock_efree(struct sk_buff *skb)
2081 {
2082 	sock_put(skb->sk);
2083 }
2084 EXPORT_SYMBOL(sock_efree);
2085 
2086 kuid_t sock_i_uid(struct sock *sk)
2087 {
2088 	kuid_t uid;
2089 
2090 	read_lock_bh(&sk->sk_callback_lock);
2091 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2092 	read_unlock_bh(&sk->sk_callback_lock);
2093 	return uid;
2094 }
2095 EXPORT_SYMBOL(sock_i_uid);
2096 
2097 unsigned long __sock_i_ino(struct sock *sk)
2098 {
2099 	unsigned long ino;
2100 
2101 	read_lock(&sk->sk_callback_lock);
2102 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2103 	read_unlock(&sk->sk_callback_lock);
2104 	return ino;
2105 }
2106 EXPORT_SYMBOL(__sock_i_ino);
2107 
2108 unsigned long sock_i_ino(struct sock *sk)
2109 {
2110 	unsigned long ino;
2111 
2112 	local_bh_disable();
2113 	ino = __sock_i_ino(sk);
2114 	local_bh_enable();
2115 	return ino;
2116 }
2117 EXPORT_SYMBOL(sock_i_ino);
2118 
2119 /*
2120  * Allocate a skb from the socket's send buffer.
2121  */
2122 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2123 			     gfp_t priority)
2124 {
2125 	if (force ||
2126 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2127 		struct sk_buff *skb = alloc_skb(size, priority);
2128 
2129 		if (skb) {
2130 			skb_set_owner_w(skb, sk);
2131 			return skb;
2132 		}
2133 	}
2134 	return NULL;
2135 }
2136 EXPORT_SYMBOL(sock_wmalloc);
2137 
2138 static void sock_ofree(struct sk_buff *skb)
2139 {
2140 	struct sock *sk = skb->sk;
2141 
2142 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2143 }
2144 
2145 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2146 			     gfp_t priority)
2147 {
2148 	struct sk_buff *skb;
2149 
2150 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2151 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2152 	    sysctl_optmem_max)
2153 		return NULL;
2154 
2155 	skb = alloc_skb(size, priority);
2156 	if (!skb)
2157 		return NULL;
2158 
2159 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2160 	skb->sk = sk;
2161 	skb->destructor = sock_ofree;
2162 	return skb;
2163 }
2164 
2165 /*
2166  * Allocate a memory block from the socket's option memory buffer.
2167  */
2168 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2169 {
2170 	if ((unsigned int)size <= sysctl_optmem_max &&
2171 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2172 		void *mem;
2173 		/* First do the add, to avoid the race if kmalloc
2174 		 * might sleep.
2175 		 */
2176 		atomic_add(size, &sk->sk_omem_alloc);
2177 		mem = kmalloc(size, priority);
2178 		if (mem)
2179 			return mem;
2180 		atomic_sub(size, &sk->sk_omem_alloc);
2181 	}
2182 	return NULL;
2183 }
2184 EXPORT_SYMBOL(sock_kmalloc);
2185 
2186 /* Free an option memory block. Note, we actually want the inline
2187  * here as this allows gcc to detect the nullify and fold away the
2188  * condition entirely.
2189  */
2190 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2191 				  const bool nullify)
2192 {
2193 	if (WARN_ON_ONCE(!mem))
2194 		return;
2195 	if (nullify)
2196 		kzfree(mem);
2197 	else
2198 		kfree(mem);
2199 	atomic_sub(size, &sk->sk_omem_alloc);
2200 }
2201 
2202 void sock_kfree_s(struct sock *sk, void *mem, int size)
2203 {
2204 	__sock_kfree_s(sk, mem, size, false);
2205 }
2206 EXPORT_SYMBOL(sock_kfree_s);
2207 
2208 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2209 {
2210 	__sock_kfree_s(sk, mem, size, true);
2211 }
2212 EXPORT_SYMBOL(sock_kzfree_s);
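
/*
 * Illustrative sketch, not part of this file: memory obtained with
 * sock_kmalloc() is charged to sk_omem_alloc and must be released with
 * sock_kfree_s() (or sock_kzfree_s() for sensitive data), passing the same
 * size that was allocated so the accounting balances.
 */
#if 0
static int example_set_option(struct sock *sk, const void __user *optval,
			      int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	if (copy_from_user(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);
		return -EFAULT;
	}
	/* ... consume buf ... */
	sock_kfree_s(sk, buf, optlen);
	return 0;
}
#endif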
2213 
2214 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2215    I think these locks should be removed for datagram sockets.
2216  */
2217 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2218 {
2219 	DEFINE_WAIT(wait);
2220 
2221 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2222 	for (;;) {
2223 		if (!timeo)
2224 			break;
2225 		if (signal_pending(current))
2226 			break;
2227 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2228 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2229 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2230 			break;
2231 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2232 			break;
2233 		if (READ_ONCE(sk->sk_err))
2234 			break;
2235 		timeo = schedule_timeout(timeo);
2236 	}
2237 	finish_wait(sk_sleep(sk), &wait);
2238 	return timeo;
2239 }
2240 
2241 
2242 /*
2243  *	Generic send/receive buffer handlers
2244  */
2245 
2246 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2247 				     unsigned long data_len, int noblock,
2248 				     int *errcode, int max_page_order)
2249 {
2250 	struct sk_buff *skb;
2251 	long timeo;
2252 	int err;
2253 
2254 	timeo = sock_sndtimeo(sk, noblock);
2255 	for (;;) {
2256 		err = sock_error(sk);
2257 		if (err != 0)
2258 			goto failure;
2259 
2260 		err = -EPIPE;
2261 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2262 			goto failure;
2263 
2264 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2265 			break;
2266 
2267 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2268 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2269 		err = -EAGAIN;
2270 		if (!timeo)
2271 			goto failure;
2272 		if (signal_pending(current))
2273 			goto interrupted;
2274 		timeo = sock_wait_for_wmem(sk, timeo);
2275 	}
2276 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2277 				   errcode, sk->sk_allocation);
2278 	if (skb)
2279 		skb_set_owner_w(skb, sk);
2280 	return skb;
2281 
2282 interrupted:
2283 	err = sock_intr_errno(timeo);
2284 failure:
2285 	*errcode = err;
2286 	return NULL;
2287 }
2288 EXPORT_SYMBOL(sock_alloc_send_pskb);
2289 
2290 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2291 				    int noblock, int *errcode)
2292 {
2293 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2294 }
2295 EXPORT_SYMBOL(sock_alloc_send_skb);
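
/*
 * Illustrative sketch, not part of this file: a typical datagram send path
 * lets sock_alloc_send_skb() block against sk_sndbuf (honouring
 * MSG_DONTWAIT), reserves headroom and copies the payload from the msghdr.
 */
#if 0
static int example_send(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	skb_reserve(skb, MAX_HEADER);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... hand the skb to the protocol's transmit path ... */
	return len;
}
#endif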
2296 
2297 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2298 		     struct sockcm_cookie *sockc)
2299 {
2300 	u32 tsflags;
2301 
2302 	switch (cmsg->cmsg_type) {
2303 	case SO_MARK:
2304 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2305 			return -EPERM;
2306 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2307 			return -EINVAL;
2308 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2309 		break;
2310 	case SO_TIMESTAMPING_OLD:
2311 	case SO_TIMESTAMPING_NEW:
2312 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2313 			return -EINVAL;
2314 
2315 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2316 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2317 			return -EINVAL;
2318 
2319 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2320 		sockc->tsflags |= tsflags;
2321 		break;
2322 	case SCM_TXTIME:
2323 		if (!sock_flag(sk, SOCK_TXTIME))
2324 			return -EINVAL;
2325 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2326 			return -EINVAL;
2327 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2328 		break;
2329 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2330 	case SCM_RIGHTS:
2331 	case SCM_CREDENTIALS:
2332 		break;
2333 	default:
2334 		return -EINVAL;
2335 	}
2336 	return 0;
2337 }
2338 EXPORT_SYMBOL(__sock_cmsg_send);
2339 
2340 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2341 		   struct sockcm_cookie *sockc)
2342 {
2343 	struct cmsghdr *cmsg;
2344 	int ret;
2345 
2346 	for_each_cmsghdr(cmsg, msg) {
2347 		if (!CMSG_OK(msg, cmsg))
2348 			return -EINVAL;
2349 		if (cmsg->cmsg_level != SOL_SOCKET)
2350 			continue;
2351 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2352 		if (ret)
2353 			return ret;
2354 	}
2355 	return 0;
2356 }
2357 EXPORT_SYMBOL(sock_cmsg_send);
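
/*
 * Illustrative sketch, not part of this file: a sendmsg() implementation
 * seeds a sockcm_cookie from the socket defaults (sockcm_init() is assumed
 * to be available alongside struct sockcm_cookie) and then lets
 * sock_cmsg_send() apply any SOL_SOCKET control messages on top of it.
 */
#if 0
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc;
	int err;

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* ... use sockc.mark, sockc.tsflags, sockc.transmit_time ... */
	return len;
}
#endif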
2358 
2359 static void sk_enter_memory_pressure(struct sock *sk)
2360 {
2361 	if (!sk->sk_prot->enter_memory_pressure)
2362 		return;
2363 
2364 	sk->sk_prot->enter_memory_pressure(sk);
2365 }
2366 
2367 static void sk_leave_memory_pressure(struct sock *sk)
2368 {
2369 	if (sk->sk_prot->leave_memory_pressure) {
2370 		sk->sk_prot->leave_memory_pressure(sk);
2371 	} else {
2372 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2373 
2374 		if (memory_pressure && READ_ONCE(*memory_pressure))
2375 			WRITE_ONCE(*memory_pressure, 0);
2376 	}
2377 }
2378 
2379 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2380 
2381 /**
2382  * skb_page_frag_refill - check that a page_frag contains enough room
2383  * @sz: minimum size of the fragment we want to get
2384  * @pfrag: pointer to page_frag
2385  * @gfp: priority for memory allocation
2386  *
2387  * Note: While this allocator tries to use high order pages, there is
2388  * no guarantee that allocations succeed. Therefore, @sz MUST be
2389  * less than or equal to PAGE_SIZE.
2390  */
2391 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2392 {
2393 	if (pfrag->page) {
2394 		if (page_ref_count(pfrag->page) == 1) {
2395 			pfrag->offset = 0;
2396 			return true;
2397 		}
2398 		if (pfrag->offset + sz <= pfrag->size)
2399 			return true;
2400 		put_page(pfrag->page);
2401 	}
2402 
2403 	pfrag->offset = 0;
2404 	if (SKB_FRAG_PAGE_ORDER &&
2405 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2406 		/* Avoid direct reclaim but allow kswapd to wake */
2407 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2408 					  __GFP_COMP | __GFP_NOWARN |
2409 					  __GFP_NORETRY,
2410 					  SKB_FRAG_PAGE_ORDER);
2411 		if (likely(pfrag->page)) {
2412 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2413 			return true;
2414 		}
2415 	}
2416 	pfrag->page = alloc_page(gfp);
2417 	if (likely(pfrag->page)) {
2418 		pfrag->size = PAGE_SIZE;
2419 		return true;
2420 	}
2421 	return false;
2422 }
2423 EXPORT_SYMBOL(skb_page_frag_refill);
2424 
2425 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2426 {
2427 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2428 		return true;
2429 
2430 	sk_enter_memory_pressure(sk);
2431 	sk_stream_moderate_sndbuf(sk);
2432 	return false;
2433 }
2434 EXPORT_SYMBOL(sk_page_frag_refill);
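
/*
 * Illustrative sketch, not part of this file: senders building paged data
 * pick the socket's page_frag with sk_page_frag(), refill it on demand,
 * copy at pfrag->offset and advance the offset by the amount consumed.
 */
#if 0
static int example_append(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
				&msg->msg_iter) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	/* ... attach the chunk as an skb frag and charge socket memory ... */
	return copy;
}
#endif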
2435 
2436 static void __lock_sock(struct sock *sk)
2437 	__releases(&sk->sk_lock.slock)
2438 	__acquires(&sk->sk_lock.slock)
2439 {
2440 	DEFINE_WAIT(wait);
2441 
2442 	for (;;) {
2443 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2444 					TASK_UNINTERRUPTIBLE);
2445 		spin_unlock_bh(&sk->sk_lock.slock);
2446 		schedule();
2447 		spin_lock_bh(&sk->sk_lock.slock);
2448 		if (!sock_owned_by_user(sk))
2449 			break;
2450 	}
2451 	finish_wait(&sk->sk_lock.wq, &wait);
2452 }
2453 
2454 void __release_sock(struct sock *sk)
2455 	__releases(&sk->sk_lock.slock)
2456 	__acquires(&sk->sk_lock.slock)
2457 {
2458 	struct sk_buff *skb, *next;
2459 
2460 	while ((skb = sk->sk_backlog.head) != NULL) {
2461 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2462 
2463 		spin_unlock_bh(&sk->sk_lock.slock);
2464 
2465 		do {
2466 			next = skb->next;
2467 			prefetch(next);
2468 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2469 			skb_mark_not_on_list(skb);
2470 			sk_backlog_rcv(sk, skb);
2471 
2472 			cond_resched();
2473 
2474 			skb = next;
2475 		} while (skb != NULL);
2476 
2477 		spin_lock_bh(&sk->sk_lock.slock);
2478 	}
2479 
2480 	/*
2481 	 * Doing the zeroing here guarantees we cannot loop forever
2482 	 * while a wild producer attempts to flood us.
2483 	 */
2484 	sk->sk_backlog.len = 0;
2485 }
2486 
2487 void __sk_flush_backlog(struct sock *sk)
2488 {
2489 	spin_lock_bh(&sk->sk_lock.slock);
2490 	__release_sock(sk);
2491 	spin_unlock_bh(&sk->sk_lock.slock);
2492 }
2493 
2494 /**
2495  * sk_wait_data - wait for data to arrive at sk_receive_queue
2496  * @sk:    sock to wait on
2497  * @timeo: for how long
2498  * @skb:   last skb seen on sk_receive_queue
2499  *
2500  * Now socket state including sk->sk_err is changed only under lock,
2501  * hence we may omit checks after joining wait queue.
2502  * We check receive queue before schedule() only as optimization;
2503  * it is very likely that release_sock() added new data.
2504  */
2505 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2506 {
2507 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2508 	int rc;
2509 
2510 	add_wait_queue(sk_sleep(sk), &wait);
2511 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2512 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2513 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2514 	remove_wait_queue(sk_sleep(sk), &wait);
2515 	return rc;
2516 }
2517 EXPORT_SYMBOL(sk_wait_data);
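
/*
 * Illustrative sketch, not part of this file: a blocking receive path holds
 * the socket lock, dequeues from sk_receive_queue, and uses sk_wait_data()
 * (which releases and re-takes the lock internally) to sleep until new data,
 * a signal, or the timeout arrives.
 */
#if 0
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	lock_sock(sk);
	while (!(skb = skb_dequeue(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			break;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	release_sock(sk);
	return skb;
}
#endif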
2518 
2519 /**
2520  *	__sk_mem_raise_allocated - increase memory_allocated
2521  *	@sk: socket
2522  *	@size: memory size to allocate
2523  *	@amt: pages to allocate
2524  *	@kind: allocation type
2525  *
2526  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2527  */
2528 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2529 {
2530 	struct proto *prot = sk->sk_prot;
2531 	long allocated = sk_memory_allocated_add(sk, amt);
2532 	bool charged = true;
2533 
2534 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2535 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2536 		goto suppress_allocation;
2537 
2538 	/* Under limit. */
2539 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2540 		sk_leave_memory_pressure(sk);
2541 		return 1;
2542 	}
2543 
2544 	/* Under pressure. */
2545 	if (allocated > sk_prot_mem_limits(sk, 1))
2546 		sk_enter_memory_pressure(sk);
2547 
2548 	/* Over hard limit. */
2549 	if (allocated > sk_prot_mem_limits(sk, 2))
2550 		goto suppress_allocation;
2551 
2552 	/* guarantee minimum buffer size under pressure */
2553 	if (kind == SK_MEM_RECV) {
2554 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2555 			return 1;
2556 
2557 	} else { /* SK_MEM_SEND */
2558 		int wmem0 = sk_get_wmem0(sk, prot);
2559 
2560 		if (sk->sk_type == SOCK_STREAM) {
2561 			if (sk->sk_wmem_queued < wmem0)
2562 				return 1;
2563 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2564 			return 1;
2565 		}
2566 	}
2567 
2568 	if (sk_has_memory_pressure(sk)) {
2569 		u64 alloc;
2570 
2571 		if (!sk_under_memory_pressure(sk))
2572 			return 1;
2573 		alloc = sk_sockets_allocated_read_positive(sk);
2574 		if (sk_prot_mem_limits(sk, 2) > alloc *
2575 		    sk_mem_pages(sk->sk_wmem_queued +
2576 				 atomic_read(&sk->sk_rmem_alloc) +
2577 				 sk->sk_forward_alloc))
2578 			return 1;
2579 	}
2580 
2581 suppress_allocation:
2582 
2583 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2584 		sk_stream_moderate_sndbuf(sk);
2585 
2586 		/* Fail only if socket is _under_ its sndbuf.
2587 		 * In this case we cannot block, so we have to fail.
2588 		 */
2589 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2590 			return 1;
2591 	}
2592 
2593 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2594 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2595 
2596 	sk_memory_allocated_sub(sk, amt);
2597 
2598 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2599 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2600 
2601 	return 0;
2602 }
2603 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2604 
2605 /**
2606  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2607  *	@sk: socket
2608  *	@size: memory size to allocate
2609  *	@kind: allocation type
2610  *
2611  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2612  *	rmem allocation. This function assumes that protocols which have
2613  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2614  */
2615 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2616 {
2617 	int ret, amt = sk_mem_pages(size);
2618 
2619 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2620 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2621 	if (!ret)
2622 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2623 	return ret;
2624 }
2625 EXPORT_SYMBOL(__sk_mem_schedule);
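
/*
 * Illustrative sketch, not part of this file: protocols usually do not call
 * __sk_mem_schedule() directly; the sk_rmem_schedule()/sk_wmem_schedule()
 * inlines only fall back to it when sk_forward_alloc cannot already cover
 * the request.
 */
#if 0
static int example_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* global or memcg limit hit */

	skb_set_owner_r(skb, sk);	/* charges sk_rmem_alloc/forward_alloc */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}
#endif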
2626 
2627 /**
2628  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2629  *	@sk: socket
2630  *	@amount: number of quanta
2631  *
2632  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2633  */
2634 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2635 {
2636 	sk_memory_allocated_sub(sk, amount);
2637 
2638 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2639 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2640 
2641 	if (sk_under_global_memory_pressure(sk) &&
2642 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2643 		sk_leave_memory_pressure(sk);
2644 }
2645 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2646 
2647 /**
2648  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2649  *	@sk: socket
2650  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2651  */
2652 void __sk_mem_reclaim(struct sock *sk, int amount)
2653 {
2654 	amount >>= SK_MEM_QUANTUM_SHIFT;
2655 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2656 	__sk_mem_reduce_allocated(sk, amount);
2657 }
2658 EXPORT_SYMBOL(__sk_mem_reclaim);
2659 
2660 int sk_set_peek_off(struct sock *sk, int val)
2661 {
2662 	WRITE_ONCE(sk->sk_peek_off, val);
2663 	return 0;
2664 }
2665 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2666 
2667 /*
2668  * Set of default routines for initialising struct proto_ops when
2669  * the protocol does not support a particular function. In certain
2670  * cases where it makes no sense for a protocol to have a "do nothing"
2671  * function, some default processing is provided.
2672  */
2673 
2674 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2675 {
2676 	return -EOPNOTSUPP;
2677 }
2678 EXPORT_SYMBOL(sock_no_bind);
2679 
2680 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2681 		    int len, int flags)
2682 {
2683 	return -EOPNOTSUPP;
2684 }
2685 EXPORT_SYMBOL(sock_no_connect);
2686 
2687 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2688 {
2689 	return -EOPNOTSUPP;
2690 }
2691 EXPORT_SYMBOL(sock_no_socketpair);
2692 
2693 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2694 		   bool kern)
2695 {
2696 	return -EOPNOTSUPP;
2697 }
2698 EXPORT_SYMBOL(sock_no_accept);
2699 
2700 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2701 		    int peer)
2702 {
2703 	return -EOPNOTSUPP;
2704 }
2705 EXPORT_SYMBOL(sock_no_getname);
2706 
2707 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2708 {
2709 	return -EOPNOTSUPP;
2710 }
2711 EXPORT_SYMBOL(sock_no_ioctl);
2712 
2713 int sock_no_listen(struct socket *sock, int backlog)
2714 {
2715 	return -EOPNOTSUPP;
2716 }
2717 EXPORT_SYMBOL(sock_no_listen);
2718 
2719 int sock_no_shutdown(struct socket *sock, int how)
2720 {
2721 	return -EOPNOTSUPP;
2722 }
2723 EXPORT_SYMBOL(sock_no_shutdown);
2724 
2725 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2726 		    char __user *optval, unsigned int optlen)
2727 {
2728 	return -EOPNOTSUPP;
2729 }
2730 EXPORT_SYMBOL(sock_no_setsockopt);
2731 
2732 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2733 		    char __user *optval, int __user *optlen)
2734 {
2735 	return -EOPNOTSUPP;
2736 }
2737 EXPORT_SYMBOL(sock_no_getsockopt);
2738 
2739 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2740 {
2741 	return -EOPNOTSUPP;
2742 }
2743 EXPORT_SYMBOL(sock_no_sendmsg);
2744 
2745 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2746 {
2747 	return -EOPNOTSUPP;
2748 }
2749 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2750 
2751 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2752 		    int flags)
2753 {
2754 	return -EOPNOTSUPP;
2755 }
2756 EXPORT_SYMBOL(sock_no_recvmsg);
2757 
2758 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2759 {
2760 	/* Mirror missing mmap method error code */
2761 	return -ENODEV;
2762 }
2763 EXPORT_SYMBOL(sock_no_mmap);
2764 
2765 /*
2766  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2767  * various sock-based usage counts.
2768  */
2769 void __receive_sock(struct file *file)
2770 {
2771 	struct socket *sock;
2772 	int error;
2773 
2774 	/*
2775 	 * The resulting value of "error" is ignored here since we only
2776 	 * need to take action when the file is a socket and testing
2777 	 * "sock" for NULL is sufficient.
2778 	 */
2779 	sock = sock_from_file(file, &error);
2780 	if (sock) {
2781 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2782 		sock_update_classid(&sock->sk->sk_cgrp_data);
2783 	}
2784 }
2785 
2786 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2787 {
2788 	ssize_t res;
2789 	struct msghdr msg = {.msg_flags = flags};
2790 	struct kvec iov;
2791 	char *kaddr = kmap(page);
2792 	iov.iov_base = kaddr + offset;
2793 	iov.iov_len = size;
2794 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2795 	kunmap(page);
2796 	return res;
2797 }
2798 EXPORT_SYMBOL(sock_no_sendpage);
2799 
2800 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2801 				int offset, size_t size, int flags)
2802 {
2803 	ssize_t res;
2804 	struct msghdr msg = {.msg_flags = flags};
2805 	struct kvec iov;
2806 	char *kaddr = kmap(page);
2807 
2808 	iov.iov_base = kaddr + offset;
2809 	iov.iov_len = size;
2810 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2811 	kunmap(page);
2812 	return res;
2813 }
2814 EXPORT_SYMBOL(sock_no_sendpage_locked);
2815 
2816 /*
2817  *	Default Socket Callbacks
2818  */
2819 
2820 static void sock_def_wakeup(struct sock *sk)
2821 {
2822 	struct socket_wq *wq;
2823 
2824 	rcu_read_lock();
2825 	wq = rcu_dereference(sk->sk_wq);
2826 	if (skwq_has_sleeper(wq))
2827 		wake_up_interruptible_all(&wq->wait);
2828 	rcu_read_unlock();
2829 }
2830 
2831 static void sock_def_error_report(struct sock *sk)
2832 {
2833 	struct socket_wq *wq;
2834 
2835 	rcu_read_lock();
2836 	wq = rcu_dereference(sk->sk_wq);
2837 	if (skwq_has_sleeper(wq))
2838 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2839 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2840 	rcu_read_unlock();
2841 }
2842 
2843 static void sock_def_readable(struct sock *sk)
2844 {
2845 	struct socket_wq *wq;
2846 
2847 	rcu_read_lock();
2848 	wq = rcu_dereference(sk->sk_wq);
2849 	if (skwq_has_sleeper(wq))
2850 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2851 						EPOLLRDNORM | EPOLLRDBAND);
2852 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2853 	rcu_read_unlock();
2854 }
2855 
2856 static void sock_def_write_space(struct sock *sk)
2857 {
2858 	struct socket_wq *wq;
2859 
2860 	rcu_read_lock();
2861 
2862 	/* Do not wake up a writer until he can make "significant"
2863 	 * progress.  --DaveM
2864 	 */
2865 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2866 		wq = rcu_dereference(sk->sk_wq);
2867 		if (skwq_has_sleeper(wq))
2868 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2869 						EPOLLWRNORM | EPOLLWRBAND);
2870 
2871 		/* Should agree with poll, otherwise some programs break */
2872 		if (sock_writeable(sk))
2873 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2874 	}
2875 
2876 	rcu_read_unlock();
2877 }
2878 
2879 static void sock_def_destruct(struct sock *sk)
2880 {
2881 }
2882 
2883 void sk_send_sigurg(struct sock *sk)
2884 {
2885 	if (sk->sk_socket && sk->sk_socket->file)
2886 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2887 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2888 }
2889 EXPORT_SYMBOL(sk_send_sigurg);
2890 
2891 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2892 		    unsigned long expires)
2893 {
2894 	if (!mod_timer(timer, expires))
2895 		sock_hold(sk);
2896 }
2897 EXPORT_SYMBOL(sk_reset_timer);
2898 
2899 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2900 {
2901 	if (del_timer(timer))
2902 		__sock_put(sk);
2903 }
2904 EXPORT_SYMBOL(sk_stop_timer);
2905 
2906 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2907 {
2908 	if (del_timer_sync(timer))
2909 		__sock_put(sk);
2910 }
2911 EXPORT_SYMBOL(sk_stop_timer_sync);
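
/*
 * Illustrative sketch, not part of this file: sk_reset_timer() grabs a
 * socket reference only when the timer was not already pending, and
 * sk_stop_timer() drops one only if it really deleted a pending timer, so a
 * protocol can re-arm freely. example_sk() is a hypothetical accessor for a
 * protocol-private sock that embeds a timer_list.
 */
#if 0
static void example_arm_retransmit(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &example_sk(sk)->retrans_timer, jiffies + delay);
}

static void example_disarm_retransmit(struct sock *sk)
{
	sk_stop_timer(sk, &example_sk(sk)->retrans_timer);
}
#endif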
2912 
2913 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
2914 {
2915 	sk_init_common(sk);
2916 	sk->sk_send_head	=	NULL;
2917 
2918 	timer_setup(&sk->sk_timer, NULL, 0);
2919 
2920 	sk->sk_allocation	=	GFP_KERNEL;
2921 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2922 	sk->sk_sndbuf		=	sysctl_wmem_default;
2923 	sk->sk_state		=	TCP_CLOSE;
2924 	sk_set_socket(sk, sock);
2925 
2926 	sock_set_flag(sk, SOCK_ZAPPED);
2927 
2928 	if (sock) {
2929 		sk->sk_type	=	sock->type;
2930 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2931 		sock->sk	=	sk;
2932 	} else {
2933 		RCU_INIT_POINTER(sk->sk_wq, NULL);
2934 	}
2935 	sk->sk_uid	=	uid;
2936 
2937 	rwlock_init(&sk->sk_callback_lock);
2938 	if (sk->sk_kern_sock)
2939 		lockdep_set_class_and_name(
2940 			&sk->sk_callback_lock,
2941 			af_kern_callback_keys + sk->sk_family,
2942 			af_family_kern_clock_key_strings[sk->sk_family]);
2943 	else
2944 		lockdep_set_class_and_name(
2945 			&sk->sk_callback_lock,
2946 			af_callback_keys + sk->sk_family,
2947 			af_family_clock_key_strings[sk->sk_family]);
2948 
2949 	sk->sk_state_change	=	sock_def_wakeup;
2950 	sk->sk_data_ready	=	sock_def_readable;
2951 	sk->sk_write_space	=	sock_def_write_space;
2952 	sk->sk_error_report	=	sock_def_error_report;
2953 	sk->sk_destruct		=	sock_def_destruct;
2954 
2955 	sk->sk_frag.page	=	NULL;
2956 	sk->sk_frag.offset	=	0;
2957 	sk->sk_peek_off		=	-1;
2958 
2959 	sk->sk_peer_pid 	=	NULL;
2960 	sk->sk_peer_cred	=	NULL;
2961 	spin_lock_init(&sk->sk_peer_lock);
2962 
2963 	sk->sk_write_pending	=	0;
2964 	sk->sk_rcvlowat		=	1;
2965 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2966 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2967 
2968 	sk->sk_stamp = SK_DEFAULT_STAMP;
2969 #if BITS_PER_LONG==32
2970 	seqlock_init(&sk->sk_stamp_seq);
2971 #endif
2972 	atomic_set(&sk->sk_zckey, 0);
2973 
2974 #ifdef CONFIG_NET_RX_BUSY_POLL
2975 	sk->sk_napi_id		=	0;
2976 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
2977 #endif
2978 
2979 	sk->sk_max_pacing_rate = ~0UL;
2980 	sk->sk_pacing_rate = ~0UL;
2981 	WRITE_ONCE(sk->sk_pacing_shift, 10);
2982 	sk->sk_incoming_cpu = -1;
2983 
2984 	sk_rx_queue_clear(sk);
2985 	/*
2986 	 * Before updating sk_refcnt, we must commit prior changes to memory
2987 	 * (Documentation/RCU/rculist_nulls.txt for details)
2988 	 */
2989 	smp_wmb();
2990 	refcount_set(&sk->sk_refcnt, 1);
2991 	atomic_set(&sk->sk_drops, 0);
2992 }
2993 EXPORT_SYMBOL(sock_init_data_uid);
2994 
2995 void sock_init_data(struct socket *sock, struct sock *sk)
2996 {
2997 	kuid_t uid = sock ?
2998 		SOCK_INODE(sock)->i_uid :
2999 		make_kuid(sock_net(sk)->user_ns, 0);
3000 
3001 	sock_init_data_uid(sock, sk, uid);
3002 }
3003 EXPORT_SYMBOL(sock_init_data);
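
/*
 * Illustrative sketch, not part of this file: a protocol's create() hook
 * typically allocates the sock with sk_alloc() and then calls
 * sock_init_data() to set up the queues, default buffer sizes and callbacks.
 * example_proto and EXAMPLE_FAMILY are hypothetical.
 */
#if 0
static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, EXAMPLE_FAMILY, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	sk->sk_protocol = protocol;
	return 0;
}
#endif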
3004 
3005 void lock_sock_nested(struct sock *sk, int subclass)
3006 {
3007 	might_sleep();
3008 	spin_lock_bh(&sk->sk_lock.slock);
3009 	if (sk->sk_lock.owned)
3010 		__lock_sock(sk);
3011 	sk->sk_lock.owned = 1;
3012 	spin_unlock(&sk->sk_lock.slock);
3013 	/*
3014 	 * The sk_lock has mutex_lock() semantics here:
3015 	 */
3016 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3017 	local_bh_enable();
3018 }
3019 EXPORT_SYMBOL(lock_sock_nested);
3020 
3021 void release_sock(struct sock *sk)
3022 {
3023 	spin_lock_bh(&sk->sk_lock.slock);
3024 	if (sk->sk_backlog.tail)
3025 		__release_sock(sk);
3026 
3027 	/* Warning : release_cb() might need to release sk ownership,
3028 	 * i.e. call sock_release_ownership(sk) before us.
3029 	 */
3030 	if (sk->sk_prot->release_cb)
3031 		sk->sk_prot->release_cb(sk);
3032 
3033 	sock_release_ownership(sk);
3034 	if (waitqueue_active(&sk->sk_lock.wq))
3035 		wake_up(&sk->sk_lock.wq);
3036 	spin_unlock_bh(&sk->sk_lock.slock);
3037 }
3038 EXPORT_SYMBOL(release_sock);
3039 
3040 /**
3041  * lock_sock_fast - fast version of lock_sock
3042  * @sk: socket
3043  *
3044  * This version should be used for very small sections, where the process won't block.
3045  * Return false if the fast path is taken:
3046  *
3047  *   sk_lock.slock locked, owned = 0, BH disabled
3048  *
3049  * Return true if the slow path is taken:
3050  *
3051  *   sk_lock.slock unlocked, owned = 1, BH enabled
3052  */
3053 bool lock_sock_fast(struct sock *sk)
3054 {
3055 	might_sleep();
3056 	spin_lock_bh(&sk->sk_lock.slock);
3057 
3058 	if (!sk->sk_lock.owned)
3059 		/*
3060 		 * Note : We must disable BH
3061 		 */
3062 		return false;
3063 
3064 	__lock_sock(sk);
3065 	sk->sk_lock.owned = 1;
3066 	spin_unlock(&sk->sk_lock.slock);
3067 	/*
3068 	 * The sk_lock has mutex_lock() semantics here:
3069 	 */
3070 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3071 	local_bh_enable();
3072 	return true;
3073 }
3074 EXPORT_SYMBOL(lock_sock_fast);
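
/*
 * Illustrative sketch, not part of this file: the bool returned by
 * lock_sock_fast() must be handed to unlock_sock_fast() so it knows whether
 * to release the full (slow-path) lock or just re-enable BHs.
 */
#if 0
static int example_peek_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err;	/* small, non-blocking access only */

	unlock_sock_fast(sk, slow);
	return err;
}
#endif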
3075 
3076 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3077 		   bool timeval, bool time32)
3078 {
3079 	struct sock *sk = sock->sk;
3080 	struct timespec64 ts;
3081 
3082 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3083 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3084 	if (ts.tv_sec == -1)
3085 		return -ENOENT;
3086 	if (ts.tv_sec == 0) {
3087 		ktime_t kt = ktime_get_real();
3088 		sock_write_timestamp(sk, kt);
3089 		ts = ktime_to_timespec64(kt);
3090 	}
3091 
3092 	if (timeval)
3093 		ts.tv_nsec /= 1000;
3094 
3095 #ifdef CONFIG_COMPAT_32BIT_TIME
3096 	if (time32)
3097 		return put_old_timespec32(&ts, userstamp);
3098 #endif
3099 #ifdef CONFIG_SPARC64
3100 	/* beware of padding in sparc64 timeval */
3101 	if (timeval && !in_compat_syscall()) {
3102 		struct __kernel_old_timeval __user tv = {
3103 			.tv_sec = ts.tv_sec,
3104 			.tv_usec = ts.tv_nsec,
3105 		};
3106 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3107 			return -EFAULT;
3108 		return 0;
3109 	}
3110 #endif
3111 	return put_timespec64(&ts, userstamp);
3112 }
3113 EXPORT_SYMBOL(sock_gettstamp);
3114 
3115 void sock_enable_timestamp(struct sock *sk, int flag)
3116 {
3117 	if (!sock_flag(sk, flag)) {
3118 		unsigned long previous_flags = sk->sk_flags;
3119 
3120 		sock_set_flag(sk, flag);
3121 		/*
3122 		 * we just set one of the two flags which require net
3123 		 * time stamping, but time stamping might have been on
3124 		 * already because of the other one
3125 		 */
3126 		if (sock_needs_netstamp(sk) &&
3127 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3128 			net_enable_timestamp();
3129 	}
3130 }
3131 
3132 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3133 		       int level, int type)
3134 {
3135 	struct sock_exterr_skb *serr;
3136 	struct sk_buff *skb;
3137 	int copied, err;
3138 
3139 	err = -EAGAIN;
3140 	skb = sock_dequeue_err_skb(sk);
3141 	if (skb == NULL)
3142 		goto out;
3143 
3144 	copied = skb->len;
3145 	if (copied > len) {
3146 		msg->msg_flags |= MSG_TRUNC;
3147 		copied = len;
3148 	}
3149 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3150 	if (err)
3151 		goto out_free_skb;
3152 
3153 	sock_recv_timestamp(msg, sk, skb);
3154 
3155 	serr = SKB_EXT_ERR(skb);
3156 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3157 
3158 	msg->msg_flags |= MSG_ERRQUEUE;
3159 	err = copied;
3160 
3161 out_free_skb:
3162 	kfree_skb(skb);
3163 out:
3164 	return err;
3165 }
3166 EXPORT_SYMBOL(sock_recv_errqueue);
3167 
3168 /*
3169  *	Get a socket option on a socket.
3170  *
3171  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3172  *	asynchronous errors should be reported by getsockopt. We assume
3173  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3174  */
3175 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3176 			   char __user *optval, int __user *optlen)
3177 {
3178 	struct sock *sk = sock->sk;
3179 
3180 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3181 }
3182 EXPORT_SYMBOL(sock_common_getsockopt);
3183 
3184 #ifdef CONFIG_COMPAT
3185 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3186 				  char __user *optval, int __user *optlen)
3187 {
3188 	struct sock *sk = sock->sk;
3189 
3190 	if (sk->sk_prot->compat_getsockopt != NULL)
3191 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
3192 						      optval, optlen);
3193 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3194 }
3195 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3196 #endif
3197 
3198 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3199 			int flags)
3200 {
3201 	struct sock *sk = sock->sk;
3202 	int addr_len = 0;
3203 	int err;
3204 
3205 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3206 				   flags & ~MSG_DONTWAIT, &addr_len);
3207 	if (err >= 0)
3208 		msg->msg_namelen = addr_len;
3209 	return err;
3210 }
3211 EXPORT_SYMBOL(sock_common_recvmsg);
3212 
3213 /*
3214  *	Set socket options on an inet socket.
3215  */
3216 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3217 			   char __user *optval, unsigned int optlen)
3218 {
3219 	struct sock *sk = sock->sk;
3220 
3221 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3222 }
3223 EXPORT_SYMBOL(sock_common_setsockopt);
3224 
3225 #ifdef CONFIG_COMPAT
3226 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3227 				  char __user *optval, unsigned int optlen)
3228 {
3229 	struct sock *sk = sock->sk;
3230 
3231 	if (sk->sk_prot->compat_setsockopt != NULL)
3232 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3233 						      optval, optlen);
3234 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3235 }
3236 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3237 #endif
3238 
3239 void sk_common_release(struct sock *sk)
3240 {
3241 	if (sk->sk_prot->destroy)
3242 		sk->sk_prot->destroy(sk);
3243 
3244 	/*
3245 	 * Observation: when sk_common_release() is called, processes have
3246 	 * no access to the socket, but the network stack still does.
3247 	 * Step one, detach it from networking:
3248 	 *
3249 	 * A. Remove from hash tables.
3250 	 */
3251 
3252 	sk->sk_prot->unhash(sk);
3253 
3254 	/*
3255 	 * At this point the socket cannot receive new packets, but it is possible
3256 	 * that some packets are still in flight because some CPU ran the receiver
3257 	 * and did its hash table lookup before we unhashed the socket. They will
3258 	 * reach the receive queue and be purged by the socket destructor.
3259 	 *
3260 	 * Also, we still have packets pending on the receive queue and probably
3261 	 * our own packets waiting in device queues. sock_destroy will drain the
3262 	 * receive queue, but transmitted packets will delay socket destruction
3263 	 * until the last reference is released.
3264 	 */
3265 
3266 	sock_orphan(sk);
3267 
3268 	xfrm_sk_free_policy(sk);
3269 
3270 	sk_refcnt_debug_release(sk);
3271 
3272 	sock_put(sk);
3273 }
3274 EXPORT_SYMBOL(sk_common_release);
3275 
3276 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3277 {
3278 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3279 
3280 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3281 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3282 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3283 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3284 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3285 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3286 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3287 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3288 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3289 }
3290 
3291 #ifdef CONFIG_PROC_FS
3292 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3293 struct prot_inuse {
3294 	int val[PROTO_INUSE_NR];
3295 };
3296 
3297 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3298 
3299 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3300 {
3301 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3302 }
3303 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3304 
3305 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3306 {
3307 	int cpu, idx = prot->inuse_idx;
3308 	int res = 0;
3309 
3310 	for_each_possible_cpu(cpu)
3311 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3312 
3313 	return res >= 0 ? res : 0;
3314 }
3315 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3316 
3317 static void sock_inuse_add(struct net *net, int val)
3318 {
3319 	this_cpu_add(*net->core.sock_inuse, val);
3320 }
3321 
3322 int sock_inuse_get(struct net *net)
3323 {
3324 	int cpu, res = 0;
3325 
3326 	for_each_possible_cpu(cpu)
3327 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3328 
3329 	return res;
3330 }
3331 
3332 EXPORT_SYMBOL_GPL(sock_inuse_get);
3333 
3334 static int __net_init sock_inuse_init_net(struct net *net)
3335 {
3336 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3337 	if (net->core.prot_inuse == NULL)
3338 		return -ENOMEM;
3339 
3340 	net->core.sock_inuse = alloc_percpu(int);
3341 	if (net->core.sock_inuse == NULL)
3342 		goto out;
3343 
3344 	return 0;
3345 
3346 out:
3347 	free_percpu(net->core.prot_inuse);
3348 	return -ENOMEM;
3349 }
3350 
3351 static void __net_exit sock_inuse_exit_net(struct net *net)
3352 {
3353 	free_percpu(net->core.prot_inuse);
3354 	free_percpu(net->core.sock_inuse);
3355 }
3356 
3357 static struct pernet_operations net_inuse_ops = {
3358 	.init = sock_inuse_init_net,
3359 	.exit = sock_inuse_exit_net,
3360 };
3361 
3362 static __init int net_inuse_init(void)
3363 {
3364 	if (register_pernet_subsys(&net_inuse_ops))
3365 		panic("Cannot initialize net inuse counters");
3366 
3367 	return 0;
3368 }
3369 
3370 core_initcall(net_inuse_init);
3371 
3372 static int assign_proto_idx(struct proto *prot)
3373 {
3374 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3375 
3376 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3377 		pr_err("PROTO_INUSE_NR exhausted\n");
3378 		return -ENOSPC;
3379 	}
3380 
3381 	set_bit(prot->inuse_idx, proto_inuse_idx);
3382 	return 0;
3383 }
3384 
3385 static void release_proto_idx(struct proto *prot)
3386 {
3387 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3388 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3389 }
3390 #else
3391 static inline int assign_proto_idx(struct proto *prot)
3392 {
3393 	return 0;
3394 }
3395 
3396 static inline void release_proto_idx(struct proto *prot)
3397 {
3398 }
3399 
3400 static void sock_inuse_add(struct net *net, int val)
3401 {
3402 }
3403 #endif
3404 
3405 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3406 {
3407 	if (!twsk_prot)
3408 		return;
3409 	kfree(twsk_prot->twsk_slab_name);
3410 	twsk_prot->twsk_slab_name = NULL;
3411 	kmem_cache_destroy(twsk_prot->twsk_slab);
3412 	twsk_prot->twsk_slab = NULL;
3413 }
3414 
3415 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3416 {
3417 	if (!rsk_prot)
3418 		return;
3419 	kfree(rsk_prot->slab_name);
3420 	rsk_prot->slab_name = NULL;
3421 	kmem_cache_destroy(rsk_prot->slab);
3422 	rsk_prot->slab = NULL;
3423 }
3424 
3425 static int req_prot_init(const struct proto *prot)
3426 {
3427 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3428 
3429 	if (!rsk_prot)
3430 		return 0;
3431 
3432 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3433 					prot->name);
3434 	if (!rsk_prot->slab_name)
3435 		return -ENOMEM;
3436 
3437 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3438 					   rsk_prot->obj_size, 0,
3439 					   SLAB_ACCOUNT | prot->slab_flags,
3440 					   NULL);
3441 
3442 	if (!rsk_prot->slab) {
3443 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3444 			prot->name);
3445 		return -ENOMEM;
3446 	}
3447 	return 0;
3448 }
3449 
3450 int proto_register(struct proto *prot, int alloc_slab)
3451 {
3452 	int ret = -ENOBUFS;
3453 
3454 	if (alloc_slab) {
3455 		prot->slab = kmem_cache_create_usercopy(prot->name,
3456 					prot->obj_size, 0,
3457 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3458 					prot->slab_flags,
3459 					prot->useroffset, prot->usersize,
3460 					NULL);
3461 
3462 		if (prot->slab == NULL) {
3463 			pr_crit("%s: Can't create sock SLAB cache!\n",
3464 				prot->name);
3465 			goto out;
3466 		}
3467 
3468 		if (req_prot_init(prot))
3469 			goto out_free_request_sock_slab;
3470 
3471 		if (prot->twsk_prot != NULL) {
3472 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3473 
3474 			if (prot->twsk_prot->twsk_slab_name == NULL)
3475 				goto out_free_request_sock_slab;
3476 
3477 			prot->twsk_prot->twsk_slab =
3478 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3479 						  prot->twsk_prot->twsk_obj_size,
3480 						  0,
3481 						  SLAB_ACCOUNT |
3482 						  prot->slab_flags,
3483 						  NULL);
3484 			if (prot->twsk_prot->twsk_slab == NULL)
3485 				goto out_free_timewait_sock_slab;
3486 		}
3487 	}
3488 
3489 	mutex_lock(&proto_list_mutex);
3490 	ret = assign_proto_idx(prot);
3491 	if (ret) {
3492 		mutex_unlock(&proto_list_mutex);
3493 		goto out_free_timewait_sock_slab;
3494 	}
3495 	list_add(&prot->node, &proto_list);
3496 	mutex_unlock(&proto_list_mutex);
3497 	return ret;
3498 
3499 out_free_timewait_sock_slab:
3500 	if (alloc_slab && prot->twsk_prot)
3501 		tw_prot_cleanup(prot->twsk_prot);
3502 out_free_request_sock_slab:
3503 	if (alloc_slab) {
3504 		req_prot_cleanup(prot->rsk_prot);
3505 
3506 		kmem_cache_destroy(prot->slab);
3507 		prot->slab = NULL;
3508 	}
3509 out:
3510 	return ret;
3511 }
3512 EXPORT_SYMBOL(proto_register);
3513 
3514 void proto_unregister(struct proto *prot)
3515 {
3516 	mutex_lock(&proto_list_mutex);
3517 	release_proto_idx(prot);
3518 	list_del(&prot->node);
3519 	mutex_unlock(&proto_list_mutex);
3520 
3521 	kmem_cache_destroy(prot->slab);
3522 	prot->slab = NULL;
3523 
3524 	req_prot_cleanup(prot->rsk_prot);
3525 	tw_prot_cleanup(prot->twsk_prot);
3526 }
3527 EXPORT_SYMBOL(proto_unregister);
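
/*
 * Illustrative sketch, not part of this file: a protocol module registers
 * its struct proto at init time (alloc_slab = 1 gives sk_alloc() a dedicated
 * kmem_cache) and unregisters it on exit. example_proto is hypothetical.
 */
#if 0
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __init example_module_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_module_exit(void)
{
	proto_unregister(&example_proto);
}
#endif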
3528 
3529 int sock_load_diag_module(int family, int protocol)
3530 {
3531 	if (!protocol) {
3532 		if (!sock_is_registered(family))
3533 			return -ENOENT;
3534 
3535 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3536 				      NETLINK_SOCK_DIAG, family);
3537 	}
3538 
3539 #ifdef CONFIG_INET
3540 	if (family == AF_INET &&
3541 	    protocol != IPPROTO_RAW &&
3542 	    !rcu_access_pointer(inet_protos[protocol]))
3543 		return -ENOENT;
3544 #endif
3545 
3546 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3547 			      NETLINK_SOCK_DIAG, family, protocol);
3548 }
3549 EXPORT_SYMBOL(sock_load_diag_module);
3550 
3551 #ifdef CONFIG_PROC_FS
3552 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3553 	__acquires(proto_list_mutex)
3554 {
3555 	mutex_lock(&proto_list_mutex);
3556 	return seq_list_start_head(&proto_list, *pos);
3557 }
3558 
3559 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3560 {
3561 	return seq_list_next(v, &proto_list, pos);
3562 }
3563 
3564 static void proto_seq_stop(struct seq_file *seq, void *v)
3565 	__releases(proto_list_mutex)
3566 {
3567 	mutex_unlock(&proto_list_mutex);
3568 }
3569 
3570 static char proto_method_implemented(const void *method)
3571 {
3572 	return method == NULL ? 'n' : 'y';
3573 }
3574 static long sock_prot_memory_allocated(struct proto *proto)
3575 {
3576 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3577 }
3578 
3579 static const char *sock_prot_memory_pressure(struct proto *proto)
3580 {
3581 	return proto->memory_pressure != NULL ?
3582 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3583 }
3584 
3585 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3586 {
3587 
3588 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3589 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3590 		   proto->name,
3591 		   proto->obj_size,
3592 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3593 		   sock_prot_memory_allocated(proto),
3594 		   sock_prot_memory_pressure(proto),
3595 		   proto->max_header,
3596 		   proto->slab == NULL ? "no" : "yes",
3597 		   module_name(proto->owner),
3598 		   proto_method_implemented(proto->close),
3599 		   proto_method_implemented(proto->connect),
3600 		   proto_method_implemented(proto->disconnect),
3601 		   proto_method_implemented(proto->accept),
3602 		   proto_method_implemented(proto->ioctl),
3603 		   proto_method_implemented(proto->init),
3604 		   proto_method_implemented(proto->destroy),
3605 		   proto_method_implemented(proto->shutdown),
3606 		   proto_method_implemented(proto->setsockopt),
3607 		   proto_method_implemented(proto->getsockopt),
3608 		   proto_method_implemented(proto->sendmsg),
3609 		   proto_method_implemented(proto->recvmsg),
3610 		   proto_method_implemented(proto->sendpage),
3611 		   proto_method_implemented(proto->bind),
3612 		   proto_method_implemented(proto->backlog_rcv),
3613 		   proto_method_implemented(proto->hash),
3614 		   proto_method_implemented(proto->unhash),
3615 		   proto_method_implemented(proto->get_port),
3616 		   proto_method_implemented(proto->enter_memory_pressure));
3617 }
3618 
3619 static int proto_seq_show(struct seq_file *seq, void *v)
3620 {
3621 	if (v == &proto_list)
3622 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3623 			   "protocol",
3624 			   "size",
3625 			   "sockets",
3626 			   "memory",
3627 			   "press",
3628 			   "maxhdr",
3629 			   "slab",
3630 			   "module",
3631 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3632 	else
3633 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3634 	return 0;
3635 }
3636 
3637 static const struct seq_operations proto_seq_ops = {
3638 	.start  = proto_seq_start,
3639 	.next   = proto_seq_next,
3640 	.stop   = proto_seq_stop,
3641 	.show   = proto_seq_show,
3642 };
3643 
3644 static __net_init int proto_init_net(struct net *net)
3645 {
3646 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3647 			sizeof(struct seq_net_private)))
3648 		return -ENOMEM;
3649 
3650 	return 0;
3651 }
3652 
3653 static __net_exit void proto_exit_net(struct net *net)
3654 {
3655 	remove_proc_entry("protocols", net->proc_net);
3656 }
3657 
3658 
3659 static __net_initdata struct pernet_operations proto_net_ops = {
3660 	.init = proto_init_net,
3661 	.exit = proto_exit_net,
3662 };
3663 
3664 static int __init proto_init(void)
3665 {
3666 	return register_pernet_subsys(&proto_net_ops);
3667 }
3668 
3669 subsys_initcall(proto_init);
3670 
3671 #endif /* PROC_FS */
3672 
3673 #ifdef CONFIG_NET_RX_BUSY_POLL
3674 bool sk_busy_loop_end(void *p, unsigned long start_time)
3675 {
3676 	struct sock *sk = p;
3677 
3678 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3679 	       sk_busy_loop_timeout(sk, start_time);
3680 }
3681 EXPORT_SYMBOL(sk_busy_loop_end);
3682 #endif /* CONFIG_NET_RX_BUSY_POLL */
3683