1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
122 
123 #include <linux/uaccess.h>
124 
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 
141 #include <trace/events/sock.h>
142 
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 static void sock_inuse_add(struct net *net, int val);
150 
151 /**
152  * sk_ns_capable - General socket capability test
153  * @sk: Socket to use a capability on or through
154  * @user_ns: The user namespace of the capability to use
155  * @cap: The capability to use
156  *
157  * Test to see if the opener of the socket had the capability @cap when
158  * the socket was created and if the current process has the capability
159  * @cap in the user namespace @user_ns.
160  */
161 bool sk_ns_capable(const struct sock *sk,
162 		   struct user_namespace *user_ns, int cap)
163 {
164 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 		ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168 
169 /**
170  * sk_capable - Socket global capability test
171  * @sk: Socket to use a capability on or through
172  * @cap: The global capability to use
173  *
174  * Test to see if the opener of the socket had the capability @cap when
175  * the socket was created and if the current process has the capability
176  * @cap in all user namespaces.
177  */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 	return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183 
184 /**
185  * sk_net_capable - Network namespace socket capability test
186  * @sk: Socket to use a capability on or through
187  * @cap: The capability to use
188  *
189  * Test to see if the opener of the socket had the capability @cap when the
190  * socket was created and if the current process has the capability @cap over
191  * the network namespace the socket is a member of.
192  */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
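
/*
 * Illustrative sketch, not part of this file: a typical privilege check in
 * a protocol handler. sk_net_capable() is the usual choice when the
 * privilege is scoped to the socket's network namespace; sk_capable() is
 * for privileges that must hold globally. The function below is
 * hypothetical.
 *
 *	static int example_set_priv_option(struct sock *sk)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 */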
198 
199 /*
200  * Each address family might have different locking rules, so we have
201  * one slock key per address family and separate keys for internal and
202  * userspace sockets.
203  */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208 
209 /*
210  * Make lock validator output more readable. (we pre-construct these
211  * strings at build time, so that runtime initialization of socket
212  * locks is fast):
213  */
214 
215 #define _sock_locks(x)						  \
216   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
217   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
218   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
219   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
220   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
221   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
222   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
223   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
224   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
225   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
226   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
227   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
228   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
229   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
230   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
231   x "AF_MAX"
232 
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 	_sock_locks("sk_lock-")
235 };
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 	_sock_locks("slock-")
238 };
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 	_sock_locks("clock-")
241 };
242 
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 	_sock_locks("k-sk_lock-")
245 };
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 	_sock_locks("k-slock-")
248 };
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-clock-")
251 };
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253 	_sock_locks("rlock-")
254 };
255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
256 	_sock_locks("wlock-")
257 };
258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
259 	_sock_locks("elock-")
260 };
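
/*
 * For reference: adjacent string literals concatenate, so for example
 * _sock_locks("sk_lock-") expands (sketch, abbreviated) to
 *
 *	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET",
 *	...
 *	"sk_lock-AF_MAX"
 *
 * which is why lockdep reports name the address family directly.
 */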
261 
262 /*
263  * sk_callback_lock and sk queues locking rules are per-address-family,
264  * so split the lock classes by using a per-AF key:
265  */
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 static struct lock_class_key af_rlock_keys[AF_MAX];
268 static struct lock_class_key af_wlock_keys[AF_MAX];
269 static struct lock_class_key af_elock_keys[AF_MAX];
270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
271 
272 /* Run time adjustable parameters. */
273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
274 EXPORT_SYMBOL(sysctl_wmem_max);
275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
276 EXPORT_SYMBOL(sysctl_rmem_max);
277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
279 
280 /* Maximal space eaten by iovec or ancillary data plus some space */
281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
282 EXPORT_SYMBOL(sysctl_optmem_max);
283 
284 int sysctl_tstamp_allow_data __read_mostly = 1;
285 
286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
288 
289 /**
290  * sk_set_memalloc - sets %SOCK_MEMALLOC
291  * @sk: socket to set it on
292  *
293  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294  * It's the responsibility of the admin to adjust min_free_kbytes
296  * to meet the requirements.
296  */
297 void sk_set_memalloc(struct sock *sk)
298 {
299 	sock_set_flag(sk, SOCK_MEMALLOC);
300 	sk->sk_allocation |= __GFP_MEMALLOC;
301 	static_branch_inc(&memalloc_socks_key);
302 }
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
304 
305 void sk_clear_memalloc(struct sock *sk)
306 {
307 	sock_reset_flag(sk, SOCK_MEMALLOC);
308 	sk->sk_allocation &= ~__GFP_MEMALLOC;
309 	static_branch_dec(&memalloc_socks_key);
310 
311 	/*
312 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 	 * it has rmem allocations due to the last swapfile being deactivated
315 	 * but there is a risk that the socket is unusable due to exceeding
316 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
317 	 */
318 	sk_mem_reclaim(sk);
319 }
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
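
/*
 * Illustrative sketch, not from this file: a swap-over-network user (for
 * example a network block device backing swap) brackets the period during
 * which its socket must keep making progress under memory pressure. The
 * "nbd" structure and fields are hypothetical.
 *
 *	sk_set_memalloc(nbd->sock->sk);
 *	... socket may now dip into emergency reserves ...
 *	sk_clear_memalloc(nbd->sock->sk);
 */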
321 
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
323 {
324 	int ret;
325 	unsigned int noreclaim_flag;
326 
327 	/* these should have been dropped before queueing */
328 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
329 
330 	noreclaim_flag = memalloc_noreclaim_save();
331 	ret = sk->sk_backlog_rcv(sk, skb);
332 	memalloc_noreclaim_restore(noreclaim_flag);
333 
334 	return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337 
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
339 {
340 	struct timeval tv;
341 
342 	if (optlen < sizeof(tv))
343 		return -EINVAL;
344 	if (copy_from_user(&tv, optval, sizeof(tv)))
345 		return -EFAULT;
346 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 		return -EDOM;
348 
349 	if (tv.tv_sec < 0) {
350 		static int warned __read_mostly;
351 
352 		*timeo_p = 0;
353 		if (warned < 10 && net_ratelimit()) {
354 			warned++;
355 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 				__func__, current->comm, task_pid_nr(current));
357 		}
358 		return 0;
359 	}
360 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
361 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 		return 0;
363 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 	return 0;
366 }
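
/*
 * Worked example of the conversion above, assuming HZ == 1000: an
 * SO_RCVTIMEO of { .tv_sec = 2, .tv_usec = 500000 } becomes
 *
 *	*timeo_p = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *		 = 2000 + 500 = 2500 jiffies
 *
 * while { 0, 0 } means "block forever" (MAX_SCHEDULE_TIMEOUT) and a
 * negative tv_sec is clamped to a zero timeout with a rate-limited warning.
 */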
367 
368 static void sock_warn_obsolete_bsdism(const char *name)
369 {
370 	static int warned;
371 	static char warncomm[TASK_COMM_LEN];
372 	if (strcmp(warncomm, current->comm) && warned < 5) {
373 		strcpy(warncomm,  current->comm);
374 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 			warncomm, name);
376 		warned++;
377 	}
378 }
379 
380 static bool sock_needs_netstamp(const struct sock *sk)
381 {
382 	switch (sk->sk_family) {
383 	case AF_UNSPEC:
384 	case AF_UNIX:
385 		return false;
386 	default:
387 		return true;
388 	}
389 }
390 
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
392 {
393 	if (sk->sk_flags & flags) {
394 		sk->sk_flags &= ~flags;
395 		if (sock_needs_netstamp(sk) &&
396 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 			net_disable_timestamp();
398 	}
399 }
400 
401 
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
403 {
404 	unsigned long flags;
405 	struct sk_buff_head *list = &sk->sk_receive_queue;
406 
407 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 		atomic_inc(&sk->sk_drops);
409 		trace_sock_rcvqueue_full(sk, skb);
410 		return -ENOMEM;
411 	}
412 
413 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 		atomic_inc(&sk->sk_drops);
415 		return -ENOBUFS;
416 	}
417 
418 	skb->dev = NULL;
419 	skb_set_owner_r(skb, sk);
420 
421 	/* We escape from the RCU protected region, so make sure we
422 	 * don't leak a non-refcounted dst.
423 	 */
424 	skb_dst_force(skb);
425 
426 	spin_lock_irqsave(&list->lock, flags);
427 	sock_skb_set_dropcount(sk, skb);
428 	__skb_queue_tail(list, skb);
429 	spin_unlock_irqrestore(&list->lock, flags);
430 
431 	if (!sock_flag(sk, SOCK_DEAD))
432 		sk->sk_data_ready(sk);
433 	return 0;
434 }
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
436 
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438 {
439 	int err;
440 
441 	err = sk_filter(sk, skb);
442 	if (err)
443 		return err;
444 
445 	return __sock_queue_rcv_skb(sk, skb);
446 }
447 EXPORT_SYMBOL(sock_queue_rcv_skb);
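
/*
 * Illustrative sketch, not from this file: a datagram protocol's receive
 * hook usually hands the skb to sock_queue_rcv_skb() and frees it itself on
 * failure, since neither the filter error nor -ENOMEM/-ENOBUFS above
 * consumes the skb. The function name is hypothetical.
 *
 *	static int example_proto_rcv_skb(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */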
448 
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 		     const int nested, unsigned int trim_cap, bool refcounted)
451 {
452 	int rc = NET_RX_SUCCESS;
453 
454 	if (sk_filter_trim_cap(sk, skb, trim_cap))
455 		goto discard_and_relse;
456 
457 	skb->dev = NULL;
458 
459 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 		atomic_inc(&sk->sk_drops);
461 		goto discard_and_relse;
462 	}
463 	if (nested)
464 		bh_lock_sock_nested(sk);
465 	else
466 		bh_lock_sock(sk);
467 	if (!sock_owned_by_user(sk)) {
468 		/*
469 		 * trylock + unlock semantics:
470 		 */
471 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
472 
473 		rc = sk_backlog_rcv(sk, skb);
474 
475 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 		bh_unlock_sock(sk);
478 		atomic_inc(&sk->sk_drops);
479 		goto discard_and_relse;
480 	}
481 
482 	bh_unlock_sock(sk);
483 out:
484 	if (refcounted)
485 		sock_put(sk);
486 	return rc;
487 discard_and_relse:
488 	kfree_skb(skb);
489 	goto out;
490 }
491 EXPORT_SYMBOL(__sk_receive_skb);
492 
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
494 {
495 	struct dst_entry *dst = __sk_dst_get(sk);
496 
497 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 		sk_tx_queue_clear(sk);
499 		sk->sk_dst_pending_confirm = 0;
500 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 		dst_release(dst);
502 		return NULL;
503 	}
504 
505 	return dst;
506 }
507 EXPORT_SYMBOL(__sk_dst_check);
508 
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
510 {
511 	struct dst_entry *dst = sk_dst_get(sk);
512 
513 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 		sk_dst_reset(sk);
515 		dst_release(dst);
516 		return NULL;
517 	}
518 
519 	return dst;
520 }
521 EXPORT_SYMBOL(sk_dst_check);
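
/*
 * Illustrative sketch, not from this file: an output path revalidates the
 * cached route before using it. The cookie is protocol specific (IPv6 uses
 * its dst cookie, IPv4 passes 0); the lookup shown is a placeholder.
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst) {
 *		... redo the route lookup ...
 *		sk_dst_set(sk, dst);
 *	}
 */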
522 
523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
524 				int optlen)
525 {
526 	int ret = -ENOPROTOOPT;
527 #ifdef CONFIG_NETDEVICES
528 	struct net *net = sock_net(sk);
529 	char devname[IFNAMSIZ];
530 	int index;
531 
532 	/* Sorry... */
533 	ret = -EPERM;
534 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
535 		goto out;
536 
537 	ret = -EINVAL;
538 	if (optlen < 0)
539 		goto out;
540 
541 	/* Bind this socket to a particular device like "eth0",
542 	 * as specified in the passed interface name. If the
543 	 * name is "" or the option length is zero the socket
544 	 * is not bound.
545 	 */
546 	if (optlen > IFNAMSIZ - 1)
547 		optlen = IFNAMSIZ - 1;
548 	memset(devname, 0, sizeof(devname));
549 
550 	ret = -EFAULT;
551 	if (copy_from_user(devname, optval, optlen))
552 		goto out;
553 
554 	index = 0;
555 	if (devname[0] != '\0') {
556 		struct net_device *dev;
557 
558 		rcu_read_lock();
559 		dev = dev_get_by_name_rcu(net, devname);
560 		if (dev)
561 			index = dev->ifindex;
562 		rcu_read_unlock();
563 		ret = -ENODEV;
564 		if (!dev)
565 			goto out;
566 	}
567 
568 	lock_sock(sk);
569 	sk->sk_bound_dev_if = index;
570 	sk_dst_reset(sk);
571 	release_sock(sk);
572 
573 	ret = 0;
574 
575 out:
576 #endif
577 
578 	return ret;
579 }
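
/*
 * For reference, the userspace side of this option (illustrative only):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * An empty name or a zero option length removes the binding, and changing
 * an existing binding requires CAP_NET_RAW in the socket's namespace, as
 * checked above.
 */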
580 
581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
582 				int __user *optlen, int len)
583 {
584 	int ret = -ENOPROTOOPT;
585 #ifdef CONFIG_NETDEVICES
586 	struct net *net = sock_net(sk);
587 	char devname[IFNAMSIZ];
588 
589 	if (sk->sk_bound_dev_if == 0) {
590 		len = 0;
591 		goto zero;
592 	}
593 
594 	ret = -EINVAL;
595 	if (len < IFNAMSIZ)
596 		goto out;
597 
598 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
599 	if (ret)
600 		goto out;
601 
602 	len = strlen(devname) + 1;
603 
604 	ret = -EFAULT;
605 	if (copy_to_user(optval, devname, len))
606 		goto out;
607 
608 zero:
609 	ret = -EFAULT;
610 	if (put_user(len, optlen))
611 		goto out;
612 
613 	ret = 0;
614 
615 out:
616 #endif
617 
618 	return ret;
619 }
620 
621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
622 {
623 	if (valbool)
624 		sock_set_flag(sk, bit);
625 	else
626 		sock_reset_flag(sk, bit);
627 }
628 
629 bool sk_mc_loop(struct sock *sk)
630 {
631 	if (dev_recursion_level())
632 		return false;
633 	if (!sk)
634 		return true;
635 	switch (sk->sk_family) {
636 	case AF_INET:
637 		return inet_sk(sk)->mc_loop;
638 #if IS_ENABLED(CONFIG_IPV6)
639 	case AF_INET6:
640 		return inet6_sk(sk)->mc_loop;
641 #endif
642 	}
643 	WARN_ON_ONCE(1);
644 	return true;
645 }
646 EXPORT_SYMBOL(sk_mc_loop);
647 
648 /*
649  *	This is meant for all protocols to use and covers goings on
650  *	at the socket level. Everything here is generic.
651  */
652 
653 int sock_setsockopt(struct socket *sock, int level, int optname,
654 		    char __user *optval, unsigned int optlen)
655 {
656 	struct sock_txtime sk_txtime;
657 	struct sock *sk = sock->sk;
658 	int val;
659 	int valbool;
660 	struct linger ling;
661 	int ret = 0;
662 
663 	/*
664 	 *	Options without arguments
665 	 */
666 
667 	if (optname == SO_BINDTODEVICE)
668 		return sock_setbindtodevice(sk, optval, optlen);
669 
670 	if (optlen < sizeof(int))
671 		return -EINVAL;
672 
673 	if (get_user(val, (int __user *)optval))
674 		return -EFAULT;
675 
676 	valbool = val ? 1 : 0;
677 
678 	lock_sock(sk);
679 
680 	switch (optname) {
681 	case SO_DEBUG:
682 		if (val && !capable(CAP_NET_ADMIN))
683 			ret = -EACCES;
684 		else
685 			sock_valbool_flag(sk, SOCK_DBG, valbool);
686 		break;
687 	case SO_REUSEADDR:
688 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
689 		break;
690 	case SO_REUSEPORT:
691 		sk->sk_reuseport = valbool;
692 		break;
693 	case SO_TYPE:
694 	case SO_PROTOCOL:
695 	case SO_DOMAIN:
696 	case SO_ERROR:
697 		ret = -ENOPROTOOPT;
698 		break;
699 	case SO_DONTROUTE:
700 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
701 		sk_dst_reset(sk);
702 		break;
703 	case SO_BROADCAST:
704 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 		break;
706 	case SO_SNDBUF:
707 		/* Don't return an error on this; BSD doesn't, and if you
708 		 * think about it this is right. Otherwise apps have to
709 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 		 * are treated in BSD as hints.
711 		 */
712 		val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 		/* Wake up sending tasks if we upped the value. */
717 		sk->sk_write_space(sk);
718 		break;
719 
720 	case SO_SNDBUFFORCE:
721 		if (!capable(CAP_NET_ADMIN)) {
722 			ret = -EPERM;
723 			break;
724 		}
725 		goto set_sndbuf;
726 
727 	case SO_RCVBUF:
728 		/* Don't return an error on this; BSD doesn't, and if you
729 		 * think about it this is right. Otherwise apps have to
730 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 		 * are treated in BSD as hints.
732 		 */
733 		val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
736 		/*
737 		 * We double it on the way in to account for
738 		 * "struct sk_buff" etc. overhead.   Applications
739 		 * assume that the SO_RCVBUF setting they make will
740 		 * allow that much actual data to be received on that
741 		 * socket.
742 		 *
743 		 * Applications are unaware that "struct sk_buff" and
744 		 * other overheads allocate from the receive buffer
745 		 * during socket buffer allocation.
746 		 *
747 		 * And after considering the possible alternatives,
748 		 * returning the value we actually used in getsockopt
749 		 * is the most desirable behavior.
750 		 */
751 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 		break;
753 
754 	case SO_RCVBUFFORCE:
755 		if (!capable(CAP_NET_ADMIN)) {
756 			ret = -EPERM;
757 			break;
758 		}
759 		goto set_rcvbuf;
760 
761 	case SO_KEEPALIVE:
762 		if (sk->sk_prot->keepalive)
763 			sk->sk_prot->keepalive(sk, valbool);
764 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
765 		break;
766 
767 	case SO_OOBINLINE:
768 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
769 		break;
770 
771 	case SO_NO_CHECK:
772 		sk->sk_no_check_tx = valbool;
773 		break;
774 
775 	case SO_PRIORITY:
776 		if ((val >= 0 && val <= 6) ||
777 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
778 			sk->sk_priority = val;
779 		else
780 			ret = -EPERM;
781 		break;
782 
783 	case SO_LINGER:
784 		if (optlen < sizeof(ling)) {
785 			ret = -EINVAL;	/* 1003.1g */
786 			break;
787 		}
788 		if (copy_from_user(&ling, optval, sizeof(ling))) {
789 			ret = -EFAULT;
790 			break;
791 		}
792 		if (!ling.l_onoff)
793 			sock_reset_flag(sk, SOCK_LINGER);
794 		else {
795 #if (BITS_PER_LONG == 32)
796 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
797 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
798 			else
799 #endif
800 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
801 			sock_set_flag(sk, SOCK_LINGER);
802 		}
803 		break;
804 
805 	case SO_BSDCOMPAT:
806 		sock_warn_obsolete_bsdism("setsockopt");
807 		break;
808 
809 	case SO_PASSCRED:
810 		if (valbool)
811 			set_bit(SOCK_PASSCRED, &sock->flags);
812 		else
813 			clear_bit(SOCK_PASSCRED, &sock->flags);
814 		break;
815 
816 	case SO_TIMESTAMP:
817 	case SO_TIMESTAMPNS:
818 		if (valbool)  {
819 			if (optname == SO_TIMESTAMP)
820 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
821 			else
822 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
823 			sock_set_flag(sk, SOCK_RCVTSTAMP);
824 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
825 		} else {
826 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
827 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
828 		}
829 		break;
830 
831 	case SO_TIMESTAMPING:
832 		if (val & ~SOF_TIMESTAMPING_MASK) {
833 			ret = -EINVAL;
834 			break;
835 		}
836 
837 		if (val & SOF_TIMESTAMPING_OPT_ID &&
838 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
839 			if (sk->sk_protocol == IPPROTO_TCP &&
840 			    sk->sk_type == SOCK_STREAM) {
841 				if ((1 << sk->sk_state) &
842 				    (TCPF_CLOSE | TCPF_LISTEN)) {
843 					ret = -EINVAL;
844 					break;
845 				}
846 				sk->sk_tskey = tcp_sk(sk)->snd_una;
847 			} else {
848 				sk->sk_tskey = 0;
849 			}
850 		}
851 
852 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
853 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
854 			ret = -EINVAL;
855 			break;
856 		}
857 
858 		sk->sk_tsflags = val;
859 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
860 			sock_enable_timestamp(sk,
861 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
862 		else
863 			sock_disable_timestamp(sk,
864 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
865 		break;
866 
867 	case SO_RCVLOWAT:
868 		if (val < 0)
869 			val = INT_MAX;
870 		if (sock->ops->set_rcvlowat)
871 			ret = sock->ops->set_rcvlowat(sk, val);
872 		else
873 			sk->sk_rcvlowat = val ? : 1;
874 		break;
875 
876 	case SO_RCVTIMEO:
877 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
878 		break;
879 
880 	case SO_SNDTIMEO:
881 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
882 		break;
883 
884 	case SO_ATTACH_FILTER:
885 		ret = -EINVAL;
886 		if (optlen == sizeof(struct sock_fprog)) {
887 			struct sock_fprog fprog;
888 
889 			ret = -EFAULT;
890 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
891 				break;
892 
893 			ret = sk_attach_filter(&fprog, sk);
894 		}
895 		break;
896 
897 	case SO_ATTACH_BPF:
898 		ret = -EINVAL;
899 		if (optlen == sizeof(u32)) {
900 			u32 ufd;
901 
902 			ret = -EFAULT;
903 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
904 				break;
905 
906 			ret = sk_attach_bpf(ufd, sk);
907 		}
908 		break;
909 
910 	case SO_ATTACH_REUSEPORT_CBPF:
911 		ret = -EINVAL;
912 		if (optlen == sizeof(struct sock_fprog)) {
913 			struct sock_fprog fprog;
914 
915 			ret = -EFAULT;
916 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
917 				break;
918 
919 			ret = sk_reuseport_attach_filter(&fprog, sk);
920 		}
921 		break;
922 
923 	case SO_ATTACH_REUSEPORT_EBPF:
924 		ret = -EINVAL;
925 		if (optlen == sizeof(u32)) {
926 			u32 ufd;
927 
928 			ret = -EFAULT;
929 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
930 				break;
931 
932 			ret = sk_reuseport_attach_bpf(ufd, sk);
933 		}
934 		break;
935 
936 	case SO_DETACH_FILTER:
937 		ret = sk_detach_filter(sk);
938 		break;
939 
940 	case SO_LOCK_FILTER:
941 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
942 			ret = -EPERM;
943 		else
944 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
945 		break;
946 
947 	case SO_PASSSEC:
948 		if (valbool)
949 			set_bit(SOCK_PASSSEC, &sock->flags);
950 		else
951 			clear_bit(SOCK_PASSSEC, &sock->flags);
952 		break;
953 	case SO_MARK:
954 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
955 			ret = -EPERM;
956 		else
957 			sk->sk_mark = val;
958 		break;
959 
960 	case SO_RXQ_OVFL:
961 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
962 		break;
963 
964 	case SO_WIFI_STATUS:
965 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
966 		break;
967 
968 	case SO_PEEK_OFF:
969 		if (sock->ops->set_peek_off)
970 			ret = sock->ops->set_peek_off(sk, val);
971 		else
972 			ret = -EOPNOTSUPP;
973 		break;
974 
975 	case SO_NOFCS:
976 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
977 		break;
978 
979 	case SO_SELECT_ERR_QUEUE:
980 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
981 		break;
982 
983 #ifdef CONFIG_NET_RX_BUSY_POLL
984 	case SO_BUSY_POLL:
985 		/* allow unprivileged users to decrease the value */
986 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
987 			ret = -EPERM;
988 		else {
989 			if (val < 0)
990 				ret = -EINVAL;
991 			else
992 				sk->sk_ll_usec = val;
993 		}
994 		break;
995 #endif
996 
997 	case SO_MAX_PACING_RATE:
998 		if (val != ~0U)
999 			cmpxchg(&sk->sk_pacing_status,
1000 				SK_PACING_NONE,
1001 				SK_PACING_NEEDED);
1002 		sk->sk_max_pacing_rate = val;
1003 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004 					 sk->sk_max_pacing_rate);
1005 		break;
1006 
1007 	case SO_INCOMING_CPU:
1008 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1009 		break;
1010 
1011 	case SO_CNX_ADVICE:
1012 		if (val == 1)
1013 			dst_negative_advice(sk);
1014 		break;
1015 
1016 	case SO_ZEROCOPY:
1017 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018 			if (sk->sk_protocol != IPPROTO_TCP)
1019 				ret = -ENOTSUPP;
1020 		} else if (sk->sk_family != PF_RDS) {
1021 			ret = -ENOTSUPP;
1022 		}
1023 		if (!ret) {
1024 			if (val < 0 || val > 1)
1025 				ret = -EINVAL;
1026 			else
1027 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1028 		}
1029 		break;
1030 
1031 	case SO_TXTIME:
1032 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033 			ret = -EPERM;
1034 		} else if (optlen != sizeof(struct sock_txtime)) {
1035 			ret = -EINVAL;
1036 		} else if (copy_from_user(&sk_txtime, optval,
1037 			   sizeof(struct sock_txtime))) {
1038 			ret = -EFAULT;
1039 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040 			ret = -EINVAL;
1041 		} else {
1042 			sock_valbool_flag(sk, SOCK_TXTIME, true);
1043 			sk->sk_clockid = sk_txtime.clockid;
1044 			sk->sk_txtime_deadline_mode =
1045 				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046 			sk->sk_txtime_report_errors =
1047 				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1048 		}
1049 		break;
1050 
1051 	default:
1052 		ret = -ENOPROTOOPT;
1053 		break;
1054 	}
1055 	release_sock(sk);
1056 	return ret;
1057 }
1058 EXPORT_SYMBOL(sock_setsockopt);
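
/*
 * Worked example of the SO_SNDBUF/SO_RCVBUF doubling above, seen from
 * userspace (illustrative; assumes the request fits under
 * net.core.rmem_max):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * getsockopt() now reports 131072: the kernel stores val * 2 (floored at
 * SOCK_MIN_RCVBUF) so that struct sk_buff overhead charged against the
 * buffer still leaves roughly the requested amount of payload space.
 */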
1059 
1060 
1061 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1062 			  struct ucred *ucred)
1063 {
1064 	ucred->pid = pid_vnr(pid);
1065 	ucred->uid = ucred->gid = -1;
1066 	if (cred) {
1067 		struct user_namespace *current_ns = current_user_ns();
1068 
1069 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1070 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1071 	}
1072 }
1073 
1074 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1075 {
1076 	struct user_namespace *user_ns = current_user_ns();
1077 	int i;
1078 
1079 	for (i = 0; i < src->ngroups; i++)
1080 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1081 			return -EFAULT;
1082 
1083 	return 0;
1084 }
1085 
1086 int sock_getsockopt(struct socket *sock, int level, int optname,
1087 		    char __user *optval, int __user *optlen)
1088 {
1089 	struct sock *sk = sock->sk;
1090 
1091 	union {
1092 		int val;
1093 		u64 val64;
1094 		struct linger ling;
1095 		struct timeval tm;
1096 		struct sock_txtime txtime;
1097 	} v;
1098 
1099 	int lv = sizeof(int);
1100 	int len;
1101 
1102 	if (get_user(len, optlen))
1103 		return -EFAULT;
1104 	if (len < 0)
1105 		return -EINVAL;
1106 
1107 	memset(&v, 0, sizeof(v));
1108 
1109 	switch (optname) {
1110 	case SO_DEBUG:
1111 		v.val = sock_flag(sk, SOCK_DBG);
1112 		break;
1113 
1114 	case SO_DONTROUTE:
1115 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1116 		break;
1117 
1118 	case SO_BROADCAST:
1119 		v.val = sock_flag(sk, SOCK_BROADCAST);
1120 		break;
1121 
1122 	case SO_SNDBUF:
1123 		v.val = sk->sk_sndbuf;
1124 		break;
1125 
1126 	case SO_RCVBUF:
1127 		v.val = sk->sk_rcvbuf;
1128 		break;
1129 
1130 	case SO_REUSEADDR:
1131 		v.val = sk->sk_reuse;
1132 		break;
1133 
1134 	case SO_REUSEPORT:
1135 		v.val = sk->sk_reuseport;
1136 		break;
1137 
1138 	case SO_KEEPALIVE:
1139 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1140 		break;
1141 
1142 	case SO_TYPE:
1143 		v.val = sk->sk_type;
1144 		break;
1145 
1146 	case SO_PROTOCOL:
1147 		v.val = sk->sk_protocol;
1148 		break;
1149 
1150 	case SO_DOMAIN:
1151 		v.val = sk->sk_family;
1152 		break;
1153 
1154 	case SO_ERROR:
1155 		v.val = -sock_error(sk);
1156 		if (v.val == 0)
1157 			v.val = xchg(&sk->sk_err_soft, 0);
1158 		break;
1159 
1160 	case SO_OOBINLINE:
1161 		v.val = sock_flag(sk, SOCK_URGINLINE);
1162 		break;
1163 
1164 	case SO_NO_CHECK:
1165 		v.val = sk->sk_no_check_tx;
1166 		break;
1167 
1168 	case SO_PRIORITY:
1169 		v.val = sk->sk_priority;
1170 		break;
1171 
1172 	case SO_LINGER:
1173 		lv		= sizeof(v.ling);
1174 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1175 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1176 		break;
1177 
1178 	case SO_BSDCOMPAT:
1179 		sock_warn_obsolete_bsdism("getsockopt");
1180 		break;
1181 
1182 	case SO_TIMESTAMP:
1183 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1184 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1185 		break;
1186 
1187 	case SO_TIMESTAMPNS:
1188 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1189 		break;
1190 
1191 	case SO_TIMESTAMPING:
1192 		v.val = sk->sk_tsflags;
1193 		break;
1194 
1195 	case SO_RCVTIMEO:
1196 		lv = sizeof(struct timeval);
1197 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1198 			v.tm.tv_sec = 0;
1199 			v.tm.tv_usec = 0;
1200 		} else {
1201 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1202 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1203 		}
1204 		break;
1205 
1206 	case SO_SNDTIMEO:
1207 		lv = sizeof(struct timeval);
1208 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1209 			v.tm.tv_sec = 0;
1210 			v.tm.tv_usec = 0;
1211 		} else {
1212 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1213 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1214 		}
1215 		break;
1216 
1217 	case SO_RCVLOWAT:
1218 		v.val = sk->sk_rcvlowat;
1219 		break;
1220 
1221 	case SO_SNDLOWAT:
1222 		v.val = 1;
1223 		break;
1224 
1225 	case SO_PASSCRED:
1226 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1227 		break;
1228 
1229 	case SO_PEERCRED:
1230 	{
1231 		struct ucred peercred;
1232 		if (len > sizeof(peercred))
1233 			len = sizeof(peercred);
1234 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1235 		if (copy_to_user(optval, &peercred, len))
1236 			return -EFAULT;
1237 		goto lenout;
1238 	}
1239 
1240 	case SO_PEERGROUPS:
1241 	{
1242 		int ret, n;
1243 
1244 		if (!sk->sk_peer_cred)
1245 			return -ENODATA;
1246 
1247 		n = sk->sk_peer_cred->group_info->ngroups;
1248 		if (len < n * sizeof(gid_t)) {
1249 			len = n * sizeof(gid_t);
1250 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1251 		}
1252 		len = n * sizeof(gid_t);
1253 
1254 		ret = groups_to_user((gid_t __user *)optval,
1255 				     sk->sk_peer_cred->group_info);
1256 		if (ret)
1257 			return ret;
1258 		goto lenout;
1259 	}
1260 
1261 	case SO_PEERNAME:
1262 	{
1263 		char address[128];
1264 
1265 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1266 		if (lv < 0)
1267 			return -ENOTCONN;
1268 		if (lv < len)
1269 			return -EINVAL;
1270 		if (copy_to_user(optval, address, len))
1271 			return -EFAULT;
1272 		goto lenout;
1273 	}
1274 
1275 	/* Dubious BSD thing... Probably nobody even uses it, but
1276 	 * the UNIX standard wants it for whatever reason... -DaveM
1277 	 */
1278 	case SO_ACCEPTCONN:
1279 		v.val = sk->sk_state == TCP_LISTEN;
1280 		break;
1281 
1282 	case SO_PASSSEC:
1283 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1284 		break;
1285 
1286 	case SO_PEERSEC:
1287 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1288 
1289 	case SO_MARK:
1290 		v.val = sk->sk_mark;
1291 		break;
1292 
1293 	case SO_RXQ_OVFL:
1294 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1295 		break;
1296 
1297 	case SO_WIFI_STATUS:
1298 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1299 		break;
1300 
1301 	case SO_PEEK_OFF:
1302 		if (!sock->ops->set_peek_off)
1303 			return -EOPNOTSUPP;
1304 
1305 		v.val = sk->sk_peek_off;
1306 		break;
1307 	case SO_NOFCS:
1308 		v.val = sock_flag(sk, SOCK_NOFCS);
1309 		break;
1310 
1311 	case SO_BINDTODEVICE:
1312 		return sock_getbindtodevice(sk, optval, optlen, len);
1313 
1314 	case SO_GET_FILTER:
1315 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1316 		if (len < 0)
1317 			return len;
1318 
1319 		goto lenout;
1320 
1321 	case SO_LOCK_FILTER:
1322 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1323 		break;
1324 
1325 	case SO_BPF_EXTENSIONS:
1326 		v.val = bpf_tell_extensions();
1327 		break;
1328 
1329 	case SO_SELECT_ERR_QUEUE:
1330 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1331 		break;
1332 
1333 #ifdef CONFIG_NET_RX_BUSY_POLL
1334 	case SO_BUSY_POLL:
1335 		v.val = sk->sk_ll_usec;
1336 		break;
1337 #endif
1338 
1339 	case SO_MAX_PACING_RATE:
1340 		v.val = sk->sk_max_pacing_rate;
1341 		break;
1342 
1343 	case SO_INCOMING_CPU:
1344 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1345 		break;
1346 
1347 	case SO_MEMINFO:
1348 	{
1349 		u32 meminfo[SK_MEMINFO_VARS];
1350 
1351 		sk_get_meminfo(sk, meminfo);
1352 
1353 		len = min_t(unsigned int, len, sizeof(meminfo));
1354 		if (copy_to_user(optval, &meminfo, len))
1355 			return -EFAULT;
1356 
1357 		goto lenout;
1358 	}
1359 
1360 #ifdef CONFIG_NET_RX_BUSY_POLL
1361 	case SO_INCOMING_NAPI_ID:
1362 		v.val = READ_ONCE(sk->sk_napi_id);
1363 
1364 		/* aggregate non-NAPI IDs down to 0 */
1365 		if (v.val < MIN_NAPI_ID)
1366 			v.val = 0;
1367 
1368 		break;
1369 #endif
1370 
1371 	case SO_COOKIE:
1372 		lv = sizeof(u64);
1373 		if (len < lv)
1374 			return -EINVAL;
1375 		v.val64 = sock_gen_cookie(sk);
1376 		break;
1377 
1378 	case SO_ZEROCOPY:
1379 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1380 		break;
1381 
1382 	case SO_TXTIME:
1383 		lv = sizeof(v.txtime);
1384 		v.txtime.clockid = sk->sk_clockid;
1385 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1386 				  SOF_TXTIME_DEADLINE_MODE : 0;
1387 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1388 				  SOF_TXTIME_REPORT_ERRORS : 0;
1389 		break;
1390 
1391 	default:
1392 		/* We implement the SO_SNDLOWAT etc to not be settable
1393 		 * (1003.1g 7).
1394 		 */
1395 		return -ENOPROTOOPT;
1396 	}
1397 
1398 	if (len > lv)
1399 		len = lv;
1400 	if (copy_to_user(optval, &v, len))
1401 		return -EFAULT;
1402 lenout:
1403 	if (put_user(len, optlen))
1404 		return -EFAULT;
1405 	return 0;
1406 }
1407 
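/*
 * Illustrative userspace counterpart for one of the options above; as in
 * the kernel code, the returned length is clamped to the smaller of the
 * caller's buffer and the option's natural size.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid %d uid %u\n", peer.pid, peer.uid);
 */
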
1408 /*
1409  * Initialize an sk_lock.
1410  *
1411  * (We also register the sk_lock with the lock validator.)
1412  */
1413 static inline void sock_lock_init(struct sock *sk)
1414 {
1415 	if (sk->sk_kern_sock)
1416 		sock_lock_init_class_and_name(
1417 			sk,
1418 			af_family_kern_slock_key_strings[sk->sk_family],
1419 			af_family_kern_slock_keys + sk->sk_family,
1420 			af_family_kern_key_strings[sk->sk_family],
1421 			af_family_kern_keys + sk->sk_family);
1422 	else
1423 		sock_lock_init_class_and_name(
1424 			sk,
1425 			af_family_slock_key_strings[sk->sk_family],
1426 			af_family_slock_keys + sk->sk_family,
1427 			af_family_key_strings[sk->sk_family],
1428 			af_family_keys + sk->sk_family);
1429 }
1430 
1431 /*
1432  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1433  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1434  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1435  */
1436 static void sock_copy(struct sock *nsk, const struct sock *osk)
1437 {
1438 #ifdef CONFIG_SECURITY_NETWORK
1439 	void *sptr = nsk->sk_security;
1440 #endif
1441 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1442 
1443 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1444 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1445 
1446 #ifdef CONFIG_SECURITY_NETWORK
1447 	nsk->sk_security = sptr;
1448 	security_sk_clone(osk, nsk);
1449 #endif
1450 }
1451 
1452 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1453 		int family)
1454 {
1455 	struct sock *sk;
1456 	struct kmem_cache *slab;
1457 
1458 	slab = prot->slab;
1459 	if (slab != NULL) {
1460 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1461 		if (!sk)
1462 			return sk;
1463 		if (priority & __GFP_ZERO)
1464 			sk_prot_clear_nulls(sk, prot->obj_size);
1465 	} else
1466 		sk = kmalloc(prot->obj_size, priority);
1467 
1468 	if (sk != NULL) {
1469 		if (security_sk_alloc(sk, family, priority))
1470 			goto out_free;
1471 
1472 		if (!try_module_get(prot->owner))
1473 			goto out_free_sec;
1474 		sk_tx_queue_clear(sk);
1475 	}
1476 
1477 	return sk;
1478 
1479 out_free_sec:
1480 	security_sk_free(sk);
1481 out_free:
1482 	if (slab != NULL)
1483 		kmem_cache_free(slab, sk);
1484 	else
1485 		kfree(sk);
1486 	return NULL;
1487 }
1488 
1489 static void sk_prot_free(struct proto *prot, struct sock *sk)
1490 {
1491 	struct kmem_cache *slab;
1492 	struct module *owner;
1493 
1494 	owner = prot->owner;
1495 	slab = prot->slab;
1496 
1497 	cgroup_sk_free(&sk->sk_cgrp_data);
1498 	mem_cgroup_sk_free(sk);
1499 	security_sk_free(sk);
1500 	if (slab != NULL)
1501 		kmem_cache_free(slab, sk);
1502 	else
1503 		kfree(sk);
1504 	module_put(owner);
1505 }
1506 
1507 /**
1508  *	sk_alloc - All socket objects are allocated here
1509  *	@net: the applicable net namespace
1510  *	@family: protocol family
1511  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1512  *	@prot: struct proto associated with this new sock instance
1513  *	@kern: is this to be a kernel socket?
1514  */
1515 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1516 		      struct proto *prot, int kern)
1517 {
1518 	struct sock *sk;
1519 
1520 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1521 	if (sk) {
1522 		sk->sk_family = family;
1523 		/*
1524 		 * See comment in struct sock definition to understand
1525 		 * why we need sk_prot_creator -acme
1526 		 */
1527 		sk->sk_prot = sk->sk_prot_creator = prot;
1528 		sk->sk_kern_sock = kern;
1529 		sock_lock_init(sk);
1530 		sk->sk_net_refcnt = kern ? 0 : 1;
1531 		if (likely(sk->sk_net_refcnt)) {
1532 			get_net(net);
1533 			sock_inuse_add(net, 1);
1534 		}
1535 
1536 		sock_net_set(sk, net);
1537 		refcount_set(&sk->sk_wmem_alloc, 1);
1538 
1539 		mem_cgroup_sk_alloc(sk);
1540 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1541 		sock_update_classid(&sk->sk_cgrp_data);
1542 		sock_update_netprioidx(&sk->sk_cgrp_data);
1543 		sk_tx_queue_clear(sk);
1544 	}
1545 
1546 	return sk;
1547 }
1548 EXPORT_SYMBOL(sk_alloc);
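
/*
 * Illustrative sketch, not from this file, of the usual call site: a
 * protocol family's ->create() hook allocates the sock and then runs the
 * generic initialisation. "PF_EXAMPLE" and "example_proto" are
 * hypothetical.
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */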
1549 
1550 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1551  * grace period. This is the case for UDP sockets and TCP listeners.
1552  */
1553 static void __sk_destruct(struct rcu_head *head)
1554 {
1555 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1556 	struct sk_filter *filter;
1557 
1558 	if (sk->sk_destruct)
1559 		sk->sk_destruct(sk);
1560 
1561 	filter = rcu_dereference_check(sk->sk_filter,
1562 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1563 	if (filter) {
1564 		sk_filter_uncharge(sk, filter);
1565 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1566 	}
1567 
1568 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1569 
1570 	if (atomic_read(&sk->sk_omem_alloc))
1571 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1572 			 __func__, atomic_read(&sk->sk_omem_alloc));
1573 
1574 	if (sk->sk_frag.page) {
1575 		put_page(sk->sk_frag.page);
1576 		sk->sk_frag.page = NULL;
1577 	}
1578 
1579 	if (sk->sk_peer_cred)
1580 		put_cred(sk->sk_peer_cred);
1581 	put_pid(sk->sk_peer_pid);
1582 	if (likely(sk->sk_net_refcnt))
1583 		put_net(sock_net(sk));
1584 	sk_prot_free(sk->sk_prot_creator, sk);
1585 }
1586 
1587 void sk_destruct(struct sock *sk)
1588 {
1589 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1590 
1591 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1592 		reuseport_detach_sock(sk);
1593 		use_call_rcu = true;
1594 	}
1595 
1596 	if (use_call_rcu)
1597 		call_rcu(&sk->sk_rcu, __sk_destruct);
1598 	else
1599 		__sk_destruct(&sk->sk_rcu);
1600 }
1601 
1602 static void __sk_free(struct sock *sk)
1603 {
1604 	if (likely(sk->sk_net_refcnt))
1605 		sock_inuse_add(sock_net(sk), -1);
1606 
1607 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1608 		sock_diag_broadcast_destroy(sk);
1609 	else
1610 		sk_destruct(sk);
1611 }
1612 
1613 void sk_free(struct sock *sk)
1614 {
1615 	/*
1616 	 * We subtract one from sk_wmem_alloc and can know if
1617 	 * some packets are still in some tx queue.
1618 	 * If not null, sock_wfree() will call __sk_free(sk) later
1619 	 */
1620 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1621 		__sk_free(sk);
1622 }
1623 EXPORT_SYMBOL(sk_free);
1624 
1625 static void sk_init_common(struct sock *sk)
1626 {
1627 	skb_queue_head_init(&sk->sk_receive_queue);
1628 	skb_queue_head_init(&sk->sk_write_queue);
1629 	skb_queue_head_init(&sk->sk_error_queue);
1630 
1631 	rwlock_init(&sk->sk_callback_lock);
1632 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1633 			af_rlock_keys + sk->sk_family,
1634 			af_family_rlock_key_strings[sk->sk_family]);
1635 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1636 			af_wlock_keys + sk->sk_family,
1637 			af_family_wlock_key_strings[sk->sk_family]);
1638 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1639 			af_elock_keys + sk->sk_family,
1640 			af_family_elock_key_strings[sk->sk_family]);
1641 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1642 			af_callback_keys + sk->sk_family,
1643 			af_family_clock_key_strings[sk->sk_family]);
1644 }
1645 
1646 /**
1647  *	sk_clone_lock - clone a socket, and lock its clone
1648  *	@sk: the socket to clone
1649  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1650  *
1651  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1652  */
1653 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1654 {
1655 	struct sock *newsk;
1656 	bool is_charged = true;
1657 
1658 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1659 	if (newsk != NULL) {
1660 		struct sk_filter *filter;
1661 
1662 		sock_copy(newsk, sk);
1663 
1664 		newsk->sk_prot_creator = sk->sk_prot;
1665 
1666 		/* SANITY */
1667 		if (likely(newsk->sk_net_refcnt))
1668 			get_net(sock_net(newsk));
1669 		sk_node_init(&newsk->sk_node);
1670 		sock_lock_init(newsk);
1671 		bh_lock_sock(newsk);
1672 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1673 		newsk->sk_backlog.len = 0;
1674 
1675 		atomic_set(&newsk->sk_rmem_alloc, 0);
1676 		/*
1677 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1678 		 */
1679 		refcount_set(&newsk->sk_wmem_alloc, 1);
1680 		atomic_set(&newsk->sk_omem_alloc, 0);
1681 		sk_init_common(newsk);
1682 
1683 		newsk->sk_dst_cache	= NULL;
1684 		newsk->sk_dst_pending_confirm = 0;
1685 		newsk->sk_wmem_queued	= 0;
1686 		newsk->sk_forward_alloc = 0;
1687 		atomic_set(&newsk->sk_drops, 0);
1688 		newsk->sk_send_head	= NULL;
1689 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1690 		atomic_set(&newsk->sk_zckey, 0);
1691 
1692 		sock_reset_flag(newsk, SOCK_DONE);
1693 
1694 		/* sk->sk_memcg will be populated at accept() time */
1695 		newsk->sk_memcg = NULL;
1696 
1697 		cgroup_sk_clone(&newsk->sk_cgrp_data);
1698 
1699 		rcu_read_lock();
1700 		filter = rcu_dereference(sk->sk_filter);
1701 		if (filter != NULL)
1702 			/* though it's an empty new sock, the charging may fail
1703 			 * if sysctl_optmem_max was changed between creation of
1704 			 * original socket and cloning
1705 			 */
1706 			is_charged = sk_filter_charge(newsk, filter);
1707 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1708 		rcu_read_unlock();
1709 
1710 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1711 			/* We need to make sure that we don't uncharge the new
1712 			 * socket if we couldn't charge it in the first place
1713 			 * as otherwise we uncharge the parent's filter.
1714 			 */
1715 			if (!is_charged)
1716 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1717 			sk_free_unlock_clone(newsk);
1718 			newsk = NULL;
1719 			goto out;
1720 		}
1721 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1722 
1723 		newsk->sk_err	   = 0;
1724 		newsk->sk_err_soft = 0;
1725 		newsk->sk_priority = 0;
1726 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1727 		atomic64_set(&newsk->sk_cookie, 0);
1728 		if (likely(newsk->sk_net_refcnt))
1729 			sock_inuse_add(sock_net(newsk), 1);
1730 
1731 		/*
1732 		 * Before updating sk_refcnt, we must commit prior changes to memory
1733 		 * (Documentation/RCU/rculist_nulls.txt for details)
1734 		 */
1735 		smp_wmb();
1736 		refcount_set(&newsk->sk_refcnt, 2);
1737 
1738 		/*
1739 		 * Increment the counter in the same struct proto as the master
1740 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1741 		 * is the same as sk->sk_prot->socks, as this field was copied
1742 		 * with memcpy).
1743 		 *
1744 		 * This _changes_ the previous behaviour, where
1745 		 * tcp_create_openreq_child always was incrementing the
1746 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1747 		 * to be taken into account in all callers. -acme
1748 		 */
1749 		sk_refcnt_debug_inc(newsk);
1750 		sk_set_socket(newsk, NULL);
1751 		sk_tx_queue_clear(newsk);
1752 		newsk->sk_wq = NULL;
1753 
1754 		if (newsk->sk_prot->sockets_allocated)
1755 			sk_sockets_allocated_inc(newsk);
1756 
1757 		if (sock_needs_netstamp(sk) &&
1758 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1759 			net_enable_timestamp();
1760 	}
1761 out:
1762 	return newsk;
1763 }
1764 EXPORT_SYMBOL_GPL(sk_clone_lock);
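
/*
 * Illustrative sketch, not from this file: the typical caller is a
 * connection-oriented protocol's accept path. Whatever happens afterwards,
 * the clone comes back locked with bh_lock_sock() and the caller must
 * release it, as the comment above notes.
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific child setup ...
 *		bh_unlock_sock(newsk);
 *	}
 */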
1765 
1766 void sk_free_unlock_clone(struct sock *sk)
1767 {
1768 	/* It is still a raw copy of the parent, so invalidate
1769 	 * the destructor and do a plain sk_free() */
1770 	sk->sk_destruct = NULL;
1771 	bh_unlock_sock(sk);
1772 	sk_free(sk);
1773 }
1774 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1775 
1776 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1777 {
1778 	u32 max_segs = 1;
1779 
1780 	sk_dst_set(sk, dst);
1781 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1782 	if (sk->sk_route_caps & NETIF_F_GSO)
1783 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1784 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1785 	if (sk_can_gso(sk)) {
1786 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1787 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1788 		} else {
1789 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1790 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1791 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1792 		}
1793 	}
1794 	sk->sk_gso_max_segs = max_segs;
1795 }
1796 EXPORT_SYMBOL_GPL(sk_setup_caps);
1797 
1798 /*
1799  *	Simple resource managers for sockets.
1800  */
1801 
1802 
1803 /*
1804  * Write buffer destructor automatically called from kfree_skb.
1805  */
1806 void sock_wfree(struct sk_buff *skb)
1807 {
1808 	struct sock *sk = skb->sk;
1809 	unsigned int len = skb->truesize;
1810 
1811 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1812 		/*
1813 		 * Keep a reference on sk_wmem_alloc, this will be released
1814 		 * after sk_write_space() call
1815 		 */
1816 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1817 		sk->sk_write_space(sk);
1818 		len = 1;
1819 	}
1820 	/*
1821 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1822 	 * could not do because of in-flight packets
1823 	 */
1824 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1825 		__sk_free(sk);
1826 }
1827 EXPORT_SYMBOL(sock_wfree);
1828 
1829 /* This variant of sock_wfree() is used by TCP,
1830  * since it sets SOCK_USE_WRITE_QUEUE.
1831  */
1832 void __sock_wfree(struct sk_buff *skb)
1833 {
1834 	struct sock *sk = skb->sk;
1835 
1836 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1837 		__sk_free(sk);
1838 }
1839 
1840 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1841 {
1842 	skb_orphan(skb);
1843 	skb->sk = sk;
1844 #ifdef CONFIG_INET
1845 	if (unlikely(!sk_fullsock(sk))) {
1846 		skb->destructor = sock_edemux;
1847 		sock_hold(sk);
1848 		return;
1849 	}
1850 #endif
1851 	skb->destructor = sock_wfree;
1852 	skb_set_hash_from_sk(skb, sk);
1853 	/*
1854 	 * We used to take a refcount on sk, but following operation
1855 	 * is enough to guarantee sk_free() won't free this sock until
1856 	 * all in-flight packets are completed
1857 	 */
1858 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1859 }
1860 EXPORT_SYMBOL(skb_set_owner_w);
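
/*
 * Illustrative sketch, not from this file: a transmit path charges each skb
 * to the socket's write allocation so that sock_wfree() can release the
 * space and wake writers at TX completion time. "headroom" is a
 * hypothetical reserve.
 *
 *	skb = alloc_skb(len + headroom, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_reserve(skb, headroom);
 *	skb_set_owner_w(skb, sk);	(adds skb->truesize to sk_wmem_alloc)
 *
 * sock_alloc_send_skb() wraps this pattern together with sndbuf blocking.
 */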
1861 
1862 /* This helper is used by netem, as it can hold packets in its
1863  * delay queue. We want to allow the owner socket to send more
1864  * packets, as if they were already TX completed by a typical driver.
1865  * But we also want to keep skb->sk set because some packet schedulers
1866  * rely on it (sch_fq for example).
1867  */
1868 void skb_orphan_partial(struct sk_buff *skb)
1869 {
1870 	if (skb_is_tcp_pure_ack(skb))
1871 		return;
1872 
1873 	if (skb->destructor == sock_wfree
1874 #ifdef CONFIG_INET
1875 	    || skb->destructor == tcp_wfree
1876 #endif
1877 		) {
1878 		struct sock *sk = skb->sk;
1879 
1880 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1881 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1882 			skb->destructor = sock_efree;
1883 		}
1884 	} else {
1885 		skb_orphan(skb);
1886 	}
1887 }
1888 EXPORT_SYMBOL(skb_orphan_partial);
1889 
1890 /*
1891  * Read buffer destructor automatically called from kfree_skb.
1892  */
1893 void sock_rfree(struct sk_buff *skb)
1894 {
1895 	struct sock *sk = skb->sk;
1896 	unsigned int len = skb->truesize;
1897 
1898 	atomic_sub(len, &sk->sk_rmem_alloc);
1899 	sk_mem_uncharge(sk, len);
1900 }
1901 EXPORT_SYMBOL(sock_rfree);
1902 
1903 /*
1904  * Buffer destructor for skbs that are not used directly in read or write
1905  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1906  */
1907 void sock_efree(struct sk_buff *skb)
1908 {
1909 	sock_put(skb->sk);
1910 }
1911 EXPORT_SYMBOL(sock_efree);
1912 
1913 kuid_t sock_i_uid(struct sock *sk)
1914 {
1915 	kuid_t uid;
1916 
1917 	read_lock_bh(&sk->sk_callback_lock);
1918 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1919 	read_unlock_bh(&sk->sk_callback_lock);
1920 	return uid;
1921 }
1922 EXPORT_SYMBOL(sock_i_uid);
1923 
1924 unsigned long sock_i_ino(struct sock *sk)
1925 {
1926 	unsigned long ino;
1927 
1928 	read_lock_bh(&sk->sk_callback_lock);
1929 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1930 	read_unlock_bh(&sk->sk_callback_lock);
1931 	return ino;
1932 }
1933 EXPORT_SYMBOL(sock_i_ino);
1934 
1935 /*
1936  * Allocate a skb from the socket's send buffer.
1937  */
1938 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1939 			     gfp_t priority)
1940 {
1941 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1942 		struct sk_buff *skb = alloc_skb(size, priority);
1943 		if (skb) {
1944 			skb_set_owner_w(skb, sk);
1945 			return skb;
1946 		}
1947 	}
1948 	return NULL;
1949 }
1950 EXPORT_SYMBOL(sock_wmalloc);
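
/* Example: a minimal sketch of a caller charging a reply skb to the send
 * buffer (hypothetical caller and sizes, not taken from this file):
 *
 *	struct sk_buff *skb;
 *
 *	skb = sock_wmalloc(sk, MAX_HEADER + len, 0, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_reserve(skb, MAX_HEADER);
 *	(build and send the packet; sock_wfree() runs when it is freed)
 */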
1951 
1952 static void sock_ofree(struct sk_buff *skb)
1953 {
1954 	struct sock *sk = skb->sk;
1955 
1956 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1957 }
1958 
1959 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1960 			     gfp_t priority)
1961 {
1962 	struct sk_buff *skb;
1963 
1964 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1965 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1966 	    sysctl_optmem_max)
1967 		return NULL;
1968 
1969 	skb = alloc_skb(size, priority);
1970 	if (!skb)
1971 		return NULL;
1972 
1973 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1974 	skb->sk = sk;
1975 	skb->destructor = sock_ofree;
1976 	return skb;
1977 }
1978 
1979 /*
1980  * Allocate a memory block from the socket's option memory buffer.
1981  */
1982 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1983 {
1984 	if ((unsigned int)size <= sysctl_optmem_max &&
1985 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1986 		void *mem;
1987 		/* First do the add, to avoid the race if kmalloc
1988 		 * might sleep.
1989 		 */
1990 		atomic_add(size, &sk->sk_omem_alloc);
1991 		mem = kmalloc(size, priority);
1992 		if (mem)
1993 			return mem;
1994 		atomic_sub(size, &sk->sk_omem_alloc);
1995 	}
1996 	return NULL;
1997 }
1998 EXPORT_SYMBOL(sock_kmalloc);
1999 
2000 /* Free an option memory block. Note, we actually want the inline
2001  * here as this allows gcc to detect the nullify and fold away the
2002  * condition entirely.
2003  */
2004 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2005 				  const bool nullify)
2006 {
2007 	if (WARN_ON_ONCE(!mem))
2008 		return;
2009 	if (nullify)
2010 		kzfree(mem);
2011 	else
2012 		kfree(mem);
2013 	atomic_sub(size, &sk->sk_omem_alloc);
2014 }
2015 
2016 void sock_kfree_s(struct sock *sk, void *mem, int size)
2017 {
2018 	__sock_kfree_s(sk, mem, size, false);
2019 }
2020 EXPORT_SYMBOL(sock_kfree_s);
2021 
2022 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2023 {
2024 	__sock_kfree_s(sk, mem, size, true);
2025 }
2026 EXPORT_SYMBOL(sock_kzfree_s);
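
/* Example: a minimal sketch of option-memory usage (hypothetical caller).
 * The size passed to sock_kfree_s()/sock_kzfree_s() must match the size
 * passed to sock_kmalloc() so sk_omem_alloc stays balanced:
 *
 *	void *opt;
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOMEM;
 *	(use the buffer, e.g. for a per-socket option)
 *	sock_kzfree_s(sk, opt, optlen);
 */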
2027 
2028 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2029    I think these locks should be removed for datagram sockets.
2030  */
2031 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2032 {
2033 	DEFINE_WAIT(wait);
2034 
2035 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2036 	for (;;) {
2037 		if (!timeo)
2038 			break;
2039 		if (signal_pending(current))
2040 			break;
2041 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2042 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2043 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2044 			break;
2045 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2046 			break;
2047 		if (sk->sk_err)
2048 			break;
2049 		timeo = schedule_timeout(timeo);
2050 	}
2051 	finish_wait(sk_sleep(sk), &wait);
2052 	return timeo;
2053 }
2054 
2055 
2056 /*
2057  *	Generic send/receive buffer handlers
2058  */
2059 
2060 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2061 				     unsigned long data_len, int noblock,
2062 				     int *errcode, int max_page_order)
2063 {
2064 	struct sk_buff *skb;
2065 	long timeo;
2066 	int err;
2067 
2068 	timeo = sock_sndtimeo(sk, noblock);
2069 	for (;;) {
2070 		err = sock_error(sk);
2071 		if (err != 0)
2072 			goto failure;
2073 
2074 		err = -EPIPE;
2075 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2076 			goto failure;
2077 
2078 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2079 			break;
2080 
2081 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2082 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2083 		err = -EAGAIN;
2084 		if (!timeo)
2085 			goto failure;
2086 		if (signal_pending(current))
2087 			goto interrupted;
2088 		timeo = sock_wait_for_wmem(sk, timeo);
2089 	}
2090 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2091 				   errcode, sk->sk_allocation);
2092 	if (skb)
2093 		skb_set_owner_w(skb, sk);
2094 	return skb;
2095 
2096 interrupted:
2097 	err = sock_intr_errno(timeo);
2098 failure:
2099 	*errcode = err;
2100 	return NULL;
2101 }
2102 EXPORT_SYMBOL(sock_alloc_send_pskb);
2103 
2104 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2105 				    int noblock, int *errcode)
2106 {
2107 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2108 }
2109 EXPORT_SYMBOL(sock_alloc_send_skb);
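
/* Example: simplified datagram send path (illustrative sketch, not taken
 * from this file). The helper sleeps for send-buffer space unless the
 * caller passed MSG_DONTWAIT:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 */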
2110 
2111 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2112 		     struct sockcm_cookie *sockc)
2113 {
2114 	u32 tsflags;
2115 
2116 	switch (cmsg->cmsg_type) {
2117 	case SO_MARK:
2118 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2119 			return -EPERM;
2120 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2121 			return -EINVAL;
2122 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2123 		break;
2124 	case SO_TIMESTAMPING:
2125 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 			return -EINVAL;
2127 
2128 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2129 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2130 			return -EINVAL;
2131 
2132 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2133 		sockc->tsflags |= tsflags;
2134 		break;
2135 	case SCM_TXTIME:
2136 		if (!sock_flag(sk, SOCK_TXTIME))
2137 			return -EINVAL;
2138 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2139 			return -EINVAL;
2140 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2141 		break;
2142 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2143 	case SCM_RIGHTS:
2144 	case SCM_CREDENTIALS:
2145 		break;
2146 	default:
2147 		return -EINVAL;
2148 	}
2149 	return 0;
2150 }
2151 EXPORT_SYMBOL(__sock_cmsg_send);
2152 
2153 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2154 		   struct sockcm_cookie *sockc)
2155 {
2156 	struct cmsghdr *cmsg;
2157 	int ret;
2158 
2159 	for_each_cmsghdr(cmsg, msg) {
2160 		if (!CMSG_OK(msg, cmsg))
2161 			return -EINVAL;
2162 		if (cmsg->cmsg_level != SOL_SOCKET)
2163 			continue;
2164 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2165 		if (ret)
2166 			return ret;
2167 	}
2168 	return 0;
2169 }
2170 EXPORT_SYMBOL(sock_cmsg_send);
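
/* Example: typical sendmsg()-path usage (sketch). A protocol seeds the
 * cookie from the socket's defaults and lets SOL_SOCKET control messages
 * override it:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */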
2171 
2172 static void sk_enter_memory_pressure(struct sock *sk)
2173 {
2174 	if (!sk->sk_prot->enter_memory_pressure)
2175 		return;
2176 
2177 	sk->sk_prot->enter_memory_pressure(sk);
2178 }
2179 
2180 static void sk_leave_memory_pressure(struct sock *sk)
2181 {
2182 	if (sk->sk_prot->leave_memory_pressure) {
2183 		sk->sk_prot->leave_memory_pressure(sk);
2184 	} else {
2185 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2186 
2187 		if (memory_pressure && READ_ONCE(*memory_pressure))
2188 			WRITE_ONCE(*memory_pressure, 0);
2189 	}
2190 }
2191 
2192 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2193 
2194 /**
2195  * skb_page_frag_refill - check that a page_frag contains enough room
2196  * @sz: minimum size of the fragment we want to get
2197  * @pfrag: pointer to page_frag
2198  * @gfp: priority for memory allocation
2199  *
2200  * Note: While this allocator tries to use high order pages, there is
2201  * no guarantee that allocations succeed. Therefore, @sz MUST be
2202  * less than or equal to PAGE_SIZE.
2203  */
2204 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2205 {
2206 	if (pfrag->page) {
2207 		if (page_ref_count(pfrag->page) == 1) {
2208 			pfrag->offset = 0;
2209 			return true;
2210 		}
2211 		if (pfrag->offset + sz <= pfrag->size)
2212 			return true;
2213 		put_page(pfrag->page);
2214 	}
2215 
2216 	pfrag->offset = 0;
2217 	if (SKB_FRAG_PAGE_ORDER) {
2218 		/* Avoid direct reclaim but allow kswapd to wake */
2219 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2220 					  __GFP_COMP | __GFP_NOWARN |
2221 					  __GFP_NORETRY,
2222 					  SKB_FRAG_PAGE_ORDER);
2223 		if (likely(pfrag->page)) {
2224 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2225 			return true;
2226 		}
2227 	}
2228 	pfrag->page = alloc_page(gfp);
2229 	if (likely(pfrag->page)) {
2230 		pfrag->size = PAGE_SIZE;
2231 		return true;
2232 	}
2233 	return false;
2234 }
2235 EXPORT_SYMBOL(skb_page_frag_refill);
2236 
2237 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2238 {
2239 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2240 		return true;
2241 
2242 	sk_enter_memory_pressure(sk);
2243 	sk_stream_moderate_sndbuf(sk);
2244 	return false;
2245 }
2246 EXPORT_SYMBOL(sk_page_frag_refill);
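
/* Example: filling the per-socket page_frag (illustrative sketch,
 * hypothetical caller):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOMEM;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	(copy "copy" bytes to page_address(pfrag->page) + pfrag->offset,
 *	 or attach the page as an skb frag, then advance pfrag->offset)
 */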
2247 
2248 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2249 		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2250 		int first_coalesce)
2251 {
2252 	int sg_curr = *sg_curr_index, use = 0, rc = 0;
2253 	unsigned int size = *sg_curr_size;
2254 	struct page_frag *pfrag;
2255 	struct scatterlist *sge;
2256 
2257 	len -= size;
2258 	pfrag = sk_page_frag(sk);
2259 
2260 	while (len > 0) {
2261 		unsigned int orig_offset;
2262 
2263 		if (!sk_page_frag_refill(sk, pfrag)) {
2264 			rc = -ENOMEM;
2265 			goto out;
2266 		}
2267 
2268 		use = min_t(int, len, pfrag->size - pfrag->offset);
2269 
2270 		if (!sk_wmem_schedule(sk, use)) {
2271 			rc = -ENOMEM;
2272 			goto out;
2273 		}
2274 
2275 		sk_mem_charge(sk, use);
2276 		size += use;
2277 		orig_offset = pfrag->offset;
2278 		pfrag->offset += use;
2279 
2280 		sge = sg + sg_curr - 1;
2281 		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2282 		    sge->offset + sge->length == orig_offset) {
2283 			sge->length += use;
2284 		} else {
2285 			sge = sg + sg_curr;
2286 			sg_unmark_end(sge);
2287 			sg_set_page(sge, pfrag->page, use, orig_offset);
2288 			get_page(pfrag->page);
2289 			sg_curr++;
2290 
2291 			if (sg_curr == MAX_SKB_FRAGS)
2292 				sg_curr = 0;
2293 
2294 			if (sg_curr == sg_start) {
2295 				rc = -ENOSPC;
2296 				break;
2297 			}
2298 		}
2299 
2300 		len -= use;
2301 	}
2302 out:
2303 	*sg_curr_size = size;
2304 	*sg_curr_index = sg_curr;
2305 	return rc;
2306 }
2307 EXPORT_SYMBOL(sk_alloc_sg);
2308 
2309 static void __lock_sock(struct sock *sk)
2310 	__releases(&sk->sk_lock.slock)
2311 	__acquires(&sk->sk_lock.slock)
2312 {
2313 	DEFINE_WAIT(wait);
2314 
2315 	for (;;) {
2316 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2317 					TASK_UNINTERRUPTIBLE);
2318 		spin_unlock_bh(&sk->sk_lock.slock);
2319 		schedule();
2320 		spin_lock_bh(&sk->sk_lock.slock);
2321 		if (!sock_owned_by_user(sk))
2322 			break;
2323 	}
2324 	finish_wait(&sk->sk_lock.wq, &wait);
2325 }
2326 
2327 void __release_sock(struct sock *sk)
2328 	__releases(&sk->sk_lock.slock)
2329 	__acquires(&sk->sk_lock.slock)
2330 {
2331 	struct sk_buff *skb, *next;
2332 
2333 	while ((skb = sk->sk_backlog.head) != NULL) {
2334 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2335 
2336 		spin_unlock_bh(&sk->sk_lock.slock);
2337 
2338 		do {
2339 			next = skb->next;
2340 			prefetch(next);
2341 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2342 			skb->next = NULL;
2343 			sk_backlog_rcv(sk, skb);
2344 
2345 			cond_resched();
2346 
2347 			skb = next;
2348 		} while (skb != NULL);
2349 
2350 		spin_lock_bh(&sk->sk_lock.slock);
2351 	}
2352 
2353 	/*
2354 	 * Doing the zeroing here guarantees we cannot loop forever
2355 	 * while a wild producer attempts to flood us.
2356 	 */
2357 	sk->sk_backlog.len = 0;
2358 }
2359 
2360 void __sk_flush_backlog(struct sock *sk)
2361 {
2362 	spin_lock_bh(&sk->sk_lock.slock);
2363 	__release_sock(sk);
2364 	spin_unlock_bh(&sk->sk_lock.slock);
2365 }
2366 
2367 /**
2368  * sk_wait_data - wait for data to arrive at sk_receive_queue
2369  * @sk:    sock to wait on
2370  * @timeo: for how long
2371  * @skb:   last skb seen on sk_receive_queue
2372  *
2373  * Now socket state including sk->sk_err is changed only under lock,
2374  * Now socket state, including sk->sk_err, is changed only under the lock,
2375  * hence we may omit checks after joining the wait queue.
2376  * We check the receive queue before schedule() only as an optimization;
2377  */
2378 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2379 {
2380 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2381 	int rc;
2382 
2383 	add_wait_queue(sk_sleep(sk), &wait);
2384 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2385 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2386 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2387 	remove_wait_queue(sk_sleep(sk), &wait);
2388 	return rc;
2389 }
2390 EXPORT_SYMBOL(sk_wait_data);
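
/* Example: typical receive-path usage (sketch). Called with the socket
 * locked; sk_wait_event() drops and re-takes the lock around schedule():
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */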
2391 
2392 /**
2393  *	__sk_mem_raise_allocated - increase memory_allocated
2394  *	@sk: socket
2395  *	@size: memory size to allocate
2396  *	@amt: pages to allocate
2397  *	@kind: allocation type
2398  *
2399  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2400  */
2401 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2402 {
2403 	struct proto *prot = sk->sk_prot;
2404 	long allocated = sk_memory_allocated_add(sk, amt);
2405 	bool charged = true;
2406 
2407 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2408 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2409 		goto suppress_allocation;
2410 
2411 	/* Under limit. */
2412 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2413 		sk_leave_memory_pressure(sk);
2414 		return 1;
2415 	}
2416 
2417 	/* Under pressure. */
2418 	if (allocated > sk_prot_mem_limits(sk, 1))
2419 		sk_enter_memory_pressure(sk);
2420 
2421 	/* Over hard limit. */
2422 	if (allocated > sk_prot_mem_limits(sk, 2))
2423 		goto suppress_allocation;
2424 
2425 	/* guarantee minimum buffer size under pressure */
2426 	if (kind == SK_MEM_RECV) {
2427 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2428 			return 1;
2429 
2430 	} else { /* SK_MEM_SEND */
2431 		int wmem0 = sk_get_wmem0(sk, prot);
2432 
2433 		if (sk->sk_type == SOCK_STREAM) {
2434 			if (sk->sk_wmem_queued < wmem0)
2435 				return 1;
2436 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2437 				return 1;
2438 		}
2439 	}
2440 
2441 	if (sk_has_memory_pressure(sk)) {
2442 		u64 alloc;
2443 
2444 		if (!sk_under_memory_pressure(sk))
2445 			return 1;
2446 		alloc = sk_sockets_allocated_read_positive(sk);
2447 		if (sk_prot_mem_limits(sk, 2) > alloc *
2448 		    sk_mem_pages(sk->sk_wmem_queued +
2449 				 atomic_read(&sk->sk_rmem_alloc) +
2450 				 sk->sk_forward_alloc))
2451 			return 1;
2452 	}
2453 
2454 suppress_allocation:
2455 
2456 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2457 		sk_stream_moderate_sndbuf(sk);
2458 
2459 		/* Fail only if socket is _under_ its sndbuf.
2460 		 * In this case we cannot block, so we have to fail.
2461 		 */
2462 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2463 			return 1;
2464 	}
2465 
2466 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2467 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2468 
2469 	sk_memory_allocated_sub(sk, amt);
2470 
2471 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2472 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2473 
2474 	return 0;
2475 }
2476 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2477 
2478 /**
2479  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2480  *	@sk: socket
2481  *	@size: memory size to allocate
2482  *	@kind: allocation type
2483  *
2484  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2485  *	rmem allocation. This function assumes that protocols which have
2486  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2487  */
2488 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2489 {
2490 	int ret, amt = sk_mem_pages(size);
2491 
2492 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2493 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2494 	if (!ret)
2495 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2496 	return ret;
2497 }
2498 EXPORT_SYMBOL(__sk_mem_schedule);
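
/* Worked example of the quantum arithmetic above (illustrative, assuming
 * SK_MEM_QUANTUM == PAGE_SIZE == 4096): charging size = 6000 bytes gives
 * amt = sk_mem_pages(6000) = 2, so sk_forward_alloc grows by 8192 bytes
 * and memory_allocated by 2 quanta; the 2192 bytes not consumed by this
 * charge stay in sk_forward_alloc for later allocations.
 */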
2499 
2500 /**
2501  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2502  *	@sk: socket
2503  *	@amount: number of quanta
2504  *
2505  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2506  */
2507 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2508 {
2509 	sk_memory_allocated_sub(sk, amount);
2510 
2511 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2512 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2513 
2514 	if (sk_under_memory_pressure(sk) &&
2515 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2516 		sk_leave_memory_pressure(sk);
2517 }
2518 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2519 
2520 /**
2521  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2522  *	@sk: socket
2523  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2524  */
2525 void __sk_mem_reclaim(struct sock *sk, int amount)
2526 {
2527 	amount >>= SK_MEM_QUANTUM_SHIFT;
2528 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2529 	__sk_mem_reduce_allocated(sk, amount);
2530 }
2531 EXPORT_SYMBOL(__sk_mem_reclaim);
2532 
2533 int sk_set_peek_off(struct sock *sk, int val)
2534 {
2535 	sk->sk_peek_off = val;
2536 	return 0;
2537 }
2538 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2539 
2540 /*
2541  * Set of default routines for initialising struct proto_ops when
2542  * the protocol does not support a particular function. In certain
2543  * cases where it makes no sense for a protocol to have a "do nothing"
2544  * function, some default processing is provided.
2545  */
2546 
2547 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2548 {
2549 	return -EOPNOTSUPP;
2550 }
2551 EXPORT_SYMBOL(sock_no_bind);
2552 
2553 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2554 		    int len, int flags)
2555 {
2556 	return -EOPNOTSUPP;
2557 }
2558 EXPORT_SYMBOL(sock_no_connect);
2559 
2560 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2561 {
2562 	return -EOPNOTSUPP;
2563 }
2564 EXPORT_SYMBOL(sock_no_socketpair);
2565 
2566 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2567 		   bool kern)
2568 {
2569 	return -EOPNOTSUPP;
2570 }
2571 EXPORT_SYMBOL(sock_no_accept);
2572 
2573 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2574 		    int peer)
2575 {
2576 	return -EOPNOTSUPP;
2577 }
2578 EXPORT_SYMBOL(sock_no_getname);
2579 
2580 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2581 {
2582 	return -EOPNOTSUPP;
2583 }
2584 EXPORT_SYMBOL(sock_no_ioctl);
2585 
2586 int sock_no_listen(struct socket *sock, int backlog)
2587 {
2588 	return -EOPNOTSUPP;
2589 }
2590 EXPORT_SYMBOL(sock_no_listen);
2591 
2592 int sock_no_shutdown(struct socket *sock, int how)
2593 {
2594 	return -EOPNOTSUPP;
2595 }
2596 EXPORT_SYMBOL(sock_no_shutdown);
2597 
2598 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2599 		    char __user *optval, unsigned int optlen)
2600 {
2601 	return -EOPNOTSUPP;
2602 }
2603 EXPORT_SYMBOL(sock_no_setsockopt);
2604 
2605 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2606 		    char __user *optval, int __user *optlen)
2607 {
2608 	return -EOPNOTSUPP;
2609 }
2610 EXPORT_SYMBOL(sock_no_getsockopt);
2611 
2612 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2613 {
2614 	return -EOPNOTSUPP;
2615 }
2616 EXPORT_SYMBOL(sock_no_sendmsg);
2617 
2618 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2619 {
2620 	return -EOPNOTSUPP;
2621 }
2622 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2623 
2624 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2625 		    int flags)
2626 {
2627 	return -EOPNOTSUPP;
2628 }
2629 EXPORT_SYMBOL(sock_no_recvmsg);
2630 
2631 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2632 {
2633 	/* Mirror missing mmap method error code */
2634 	return -ENODEV;
2635 }
2636 EXPORT_SYMBOL(sock_no_mmap);
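
/* Example: how a protocol typically plugs these stubs into its proto_ops
 * for operations it does not implement (sketch; PF_EXAMPLE and the
 * example_* handlers are hypothetical, field list abridged):
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *	};
 */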
2637 
2638 /*
2639  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2640  * various sock-based usage counts.
2641  */
2642 void __receive_sock(struct file *file)
2643 {
2644 	struct socket *sock;
2645 	int error;
2646 
2647 	/*
2648 	 * The resulting value of "error" is ignored here since we only
2649 	 * need to take action when the file is a socket and testing
2650 	 * "sock" for NULL is sufficient.
2651 	 */
2652 	sock = sock_from_file(file, &error);
2653 	if (sock) {
2654 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2655 		sock_update_classid(&sock->sk->sk_cgrp_data);
2656 	}
2657 }
2658 
2659 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2660 {
2661 	ssize_t res;
2662 	struct msghdr msg = {.msg_flags = flags};
2663 	struct kvec iov;
2664 	char *kaddr = kmap(page);
2665 	iov.iov_base = kaddr + offset;
2666 	iov.iov_len = size;
2667 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2668 	kunmap(page);
2669 	return res;
2670 }
2671 EXPORT_SYMBOL(sock_no_sendpage);
2672 
2673 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2674 				int offset, size_t size, int flags)
2675 {
2676 	ssize_t res;
2677 	struct msghdr msg = {.msg_flags = flags};
2678 	struct kvec iov;
2679 	char *kaddr = kmap(page);
2680 
2681 	iov.iov_base = kaddr + offset;
2682 	iov.iov_len = size;
2683 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2684 	kunmap(page);
2685 	return res;
2686 }
2687 EXPORT_SYMBOL(sock_no_sendpage_locked);
2688 
2689 /*
2690  *	Default Socket Callbacks
2691  */
2692 
2693 static void sock_def_wakeup(struct sock *sk)
2694 {
2695 	struct socket_wq *wq;
2696 
2697 	rcu_read_lock();
2698 	wq = rcu_dereference(sk->sk_wq);
2699 	if (skwq_has_sleeper(wq))
2700 		wake_up_interruptible_all(&wq->wait);
2701 	rcu_read_unlock();
2702 }
2703 
2704 static void sock_def_error_report(struct sock *sk)
2705 {
2706 	struct socket_wq *wq;
2707 
2708 	rcu_read_lock();
2709 	wq = rcu_dereference(sk->sk_wq);
2710 	if (skwq_has_sleeper(wq))
2711 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2712 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2713 	rcu_read_unlock();
2714 }
2715 
2716 static void sock_def_readable(struct sock *sk)
2717 {
2718 	struct socket_wq *wq;
2719 
2720 	rcu_read_lock();
2721 	wq = rcu_dereference(sk->sk_wq);
2722 	if (skwq_has_sleeper(wq))
2723 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2724 						EPOLLRDNORM | EPOLLRDBAND);
2725 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2726 	rcu_read_unlock();
2727 }
2728 
2729 static void sock_def_write_space(struct sock *sk)
2730 {
2731 	struct socket_wq *wq;
2732 
2733 	rcu_read_lock();
2734 
2735 	/* Do not wake up a writer until he can make "significant"
2736 	 * progress.  --DaveM
2737 	 */
2738 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2739 		wq = rcu_dereference(sk->sk_wq);
2740 		if (skwq_has_sleeper(wq))
2741 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2742 						EPOLLWRNORM | EPOLLWRBAND);
2743 
2744 		/* Should agree with poll, otherwise some programs break */
2745 		if (sock_writeable(sk))
2746 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2747 	}
2748 
2749 	rcu_read_unlock();
2750 }
2751 
2752 static void sock_def_destruct(struct sock *sk)
2753 {
2754 }
2755 
2756 void sk_send_sigurg(struct sock *sk)
2757 {
2758 	if (sk->sk_socket && sk->sk_socket->file)
2759 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2760 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2761 }
2762 EXPORT_SYMBOL(sk_send_sigurg);
2763 
2764 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2765 		    unsigned long expires)
2766 {
2767 	if (!mod_timer(timer, expires))
2768 		sock_hold(sk);
2769 }
2770 EXPORT_SYMBOL(sk_reset_timer);
2771 
2772 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2773 {
2774 	if (del_timer(timer))
2775 		__sock_put(sk);
2776 }
2777 EXPORT_SYMBOL(sk_stop_timer);
2778 
2779 void sock_init_data(struct socket *sock, struct sock *sk)
2780 {
2781 	sk_init_common(sk);
2782 	sk->sk_send_head	=	NULL;
2783 
2784 	timer_setup(&sk->sk_timer, NULL, 0);
2785 
2786 	sk->sk_allocation	=	GFP_KERNEL;
2787 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2788 	sk->sk_sndbuf		=	sysctl_wmem_default;
2789 	sk->sk_state		=	TCP_CLOSE;
2790 	sk_set_socket(sk, sock);
2791 
2792 	sock_set_flag(sk, SOCK_ZAPPED);
2793 
2794 	if (sock) {
2795 		sk->sk_type	=	sock->type;
2796 		sk->sk_wq	=	sock->wq;
2797 		sock->sk	=	sk;
2798 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2799 	} else {
2800 		sk->sk_wq	=	NULL;
2801 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2802 	}
2803 
2804 	rwlock_init(&sk->sk_callback_lock);
2805 	if (sk->sk_kern_sock)
2806 		lockdep_set_class_and_name(
2807 			&sk->sk_callback_lock,
2808 			af_kern_callback_keys + sk->sk_family,
2809 			af_family_kern_clock_key_strings[sk->sk_family]);
2810 	else
2811 		lockdep_set_class_and_name(
2812 			&sk->sk_callback_lock,
2813 			af_callback_keys + sk->sk_family,
2814 			af_family_clock_key_strings[sk->sk_family]);
2815 
2816 	sk->sk_state_change	=	sock_def_wakeup;
2817 	sk->sk_data_ready	=	sock_def_readable;
2818 	sk->sk_write_space	=	sock_def_write_space;
2819 	sk->sk_error_report	=	sock_def_error_report;
2820 	sk->sk_destruct		=	sock_def_destruct;
2821 
2822 	sk->sk_frag.page	=	NULL;
2823 	sk->sk_frag.offset	=	0;
2824 	sk->sk_peek_off		=	-1;
2825 
2826 	sk->sk_peer_pid 	=	NULL;
2827 	sk->sk_peer_cred	=	NULL;
2828 	sk->sk_write_pending	=	0;
2829 	sk->sk_rcvlowat		=	1;
2830 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2831 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2832 
2833 	sk->sk_stamp = SK_DEFAULT_STAMP;
2834 #if BITS_PER_LONG==32
2835 	seqlock_init(&sk->sk_stamp_seq);
2836 #endif
2837 	atomic_set(&sk->sk_zckey, 0);
2838 
2839 #ifdef CONFIG_NET_RX_BUSY_POLL
2840 	sk->sk_napi_id		=	0;
2841 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2842 #endif
2843 
2844 	sk->sk_max_pacing_rate = ~0U;
2845 	sk->sk_pacing_rate = ~0U;
2846 	sk->sk_pacing_shift = 10;
2847 	sk->sk_incoming_cpu = -1;
2848 
2849 	sk_rx_queue_clear(sk);
2850 	/*
2851 	 * Before updating sk_refcnt, we must commit prior changes to memory
2852 	 * (Documentation/RCU/rculist_nulls.txt for details)
2853 	 */
2854 	smp_wmb();
2855 	refcount_set(&sk->sk_refcnt, 1);
2856 	atomic_set(&sk->sk_drops, 0);
2857 }
2858 EXPORT_SYMBOL(sock_init_data);
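
/* Example: a protocol can override the default callbacks installed above
 * immediately after sock_init_data() (sketch; the example_* helpers are
 * hypothetical):
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_data_ready	= example_data_ready;
 *	sk->sk_write_space	= example_write_space;
 *	sk->sk_destruct		= example_destruct;
 */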
2859 
2860 void lock_sock_nested(struct sock *sk, int subclass)
2861 {
2862 	might_sleep();
2863 	spin_lock_bh(&sk->sk_lock.slock);
2864 	if (sk->sk_lock.owned)
2865 		__lock_sock(sk);
2866 	sk->sk_lock.owned = 1;
2867 	spin_unlock(&sk->sk_lock.slock);
2868 	/*
2869 	 * The sk_lock has mutex_lock() semantics here:
2870 	 */
2871 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2872 	local_bh_enable();
2873 }
2874 EXPORT_SYMBOL(lock_sock_nested);
2875 
2876 void release_sock(struct sock *sk)
2877 {
2878 	spin_lock_bh(&sk->sk_lock.slock);
2879 	if (sk->sk_backlog.tail)
2880 		__release_sock(sk);
2881 
2882 	/* Warning : release_cb() might need to release sk ownership,
2883 	 * ie call sock_release_ownership(sk) before us.
2884 	 */
2885 	if (sk->sk_prot->release_cb)
2886 		sk->sk_prot->release_cb(sk);
2887 
2888 	sock_release_ownership(sk);
2889 	if (waitqueue_active(&sk->sk_lock.wq))
2890 		wake_up(&sk->sk_lock.wq);
2891 	spin_unlock_bh(&sk->sk_lock.slock);
2892 }
2893 EXPORT_SYMBOL(release_sock);
2894 
2895 /**
2896  * lock_sock_fast - fast version of lock_sock
2897  * @sk: socket
2898  *
2899  * This version should be used for very small sections, where the process won't block.
2900  * return false if fast path is taken:
2901  *
2902  *   sk_lock.slock locked, owned = 0, BH disabled
2903  *
2904  * return true if slow path is taken:
2905  *
2906  *   sk_lock.slock unlocked, owned = 1, BH enabled
2907  */
2908 bool lock_sock_fast(struct sock *sk)
2909 {
2910 	might_sleep();
2911 	spin_lock_bh(&sk->sk_lock.slock);
2912 
2913 	if (!sk->sk_lock.owned)
2914 		/*
2915 		 * Note : We must disable BH
2916 		 */
2917 		return false;
2918 
2919 	__lock_sock(sk);
2920 	sk->sk_lock.owned = 1;
2921 	spin_unlock(&sk->sk_lock.slock);
2922 	/*
2923 	 * The sk_lock has mutex_lock() semantics here:
2924 	 */
2925 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2926 	local_bh_enable();
2927 	return true;
2928 }
2929 EXPORT_SYMBOL(lock_sock_fast);
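
/* Example: canonical pairing with unlock_sock_fast() (sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(short, non-blocking critical section)
 *	unlock_sock_fast(sk, slow);
 */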
2930 
2931 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2932 {
2933 	struct timeval tv;
2934 
2935 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2936 	tv = ktime_to_timeval(sock_read_timestamp(sk));
2937 	if (tv.tv_sec == -1)
2938 		return -ENOENT;
2939 	if (tv.tv_sec == 0) {
2940 		ktime_t kt = ktime_get_real();
2941 		sock_write_timestamp(sk, kt);
2942 		tv = ktime_to_timeval(kt);
2943 	}
2944 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2945 }
2946 EXPORT_SYMBOL(sock_get_timestamp);
2947 
2948 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2949 {
2950 	struct timespec ts;
2951 
2952 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2953 	ts = ktime_to_timespec(sock_read_timestamp(sk));
2954 	if (ts.tv_sec == -1)
2955 		return -ENOENT;
2956 	if (ts.tv_sec == 0) {
2957 		ktime_t kt = ktime_get_real();
2958 		sock_write_timestamp(sk, kt);
2959 		ts = ktime_to_timespec(sk->sk_stamp);
2960 	}
2961 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2962 }
2963 EXPORT_SYMBOL(sock_get_timestampns);
2964 
2965 void sock_enable_timestamp(struct sock *sk, int flag)
2966 {
2967 	if (!sock_flag(sk, flag)) {
2968 		unsigned long previous_flags = sk->sk_flags;
2969 
2970 		sock_set_flag(sk, flag);
2971 		/*
2972 		 * we just set one of the two flags which require net
2973 		 * time stamping, but time stamping might have been on
2974 		 * already because of the other one
2975 		 */
2976 		if (sock_needs_netstamp(sk) &&
2977 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2978 			net_enable_timestamp();
2979 	}
2980 }
2981 
2982 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2983 		       int level, int type)
2984 {
2985 	struct sock_exterr_skb *serr;
2986 	struct sk_buff *skb;
2987 	int copied, err;
2988 
2989 	err = -EAGAIN;
2990 	skb = sock_dequeue_err_skb(sk);
2991 	if (skb == NULL)
2992 		goto out;
2993 
2994 	copied = skb->len;
2995 	if (copied > len) {
2996 		msg->msg_flags |= MSG_TRUNC;
2997 		copied = len;
2998 	}
2999 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3000 	if (err)
3001 		goto out_free_skb;
3002 
3003 	sock_recv_timestamp(msg, sk, skb);
3004 
3005 	serr = SKB_EXT_ERR(skb);
3006 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3007 
3008 	msg->msg_flags |= MSG_ERRQUEUE;
3009 	err = copied;
3010 
3011 out_free_skb:
3012 	kfree_skb(skb);
3013 out:
3014 	return err;
3015 }
3016 EXPORT_SYMBOL(sock_recv_errqueue);
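
/* Example: userspace counterpart (illustrative sketch). An application
 * drains the error queue with MSG_ERRQUEUE and reads the
 * struct sock_extended_err that put_cmsg() attached above:
 *
 *	char cbuf[256];
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0)
 *		(walk CMSG_FIRSTHDR()/CMSG_NXTHDR() to find the error cmsg)
 */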
3017 
3018 /*
3019  *	Get a socket option on a socket.
3020  *
3021  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3022  *	asynchronous errors should be reported by getsockopt. We assume
3023  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3024  */
3025 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3026 			   char __user *optval, int __user *optlen)
3027 {
3028 	struct sock *sk = sock->sk;
3029 
3030 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3031 }
3032 EXPORT_SYMBOL(sock_common_getsockopt);
3033 
3034 #ifdef CONFIG_COMPAT
3035 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3036 				  char __user *optval, int __user *optlen)
3037 {
3038 	struct sock *sk = sock->sk;
3039 
3040 	if (sk->sk_prot->compat_getsockopt != NULL)
3041 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
3042 						      optval, optlen);
3043 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3044 }
3045 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3046 #endif
3047 
3048 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3049 			int flags)
3050 {
3051 	struct sock *sk = sock->sk;
3052 	int addr_len = 0;
3053 	int err;
3054 
3055 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3056 				   flags & ~MSG_DONTWAIT, &addr_len);
3057 	if (err >= 0)
3058 		msg->msg_namelen = addr_len;
3059 	return err;
3060 }
3061 EXPORT_SYMBOL(sock_common_recvmsg);
3062 
3063 /*
3064  *	Set socket options on an inet socket.
3065  */
3066 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3067 			   char __user *optval, unsigned int optlen)
3068 {
3069 	struct sock *sk = sock->sk;
3070 
3071 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3072 }
3073 EXPORT_SYMBOL(sock_common_setsockopt);
3074 
3075 #ifdef CONFIG_COMPAT
3076 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3077 				  char __user *optval, unsigned int optlen)
3078 {
3079 	struct sock *sk = sock->sk;
3080 
3081 	if (sk->sk_prot->compat_setsockopt != NULL)
3082 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3083 						      optval, optlen);
3084 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3085 }
3086 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3087 #endif
3088 
3089 void sk_common_release(struct sock *sk)
3090 {
3091 	if (sk->sk_prot->destroy)
3092 		sk->sk_prot->destroy(sk);
3093 
3094 	/*
3095 	 * Observation: when sk_common_release is called, processes have
3096 	 * no access to the socket, but the network stack still does.
3097 	 * Step one, detach it from networking:
3098 	 *
3099 	 * A. Remove from hash tables.
3100 	 */
3101 
3102 	sk->sk_prot->unhash(sk);
3103 
3104 	/*
3105 	 * At this point the socket cannot receive new packets, but it is possible
3106 	 * that some packets are in flight because some CPU ran the receiver and
3107 	 * did the hash table lookup before we unhashed the socket. They will reach
3108 	 * the receive queue and be purged by the socket destructor.
3109 	 *
3110 	 * Also, we still have packets pending on the receive queue and probably
3111 	 * our own packets waiting in device queues. sock_destroy will drain the
3112 	 * receive queue, but transmitted packets will delay socket destruction
3113 	 * until the last reference is released.
3114 	 */
3115 
3116 	sock_orphan(sk);
3117 
3118 	xfrm_sk_free_policy(sk);
3119 
3120 	sk_refcnt_debug_release(sk);
3121 
3122 	sock_put(sk);
3123 }
3124 EXPORT_SYMBOL(sk_common_release);
3125 
3126 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3127 {
3128 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3129 
3130 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3131 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3132 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3133 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3134 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3135 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3136 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3137 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3138 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3139 }
3140 
3141 #ifdef CONFIG_PROC_FS
3142 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3143 struct prot_inuse {
3144 	int val[PROTO_INUSE_NR];
3145 };
3146 
3147 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3148 
3149 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3150 {
3151 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3152 }
3153 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3154 
3155 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3156 {
3157 	int cpu, idx = prot->inuse_idx;
3158 	int res = 0;
3159 
3160 	for_each_possible_cpu(cpu)
3161 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3162 
3163 	return res >= 0 ? res : 0;
3164 }
3165 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3166 
3167 static void sock_inuse_add(struct net *net, int val)
3168 {
3169 	this_cpu_add(*net->core.sock_inuse, val);
3170 }
3171 
3172 int sock_inuse_get(struct net *net)
3173 {
3174 	int cpu, res = 0;
3175 
3176 	for_each_possible_cpu(cpu)
3177 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3178 
3179 	return res;
3180 }
3181 
3182 EXPORT_SYMBOL_GPL(sock_inuse_get);
3183 
3184 static int __net_init sock_inuse_init_net(struct net *net)
3185 {
3186 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3187 	if (net->core.prot_inuse == NULL)
3188 		return -ENOMEM;
3189 
3190 	net->core.sock_inuse = alloc_percpu(int);
3191 	if (net->core.sock_inuse == NULL)
3192 		goto out;
3193 
3194 	return 0;
3195 
3196 out:
3197 	free_percpu(net->core.prot_inuse);
3198 	return -ENOMEM;
3199 }
3200 
3201 static void __net_exit sock_inuse_exit_net(struct net *net)
3202 {
3203 	free_percpu(net->core.prot_inuse);
3204 	free_percpu(net->core.sock_inuse);
3205 }
3206 
3207 static struct pernet_operations net_inuse_ops = {
3208 	.init = sock_inuse_init_net,
3209 	.exit = sock_inuse_exit_net,
3210 };
3211 
3212 static __init int net_inuse_init(void)
3213 {
3214 	if (register_pernet_subsys(&net_inuse_ops))
3215 		panic("Cannot initialize net inuse counters");
3216 
3217 	return 0;
3218 }
3219 
3220 core_initcall(net_inuse_init);
3221 
3222 static void assign_proto_idx(struct proto *prot)
3223 {
3224 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3225 
3226 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3227 		pr_err("PROTO_INUSE_NR exhausted\n");
3228 		return;
3229 	}
3230 
3231 	set_bit(prot->inuse_idx, proto_inuse_idx);
3232 }
3233 
3234 static void release_proto_idx(struct proto *prot)
3235 {
3236 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3237 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3238 }
3239 #else
3240 static inline void assign_proto_idx(struct proto *prot)
3241 {
3242 }
3243 
3244 static inline void release_proto_idx(struct proto *prot)
3245 {
3246 }
3247 
3248 static void sock_inuse_add(struct net *net, int val)
3249 {
3250 }
3251 #endif
3252 
3253 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3254 {
3255 	if (!rsk_prot)
3256 		return;
3257 	kfree(rsk_prot->slab_name);
3258 	rsk_prot->slab_name = NULL;
3259 	kmem_cache_destroy(rsk_prot->slab);
3260 	rsk_prot->slab = NULL;
3261 }
3262 
3263 static int req_prot_init(const struct proto *prot)
3264 {
3265 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3266 
3267 	if (!rsk_prot)
3268 		return 0;
3269 
3270 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3271 					prot->name);
3272 	if (!rsk_prot->slab_name)
3273 		return -ENOMEM;
3274 
3275 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3276 					   rsk_prot->obj_size, 0,
3277 					   SLAB_ACCOUNT | prot->slab_flags,
3278 					   NULL);
3279 
3280 	if (!rsk_prot->slab) {
3281 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3282 			prot->name);
3283 		return -ENOMEM;
3284 	}
3285 	return 0;
3286 }
3287 
3288 int proto_register(struct proto *prot, int alloc_slab)
3289 {
3290 	if (alloc_slab) {
3291 		prot->slab = kmem_cache_create_usercopy(prot->name,
3292 					prot->obj_size, 0,
3293 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3294 					prot->slab_flags,
3295 					prot->useroffset, prot->usersize,
3296 					NULL);
3297 
3298 		if (prot->slab == NULL) {
3299 			pr_crit("%s: Can't create sock SLAB cache!\n",
3300 				prot->name);
3301 			goto out;
3302 		}
3303 
3304 		if (req_prot_init(prot))
3305 			goto out_free_request_sock_slab;
3306 
3307 		if (prot->twsk_prot != NULL) {
3308 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3309 
3310 			if (prot->twsk_prot->twsk_slab_name == NULL)
3311 				goto out_free_request_sock_slab;
3312 
3313 			prot->twsk_prot->twsk_slab =
3314 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3315 						  prot->twsk_prot->twsk_obj_size,
3316 						  0,
3317 						  SLAB_ACCOUNT |
3318 						  prot->slab_flags,
3319 						  NULL);
3320 			if (prot->twsk_prot->twsk_slab == NULL)
3321 				goto out_free_timewait_sock_slab_name;
3322 		}
3323 	}
3324 
3325 	mutex_lock(&proto_list_mutex);
3326 	list_add(&prot->node, &proto_list);
3327 	assign_proto_idx(prot);
3328 	mutex_unlock(&proto_list_mutex);
3329 	return 0;
3330 
3331 out_free_timewait_sock_slab_name:
3332 	kfree(prot->twsk_prot->twsk_slab_name);
3333 out_free_request_sock_slab:
3334 	req_prot_cleanup(prot->rsk_prot);
3335 
3336 	kmem_cache_destroy(prot->slab);
3337 	prot->slab = NULL;
3338 out:
3339 	return -ENOBUFS;
3340 }
3341 EXPORT_SYMBOL(proto_register);
3342 
3343 void proto_unregister(struct proto *prot)
3344 {
3345 	mutex_lock(&proto_list_mutex);
3346 	release_proto_idx(prot);
3347 	list_del(&prot->node);
3348 	mutex_unlock(&proto_list_mutex);
3349 
3350 	kmem_cache_destroy(prot->slab);
3351 	prot->slab = NULL;
3352 
3353 	req_prot_cleanup(prot->rsk_prot);
3354 
3355 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3356 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3357 		kfree(prot->twsk_prot->twsk_slab_name);
3358 		prot->twsk_prot->twsk_slab = NULL;
3359 	}
3360 }
3361 EXPORT_SYMBOL(proto_unregister);
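
/* Example: minimal registration from a module init (sketch; the example
 * protocol and struct example_sock are hypothetical, error handling
 * abridged):
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 */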
3362 
3363 int sock_load_diag_module(int family, int protocol)
3364 {
3365 	if (!protocol) {
3366 		if (!sock_is_registered(family))
3367 			return -ENOENT;
3368 
3369 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3370 				      NETLINK_SOCK_DIAG, family);
3371 	}
3372 
3373 #ifdef CONFIG_INET
3374 	if (family == AF_INET &&
3375 	    protocol != IPPROTO_RAW &&
3376 	    !rcu_access_pointer(inet_protos[protocol]))
3377 		return -ENOENT;
3378 #endif
3379 
3380 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3381 			      NETLINK_SOCK_DIAG, family, protocol);
3382 }
3383 EXPORT_SYMBOL(sock_load_diag_module);
3384 
3385 #ifdef CONFIG_PROC_FS
3386 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3387 	__acquires(proto_list_mutex)
3388 {
3389 	mutex_lock(&proto_list_mutex);
3390 	return seq_list_start_head(&proto_list, *pos);
3391 }
3392 
3393 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3394 {
3395 	return seq_list_next(v, &proto_list, pos);
3396 }
3397 
3398 static void proto_seq_stop(struct seq_file *seq, void *v)
3399 	__releases(proto_list_mutex)
3400 {
3401 	mutex_unlock(&proto_list_mutex);
3402 }
3403 
3404 static char proto_method_implemented(const void *method)
3405 {
3406 	return method == NULL ? 'n' : 'y';
3407 }
3408 static long sock_prot_memory_allocated(struct proto *proto)
3409 {
3410 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3411 }
3412 
3413 static char *sock_prot_memory_pressure(struct proto *proto)
3414 {
3415 	return proto->memory_pressure != NULL ?
3416 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3417 }
3418 
3419 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3420 {
3421 
3422 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3423 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3424 		   proto->name,
3425 		   proto->obj_size,
3426 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3427 		   sock_prot_memory_allocated(proto),
3428 		   sock_prot_memory_pressure(proto),
3429 		   proto->max_header,
3430 		   proto->slab == NULL ? "no" : "yes",
3431 		   module_name(proto->owner),
3432 		   proto_method_implemented(proto->close),
3433 		   proto_method_implemented(proto->connect),
3434 		   proto_method_implemented(proto->disconnect),
3435 		   proto_method_implemented(proto->accept),
3436 		   proto_method_implemented(proto->ioctl),
3437 		   proto_method_implemented(proto->init),
3438 		   proto_method_implemented(proto->destroy),
3439 		   proto_method_implemented(proto->shutdown),
3440 		   proto_method_implemented(proto->setsockopt),
3441 		   proto_method_implemented(proto->getsockopt),
3442 		   proto_method_implemented(proto->sendmsg),
3443 		   proto_method_implemented(proto->recvmsg),
3444 		   proto_method_implemented(proto->sendpage),
3445 		   proto_method_implemented(proto->bind),
3446 		   proto_method_implemented(proto->backlog_rcv),
3447 		   proto_method_implemented(proto->hash),
3448 		   proto_method_implemented(proto->unhash),
3449 		   proto_method_implemented(proto->get_port),
3450 		   proto_method_implemented(proto->enter_memory_pressure));
3451 }
3452 
3453 static int proto_seq_show(struct seq_file *seq, void *v)
3454 {
3455 	if (v == &proto_list)
3456 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3457 			   "protocol",
3458 			   "size",
3459 			   "sockets",
3460 			   "memory",
3461 			   "press",
3462 			   "maxhdr",
3463 			   "slab",
3464 			   "module",
3465 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3466 	else
3467 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3468 	return 0;
3469 }
3470 
3471 static const struct seq_operations proto_seq_ops = {
3472 	.start  = proto_seq_start,
3473 	.next   = proto_seq_next,
3474 	.stop   = proto_seq_stop,
3475 	.show   = proto_seq_show,
3476 };
3477 
3478 static __net_init int proto_init_net(struct net *net)
3479 {
3480 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3481 			sizeof(struct seq_net_private)))
3482 		return -ENOMEM;
3483 
3484 	return 0;
3485 }
3486 
3487 static __net_exit void proto_exit_net(struct net *net)
3488 {
3489 	remove_proc_entry("protocols", net->proc_net);
3490 }
3491 
3492 
3493 static __net_initdata struct pernet_operations proto_net_ops = {
3494 	.init = proto_init_net,
3495 	.exit = proto_exit_net,
3496 };
3497 
3498 static int __init proto_init(void)
3499 {
3500 	return register_pernet_subsys(&proto_net_ops);
3501 }
3502 
3503 subsys_initcall(proto_init);
3504 
3505 #endif /* PROC_FS */
3506 
3507 #ifdef CONFIG_NET_RX_BUSY_POLL
3508 bool sk_busy_loop_end(void *p, unsigned long start_time)
3509 {
3510 	struct sock *sk = p;
3511 
3512 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3513 	       sk_busy_loop_timeout(sk, start_time);
3514 }
3515 EXPORT_SYMBOL(sk_busy_loop_end);
3516 #endif /* CONFIG_NET_RX_BUSY_POLL */
3517