1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 #include <trace/hooks/sched.h>
139 
140 #include <net/tcp.h>
141 #include <net/busy_poll.h>
142 
143 static DEFINE_MUTEX(proto_list_mutex);
144 static LIST_HEAD(proto_list);
145 
146 static void sock_inuse_add(struct net *net, int val);
147 
148 /**
149  * sk_ns_capable - General socket capability test
150  * @sk: Socket to use a capability on or through
151  * @user_ns: The user namespace of the capability to use
152  * @cap: The capability to use
153  *
154  * Test to see if the opener of the socket had the capability @cap when
155  * the socket was created and the current process has the capability @cap
156  * in the user namespace @user_ns.
157  */
158 bool sk_ns_capable(const struct sock *sk,
159 		   struct user_namespace *user_ns, int cap)
160 {
161 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 		ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165 
166 /**
167  * sk_capable - Socket global capability test
168  * @sk: Socket to use a capability on or through
169  * @cap: The global capability to use
170  *
171  * Test to see if the opener of the socket had the capability @cap when
172  * the socket was created and the current process has the capability @cap
173  * in all user namespaces.
174  */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177 	return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180 
181 /**
182  * sk_net_capable - Network namespace socket capability test
183  * @sk: Socket to use a capability on or through
184  * @cap: The capability to use
185  *
186  * Test to see if the opener of the socket had the capability @cap when the
187  * socket was created and the current process has the capability @cap over
188  * the network namespace the socket is a member of.
189  */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
195 
196 /*
197  * Each address family might have different locking rules, so we have
198  * one slock key per address family and separate keys for internal and
199  * userspace sockets.
200  */
201 static struct lock_class_key af_family_keys[AF_MAX];
202 static struct lock_class_key af_family_kern_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205 
206 /*
207  * Make lock validator output more readable. (We pre-construct these
208  * strings at build time, so that runtime initialization of socket
209  * locks is fast):
210  */
211 
212 #define _sock_locks(x)						  \
213   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
214   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
215   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
216   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
217   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
218   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
219   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
220   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
221   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
222   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
223   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
224   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
225   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
226   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
227   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
228   x "AF_MAX"
229 
230 static const char *const af_family_key_strings[AF_MAX+1] = {
231 	_sock_locks("sk_lock-")
232 };
233 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 	_sock_locks("slock-")
235 };
236 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 	_sock_locks("clock-")
238 };
239 
240 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 	_sock_locks("k-sk_lock-")
242 };
243 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 	_sock_locks("k-slock-")
245 };
246 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 	_sock_locks("k-clock-")
248 };
249 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 	_sock_locks("rlock-")
251 };
252 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 	_sock_locks("wlock-")
254 };
255 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 	_sock_locks("elock-")
257 };
258 
259 /*
260  * sk_callback_lock and sk queues locking rules are per-address-family,
261  * so split the lock classes by using a per-AF key:
262  */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264 static struct lock_class_key af_rlock_keys[AF_MAX];
265 static struct lock_class_key af_wlock_keys[AF_MAX];
266 static struct lock_class_key af_elock_keys[AF_MAX];
267 static struct lock_class_key af_kern_callback_keys[AF_MAX];
268 
269 /* Run time adjustable parameters. */
270 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271 EXPORT_SYMBOL(sysctl_wmem_max);
272 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273 EXPORT_SYMBOL(sysctl_rmem_max);
274 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276 
277 /* Maximal space eaten by iovec or ancillary data plus some space */
278 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279 EXPORT_SYMBOL(sysctl_optmem_max);
280 
281 int sysctl_tstamp_allow_data __read_mostly = 1;
282 
283 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284 EXPORT_SYMBOL_GPL(memalloc_socks_key);
285 
286 /**
287  * sk_set_memalloc - sets %SOCK_MEMALLOC
288  * @sk: socket to set it on
289  *
290  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291  * It's the responsibility of the admin to adjust min_free_kbytes
292  * to meet the requirements.
293  */
294 void sk_set_memalloc(struct sock *sk)
295 {
296 	sock_set_flag(sk, SOCK_MEMALLOC);
297 	sk->sk_allocation |= __GFP_MEMALLOC;
298 	static_branch_inc(&memalloc_socks_key);
299 }
300 EXPORT_SYMBOL_GPL(sk_set_memalloc);
301 
302 void sk_clear_memalloc(struct sock *sk)
303 {
304 	sock_reset_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation &= ~__GFP_MEMALLOC;
306 	static_branch_dec(&memalloc_socks_key);
307 
308 	/*
309 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 	 * it has rmem allocations due to the last swapfile being deactivated
312 	 * but there is a risk that the socket is unusable due to exceeding
313 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 	 */
315 	sk_mem_reclaim(sk);
316 }
317 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
318 
319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320 {
321 	int ret;
322 	unsigned int noreclaim_flag;
323 
324 	/* these should have been dropped before queueing */
325 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326 
327 	noreclaim_flag = memalloc_noreclaim_save();
328 	ret = sk->sk_backlog_rcv(sk, skb);
329 	memalloc_noreclaim_restore(noreclaim_flag);
330 
331 	return ret;
332 }
333 EXPORT_SYMBOL(__sk_backlog_rcv);
334 
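/* Convert a jiffies-based socket timeout into the timeval layout expected by
 * the getsockopt() caller (compat 32-bit, old, or 64-bit) and return the
 * number of bytes written to @optval.
 */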
335 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
336 {
337 	struct __kernel_sock_timeval tv;
338 
339 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
340 		tv.tv_sec = 0;
341 		tv.tv_usec = 0;
342 	} else {
343 		tv.tv_sec = timeo / HZ;
344 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
345 	}
346 
347 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
348 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
349 		*(struct old_timeval32 *)optval = tv32;
350 		return sizeof(tv32);
351 	}
352 
353 	if (old_timeval) {
354 		struct __kernel_old_timeval old_tv;
355 		old_tv.tv_sec = tv.tv_sec;
356 		old_tv.tv_usec = tv.tv_usec;
357 		*(struct __kernel_old_timeval *)optval = old_tv;
358 		return sizeof(old_tv);
359 	}
360 
361 	*(struct __kernel_sock_timeval *)optval = tv;
362 	return sizeof(tv);
363 }
364 
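/* Counterpart of sock_get_timeout() for setsockopt(): parse a user-supplied
 * timeval in any of the three layouts and convert it to jiffies.  A zero
 * timeval disables the timeout (MAX_SCHEDULE_TIMEOUT); a negative tv_sec is
 * treated as an immediate timeout and triggers a rate-limited warning.
 */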
365 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
366 			    bool old_timeval)
367 {
368 	struct __kernel_sock_timeval tv;
369 
370 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
371 		struct old_timeval32 tv32;
372 
373 		if (optlen < sizeof(tv32))
374 			return -EINVAL;
375 
376 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
377 			return -EFAULT;
378 		tv.tv_sec = tv32.tv_sec;
379 		tv.tv_usec = tv32.tv_usec;
380 	} else if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 
383 		if (optlen < sizeof(old_tv))
384 			return -EINVAL;
385 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
386 			return -EFAULT;
387 		tv.tv_sec = old_tv.tv_sec;
388 		tv.tv_usec = old_tv.tv_usec;
389 	} else {
390 		if (optlen < sizeof(tv))
391 			return -EINVAL;
392 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
393 			return -EFAULT;
394 	}
395 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
396 		return -EDOM;
397 
398 	if (tv.tv_sec < 0) {
399 		static int warned __read_mostly;
400 
401 		*timeo_p = 0;
402 		if (warned < 10 && net_ratelimit()) {
403 			warned++;
404 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
405 				__func__, current->comm, task_pid_nr(current));
406 		}
407 		return 0;
408 	}
409 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
410 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
411 		return 0;
412 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
413 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
414 	return 0;
415 }
416 
417 static bool sock_needs_netstamp(const struct sock *sk)
418 {
419 	switch (sk->sk_family) {
420 	case AF_UNSPEC:
421 	case AF_UNIX:
422 		return false;
423 	default:
424 		return true;
425 	}
426 }
427 
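/* Clear the timestamp flags in @flags on @sk and, when no timestamping flags
 * remain on a socket that needs netstamp accounting, call
 * net_disable_timestamp() to drop the global reference.
 */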
428 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
429 {
430 	if (sk->sk_flags & flags) {
431 		sk->sk_flags &= ~flags;
432 		if (sock_needs_netstamp(sk) &&
433 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
434 			net_disable_timestamp();
435 	}
436 }
437 
438 
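/* Charge @skb against @sk's receive buffer and append it to
 * sk_receive_queue, calling sk_data_ready() unless the socket is dead.
 * Returns -ENOMEM when sk_rmem_alloc already exceeds sk_rcvbuf and
 * -ENOBUFS when memory scheduling fails.
 */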
439 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
440 {
441 	unsigned long flags;
442 	struct sk_buff_head *list = &sk->sk_receive_queue;
443 
444 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
445 		atomic_inc(&sk->sk_drops);
446 		trace_sock_rcvqueue_full(sk, skb);
447 		return -ENOMEM;
448 	}
449 
450 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
451 		atomic_inc(&sk->sk_drops);
452 		return -ENOBUFS;
453 	}
454 
455 	skb->dev = NULL;
456 	skb_set_owner_r(skb, sk);
457 
458 	/* We escape from the RCU-protected region, make sure we don't leak
459 	 * a non-refcounted dst.
460 	 */
461 	skb_dst_force(skb);
462 
463 	spin_lock_irqsave(&list->lock, flags);
464 	sock_skb_set_dropcount(sk, skb);
465 	__skb_queue_tail(list, skb);
466 	spin_unlock_irqrestore(&list->lock, flags);
467 
468 	if (!sock_flag(sk, SOCK_DEAD))
469 		sk->sk_data_ready(sk);
470 	return 0;
471 }
472 EXPORT_SYMBOL(__sock_queue_rcv_skb);
473 
474 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
475 {
476 	int err;
477 
478 	err = sk_filter(sk, skb);
479 	if (err)
480 		return err;
481 
482 	return __sock_queue_rcv_skb(sk, skb);
483 }
484 EXPORT_SYMBOL(sock_queue_rcv_skb);
485 
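/* Deliver @skb to @sk from softirq context: run the socket filter (without
 * letting it trim the packet below @trim_cap bytes), then either process it
 * directly via sk_backlog_rcv() or, if the socket is owned by a user
 * context, park it on the backlog queue.
 */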
486 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
487 		     const int nested, unsigned int trim_cap, bool refcounted)
488 {
489 	int rc = NET_RX_SUCCESS;
490 
491 	if (sk_filter_trim_cap(sk, skb, trim_cap))
492 		goto discard_and_relse;
493 
494 	skb->dev = NULL;
495 
496 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
497 		atomic_inc(&sk->sk_drops);
498 		goto discard_and_relse;
499 	}
500 	if (nested)
501 		bh_lock_sock_nested(sk);
502 	else
503 		bh_lock_sock(sk);
504 	if (!sock_owned_by_user(sk)) {
505 		/*
506 		 * trylock + unlock semantics:
507 		 */
508 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
509 
510 		rc = sk_backlog_rcv(sk, skb);
511 
512 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
513 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
514 		bh_unlock_sock(sk);
515 		atomic_inc(&sk->sk_drops);
516 		goto discard_and_relse;
517 	}
518 
519 	bh_unlock_sock(sk);
520 out:
521 	if (refcounted)
522 		sock_put(sk);
523 	return rc;
524 discard_and_relse:
525 	kfree_skb(skb);
526 	goto out;
527 }
528 EXPORT_SYMBOL(__sk_receive_skb);
529 
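/* __sk_dst_check() and sk_dst_check() below validate the socket's cached
 * route: if the dst is marked obsolete and its ->check() callback rejects
 * @cookie, the cache entry is dropped and NULL is returned so the caller
 * can re-route.
 */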
530 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
531 {
532 	struct dst_entry *dst = __sk_dst_get(sk);
533 
534 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
535 		sk_tx_queue_clear(sk);
536 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
537 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
538 		dst_release(dst);
539 		return NULL;
540 	}
541 
542 	return dst;
543 }
544 EXPORT_SYMBOL(__sk_dst_check);
545 
546 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
547 {
548 	struct dst_entry *dst = sk_dst_get(sk);
549 
550 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
551 		sk_dst_reset(sk);
552 		dst_release(dst);
553 		return NULL;
554 	}
555 
556 	return dst;
557 }
558 EXPORT_SYMBOL(sk_dst_check);
559 
560 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
561 {
562 	int ret = -ENOPROTOOPT;
563 #ifdef CONFIG_NETDEVICES
564 	struct net *net = sock_net(sk);
565 
566 	/* Sorry... */
567 	ret = -EPERM;
568 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
569 		goto out;
570 
571 	ret = -EINVAL;
572 	if (ifindex < 0)
573 		goto out;
574 
575 	sk->sk_bound_dev_if = ifindex;
576 	if (sk->sk_prot->rehash)
577 		sk->sk_prot->rehash(sk);
578 	sk_dst_reset(sk);
579 
580 	ret = 0;
581 
582 out:
583 #endif
584 
585 	return ret;
586 }
587 
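/* Bind @sk to the interface with index @ifindex.  Pass @lock_sk as true
 * unless the caller already holds the socket lock.
 */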
588 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
589 {
590 	int ret;
591 
592 	if (lock_sk)
593 		lock_sock(sk);
594 	ret = sock_bindtoindex_locked(sk, ifindex);
595 	if (lock_sk)
596 		release_sock(sk);
597 
598 	return ret;
599 }
600 EXPORT_SYMBOL(sock_bindtoindex);
601 
602 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
603 {
604 	int ret = -ENOPROTOOPT;
605 #ifdef CONFIG_NETDEVICES
606 	struct net *net = sock_net(sk);
607 	char devname[IFNAMSIZ];
608 	int index;
609 
610 	ret = -EINVAL;
611 	if (optlen < 0)
612 		goto out;
613 
614 	/* Bind this socket to a particular device like "eth0",
615 	 * as specified in the passed interface name. If the
616 	 * name is "" or the option length is zero the socket
617 	 * is not bound.
618 	 */
619 	if (optlen > IFNAMSIZ - 1)
620 		optlen = IFNAMSIZ - 1;
621 	memset(devname, 0, sizeof(devname));
622 
623 	ret = -EFAULT;
624 	if (copy_from_sockptr(devname, optval, optlen))
625 		goto out;
626 
627 	index = 0;
628 	if (devname[0] != '\0') {
629 		struct net_device *dev;
630 
631 		rcu_read_lock();
632 		dev = dev_get_by_name_rcu(net, devname);
633 		if (dev)
634 			index = dev->ifindex;
635 		rcu_read_unlock();
636 		ret = -ENODEV;
637 		if (!dev)
638 			goto out;
639 	}
640 
641 	return sock_bindtoindex(sk, index, true);
642 out:
643 #endif
644 
645 	return ret;
646 }
647 
648 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
649 				sockptr_t optlen, int len)
650 {
651 	int ret = -ENOPROTOOPT;
652 #ifdef CONFIG_NETDEVICES
653 	struct net *net = sock_net(sk);
654 	char devname[IFNAMSIZ];
655 
656 	if (sk->sk_bound_dev_if == 0) {
657 		len = 0;
658 		goto zero;
659 	}
660 
661 	ret = -EINVAL;
662 	if (len < IFNAMSIZ)
663 		goto out;
664 
665 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
666 	if (ret)
667 		goto out;
668 
669 	len = strlen(devname) + 1;
670 
671 	ret = -EFAULT;
672 	if (copy_to_sockptr(optval, devname, len))
673 		goto out;
674 
675 zero:
676 	ret = -EFAULT;
677 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
678 		goto out;
679 
680 	ret = 0;
681 
682 out:
683 #endif
684 
685 	return ret;
686 }
687 
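/* Report whether multicast packets sent through @sk should be looped back
 * locally, based on the per-family mc_loop setting.  Returns false when
 * called from a nested transmit (dev_recursion_level()) and defaults to
 * true when in doubt.
 */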
688 bool sk_mc_loop(struct sock *sk)
689 {
690 	if (dev_recursion_level())
691 		return false;
692 	if (!sk)
693 		return true;
694 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
695 	switch (READ_ONCE(sk->sk_family)) {
696 	case AF_INET:
697 		return inet_sk(sk)->mc_loop;
698 #if IS_ENABLED(CONFIG_IPV6)
699 	case AF_INET6:
700 		return inet6_sk(sk)->mc_loop;
701 #endif
702 	}
703 	WARN_ON_ONCE(1);
704 	return true;
705 }
706 EXPORT_SYMBOL(sk_mc_loop);
707 
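/* The sock_set_*() / sock_no_*() helpers below let in-kernel socket users
 * flip common SO_* options without going through sock_setsockopt().
 * Illustrative kernel-side usage (not taken from this file):
 *
 *	struct socket *sock;
 *
 *	sock_create_kern(&init_net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	sock_set_reuseaddr(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 */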
708 void sock_set_reuseaddr(struct sock *sk)
709 {
710 	lock_sock(sk);
711 	sk->sk_reuse = SK_CAN_REUSE;
712 	release_sock(sk);
713 }
714 EXPORT_SYMBOL(sock_set_reuseaddr);
715 
716 void sock_set_reuseport(struct sock *sk)
717 {
718 	lock_sock(sk);
719 	sk->sk_reuseport = true;
720 	release_sock(sk);
721 }
722 EXPORT_SYMBOL(sock_set_reuseport);
723 
724 void sock_no_linger(struct sock *sk)
725 {
726 	lock_sock(sk);
727 	sk->sk_lingertime = 0;
728 	sock_set_flag(sk, SOCK_LINGER);
729 	release_sock(sk);
730 }
731 EXPORT_SYMBOL(sock_no_linger);
732 
733 void sock_set_priority(struct sock *sk, u32 priority)
734 {
735 	lock_sock(sk);
736 	sk->sk_priority = priority;
737 	release_sock(sk);
738 }
739 EXPORT_SYMBOL(sock_set_priority);
740 
741 void sock_set_sndtimeo(struct sock *sk, s64 secs)
742 {
743 	lock_sock(sk);
744 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
745 		sk->sk_sndtimeo = secs * HZ;
746 	else
747 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
748 	release_sock(sk);
749 }
750 EXPORT_SYMBOL(sock_set_sndtimeo);
751 
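/* Common implementation behind the SO_TIMESTAMP{,NS}_{OLD,NEW} options and
 * sock_enable_timestamps(): @new selects the new 64-bit ABI, @ns selects
 * nanosecond resolution.
 */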
752 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
753 {
754 	if (val)  {
755 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
756 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
757 		sock_set_flag(sk, SOCK_RCVTSTAMP);
758 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
759 	} else {
760 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
761 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
762 	}
763 }
764 
765 void sock_enable_timestamps(struct sock *sk)
766 {
767 	lock_sock(sk);
768 	__sock_set_timestamps(sk, true, false, true);
769 	release_sock(sk);
770 }
771 EXPORT_SYMBOL(sock_enable_timestamps);
772 
773 void sock_set_keepalive(struct sock *sk)
774 {
775 	lock_sock(sk);
776 	if (sk->sk_prot->keepalive)
777 		sk->sk_prot->keepalive(sk, true);
778 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
779 	release_sock(sk);
780 }
781 EXPORT_SYMBOL(sock_set_keepalive);
782 
783 static void __sock_set_rcvbuf(struct sock *sk, int val)
784 {
785 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
786 	 * as a negative value.
787 	 */
788 	val = min_t(int, val, INT_MAX / 2);
789 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
790 
791 	/* We double it on the way in to account for "struct sk_buff" etc.
792 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
793 	 * will allow that much actual data to be received on that socket.
794 	 *
795 	 * Applications are unaware that "struct sk_buff" and other overheads
796 	 * allocate from the receive buffer during socket buffer allocation.
797 	 *
798 	 * And after considering the possible alternatives, returning the value
799 	 * we actually used in getsockopt is the most desirable behavior.
800 	 */
801 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
802 }
803 
804 void sock_set_rcvbuf(struct sock *sk, int val)
805 {
806 	lock_sock(sk);
807 	__sock_set_rcvbuf(sk, val);
808 	release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_set_rcvbuf);
811 
812 static void __sock_set_mark(struct sock *sk, u32 val)
813 {
814 	if (val != sk->sk_mark) {
815 		sk->sk_mark = val;
816 		sk_dst_reset(sk);
817 	}
818 }
819 
820 void sock_set_mark(struct sock *sk, u32 val)
821 {
822 	lock_sock(sk);
823 	__sock_set_mark(sk, val);
824 	release_sock(sk);
825 }
826 EXPORT_SYMBOL(sock_set_mark);
827 
828 /*
829  *	This is meant for all protocols to use and covers goings on
830  *	at the socket level. Everything here is generic.
831  */
832 
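/* Illustrative userspace counterpart of the option handling below (not part
 * of this file):
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 1 << 20 }, sizeof(int));
 */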
833 int sock_setsockopt(struct socket *sock, int level, int optname,
834 		    sockptr_t optval, unsigned int optlen)
835 {
836 	struct sock_txtime sk_txtime;
837 	struct sock *sk = sock->sk;
838 	int val;
839 	int valbool;
840 	struct linger ling;
841 	int ret = 0;
842 
843 	/*
844 	 *	Options without arguments
845 	 */
846 
847 	if (optname == SO_BINDTODEVICE)
848 		return sock_setbindtodevice(sk, optval, optlen);
849 
850 	if (optlen < sizeof(int))
851 		return -EINVAL;
852 
853 	if (copy_from_sockptr(&val, optval, sizeof(val)))
854 		return -EFAULT;
855 
856 	valbool = val ? 1 : 0;
857 
858 	lock_sock(sk);
859 
860 	switch (optname) {
861 	case SO_DEBUG:
862 		if (val && !capable(CAP_NET_ADMIN))
863 			ret = -EACCES;
864 		else
865 			sock_valbool_flag(sk, SOCK_DBG, valbool);
866 		break;
867 	case SO_REUSEADDR:
868 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
869 		break;
870 	case SO_REUSEPORT:
871 		sk->sk_reuseport = valbool;
872 		break;
873 	case SO_TYPE:
874 	case SO_PROTOCOL:
875 	case SO_DOMAIN:
876 	case SO_ERROR:
877 		ret = -ENOPROTOOPT;
878 		break;
879 	case SO_DONTROUTE:
880 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
881 		sk_dst_reset(sk);
882 		break;
883 	case SO_BROADCAST:
884 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
885 		break;
886 	case SO_SNDBUF:
887 		/* Don't error on this; BSD doesn't, and if you think
888 		 * about it this is right. Otherwise apps have to
889 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
890 		 * are treated in BSD as hints.
891 		 */
892 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
893 set_sndbuf:
894 		/* Ensure val * 2 fits into an int, to prevent max_t()
895 		 * from treating it as a negative value.
896 		 */
897 		val = min_t(int, val, INT_MAX / 2);
898 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
899 		WRITE_ONCE(sk->sk_sndbuf,
900 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
901 		/* Wake up sending tasks if we upped the value. */
902 		sk->sk_write_space(sk);
903 		break;
904 
905 	case SO_SNDBUFFORCE:
906 		if (!capable(CAP_NET_ADMIN)) {
907 			ret = -EPERM;
908 			break;
909 		}
910 
911 		/* No negative values (to prevent underflow, as val will be
912 		 * multiplied by 2).
913 		 */
914 		if (val < 0)
915 			val = 0;
916 		goto set_sndbuf;
917 
918 	case SO_RCVBUF:
919 		/* Don't error on this; BSD doesn't, and if you think
920 		 * about it this is right. Otherwise apps have to
921 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
922 		 * are treated in BSD as hints.
923 		 */
924 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
925 		break;
926 
927 	case SO_RCVBUFFORCE:
928 		if (!capable(CAP_NET_ADMIN)) {
929 			ret = -EPERM;
930 			break;
931 		}
932 
933 		/* No negative values (to prevent underflow, as val will be
934 		 * multiplied by 2).
935 		 */
936 		__sock_set_rcvbuf(sk, max(val, 0));
937 		break;
938 
939 	case SO_KEEPALIVE:
940 		if (sk->sk_prot->keepalive)
941 			sk->sk_prot->keepalive(sk, valbool);
942 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
943 		break;
944 
945 	case SO_OOBINLINE:
946 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
947 		break;
948 
949 	case SO_NO_CHECK:
950 		sk->sk_no_check_tx = valbool;
951 		break;
952 
953 	case SO_PRIORITY:
954 		if ((val >= 0 && val <= 6) ||
955 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
956 			sk->sk_priority = val;
957 		else
958 			ret = -EPERM;
959 		break;
960 
961 	case SO_LINGER:
962 		if (optlen < sizeof(ling)) {
963 			ret = -EINVAL;	/* 1003.1g */
964 			break;
965 		}
966 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
967 			ret = -EFAULT;
968 			break;
969 		}
970 		if (!ling.l_onoff)
971 			sock_reset_flag(sk, SOCK_LINGER);
972 		else {
973 #if (BITS_PER_LONG == 32)
974 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
975 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
976 			else
977 #endif
978 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
979 			sock_set_flag(sk, SOCK_LINGER);
980 		}
981 		break;
982 
983 	case SO_BSDCOMPAT:
984 		break;
985 
986 	case SO_PASSCRED:
987 		if (valbool)
988 			set_bit(SOCK_PASSCRED, &sock->flags);
989 		else
990 			clear_bit(SOCK_PASSCRED, &sock->flags);
991 		break;
992 
993 	case SO_TIMESTAMP_OLD:
994 		__sock_set_timestamps(sk, valbool, false, false);
995 		break;
996 	case SO_TIMESTAMP_NEW:
997 		__sock_set_timestamps(sk, valbool, true, false);
998 		break;
999 	case SO_TIMESTAMPNS_OLD:
1000 		__sock_set_timestamps(sk, valbool, false, true);
1001 		break;
1002 	case SO_TIMESTAMPNS_NEW:
1003 		__sock_set_timestamps(sk, valbool, true, true);
1004 		break;
1005 	case SO_TIMESTAMPING_NEW:
1006 	case SO_TIMESTAMPING_OLD:
1007 		if (val & ~SOF_TIMESTAMPING_MASK) {
1008 			ret = -EINVAL;
1009 			break;
1010 		}
1011 
1012 		if (val & SOF_TIMESTAMPING_OPT_ID &&
1013 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1014 			if (sk->sk_protocol == IPPROTO_TCP &&
1015 			    sk->sk_type == SOCK_STREAM) {
1016 				if ((1 << sk->sk_state) &
1017 				    (TCPF_CLOSE | TCPF_LISTEN)) {
1018 					ret = -EINVAL;
1019 					break;
1020 				}
1021 				sk->sk_tskey = tcp_sk(sk)->snd_una;
1022 			} else {
1023 				sk->sk_tskey = 0;
1024 			}
1025 		}
1026 
1027 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1028 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1029 			ret = -EINVAL;
1030 			break;
1031 		}
1032 
1033 		sk->sk_tsflags = val;
1034 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1035 
1036 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1037 			sock_enable_timestamp(sk,
1038 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1039 		else
1040 			sock_disable_timestamp(sk,
1041 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1042 		break;
1043 
1044 	case SO_RCVLOWAT:
1045 		if (val < 0)
1046 			val = INT_MAX;
1047 		if (sock->ops->set_rcvlowat)
1048 			ret = sock->ops->set_rcvlowat(sk, val);
1049 		else
1050 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1051 		break;
1052 
1053 	case SO_RCVTIMEO_OLD:
1054 	case SO_RCVTIMEO_NEW:
1055 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1056 				       optlen, optname == SO_RCVTIMEO_OLD);
1057 		break;
1058 
1059 	case SO_SNDTIMEO_OLD:
1060 	case SO_SNDTIMEO_NEW:
1061 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1062 				       optlen, optname == SO_SNDTIMEO_OLD);
1063 		break;
1064 
1065 	case SO_ATTACH_FILTER: {
1066 		struct sock_fprog fprog;
1067 
1068 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1069 		if (!ret)
1070 			ret = sk_attach_filter(&fprog, sk);
1071 		break;
1072 	}
1073 	case SO_ATTACH_BPF:
1074 		ret = -EINVAL;
1075 		if (optlen == sizeof(u32)) {
1076 			u32 ufd;
1077 
1078 			ret = -EFAULT;
1079 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1080 				break;
1081 
1082 			ret = sk_attach_bpf(ufd, sk);
1083 		}
1084 		break;
1085 
1086 	case SO_ATTACH_REUSEPORT_CBPF: {
1087 		struct sock_fprog fprog;
1088 
1089 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1090 		if (!ret)
1091 			ret = sk_reuseport_attach_filter(&fprog, sk);
1092 		break;
1093 	}
1094 	case SO_ATTACH_REUSEPORT_EBPF:
1095 		ret = -EINVAL;
1096 		if (optlen == sizeof(u32)) {
1097 			u32 ufd;
1098 
1099 			ret = -EFAULT;
1100 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1101 				break;
1102 
1103 			ret = sk_reuseport_attach_bpf(ufd, sk);
1104 		}
1105 		break;
1106 
1107 	case SO_DETACH_REUSEPORT_BPF:
1108 		ret = reuseport_detach_prog(sk);
1109 		break;
1110 
1111 	case SO_DETACH_FILTER:
1112 		ret = sk_detach_filter(sk);
1113 		break;
1114 
1115 	case SO_LOCK_FILTER:
1116 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1117 			ret = -EPERM;
1118 		else
1119 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1120 		break;
1121 
1122 	case SO_PASSSEC:
1123 		if (valbool)
1124 			set_bit(SOCK_PASSSEC, &sock->flags);
1125 		else
1126 			clear_bit(SOCK_PASSSEC, &sock->flags);
1127 		break;
1128 	case SO_MARK:
1129 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1130 			ret = -EPERM;
1131 			break;
1132 		}
1133 
1134 		__sock_set_mark(sk, val);
1135 		break;
1136 
1137 	case SO_RXQ_OVFL:
1138 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1139 		break;
1140 
1141 	case SO_WIFI_STATUS:
1142 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1143 		break;
1144 
1145 	case SO_PEEK_OFF:
1146 		if (sock->ops->set_peek_off)
1147 			ret = sock->ops->set_peek_off(sk, val);
1148 		else
1149 			ret = -EOPNOTSUPP;
1150 		break;
1151 
1152 	case SO_NOFCS:
1153 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1154 		break;
1155 
1156 	case SO_SELECT_ERR_QUEUE:
1157 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1158 		break;
1159 
1160 #ifdef CONFIG_NET_RX_BUSY_POLL
1161 	case SO_BUSY_POLL:
1162 		/* allow unprivileged users to decrease the value */
1163 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1164 			ret = -EPERM;
1165 		else {
1166 			if (val < 0)
1167 				ret = -EINVAL;
1168 			else
1169 				WRITE_ONCE(sk->sk_ll_usec, val);
1170 		}
1171 		break;
1172 #endif
1173 
1174 	case SO_MAX_PACING_RATE:
1175 		{
1176 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1177 
1178 		if (sizeof(ulval) != sizeof(val) &&
1179 		    optlen >= sizeof(ulval) &&
1180 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1181 			ret = -EFAULT;
1182 			break;
1183 		}
1184 		if (ulval != ~0UL)
1185 			cmpxchg(&sk->sk_pacing_status,
1186 				SK_PACING_NONE,
1187 				SK_PACING_NEEDED);
1188 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1189 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1190 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1191 		break;
1192 		}
1193 	case SO_INCOMING_CPU:
1194 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1195 		break;
1196 
1197 	case SO_CNX_ADVICE:
1198 		if (val == 1)
1199 			dst_negative_advice(sk);
1200 		break;
1201 
1202 	case SO_ZEROCOPY:
1203 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1204 			if (!((sk->sk_type == SOCK_STREAM &&
1205 			       sk->sk_protocol == IPPROTO_TCP) ||
1206 			      (sk->sk_type == SOCK_DGRAM &&
1207 			       sk->sk_protocol == IPPROTO_UDP)))
1208 				ret = -ENOTSUPP;
1209 		} else if (sk->sk_family != PF_RDS) {
1210 			ret = -ENOTSUPP;
1211 		}
1212 		if (!ret) {
1213 			if (val < 0 || val > 1)
1214 				ret = -EINVAL;
1215 			else
1216 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1217 		}
1218 		break;
1219 
1220 	case SO_TXTIME:
1221 		if (optlen != sizeof(struct sock_txtime)) {
1222 			ret = -EINVAL;
1223 			break;
1224 		} else if (copy_from_sockptr(&sk_txtime, optval,
1225 			   sizeof(struct sock_txtime))) {
1226 			ret = -EFAULT;
1227 			break;
1228 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1229 			ret = -EINVAL;
1230 			break;
1231 		}
1232 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1233 		 * scheduler has enough safeguards.
1234 		 */
1235 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1236 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1237 			ret = -EPERM;
1238 			break;
1239 		}
1240 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1241 		sk->sk_clockid = sk_txtime.clockid;
1242 		sk->sk_txtime_deadline_mode =
1243 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1244 		sk->sk_txtime_report_errors =
1245 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1246 		break;
1247 
1248 	case SO_BINDTOIFINDEX:
1249 		ret = sock_bindtoindex_locked(sk, val);
1250 		break;
1251 
1252 	default:
1253 		ret = -ENOPROTOOPT;
1254 		break;
1255 	}
1256 	release_sock(sk);
1257 	return ret;
1258 }
1259 EXPORT_SYMBOL(sock_setsockopt);
1260 
1261 static const struct cred *sk_get_peer_cred(struct sock *sk)
1262 {
1263 	const struct cred *cred;
1264 
1265 	spin_lock(&sk->sk_peer_lock);
1266 	cred = get_cred(sk->sk_peer_cred);
1267 	spin_unlock(&sk->sk_peer_lock);
1268 
1269 	return cred;
1270 }
1271 
1272 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1273 			  struct ucred *ucred)
1274 {
1275 	ucred->pid = pid_vnr(pid);
1276 	ucred->uid = ucred->gid = -1;
1277 	if (cred) {
1278 		struct user_namespace *current_ns = current_user_ns();
1279 
1280 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1281 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1282 	}
1283 }
1284 
1285 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1286 {
1287 	struct user_namespace *user_ns = current_user_ns();
1288 	int i;
1289 
1290 	for (i = 0; i < src->ngroups; i++) {
1291 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1292 
1293 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1294 			return -EFAULT;
1295 	}
1296 
1297 	return 0;
1298 }
1299 
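/* Read a socket-level option.  Fixed-size results are staged in the union
 * below, truncated to the caller-supplied length, and the length actually
 * written is returned through @optlen.
 */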
1300 static int sk_getsockopt(struct sock *sk, int level, int optname,
1301 			 sockptr_t optval, sockptr_t optlen)
1302 {
1303 	struct socket *sock = sk->sk_socket;
1304 
1305 	union {
1306 		int val;
1307 		u64 val64;
1308 		unsigned long ulval;
1309 		struct linger ling;
1310 		struct old_timeval32 tm32;
1311 		struct __kernel_old_timeval tm;
1312 		struct  __kernel_sock_timeval stm;
1313 		struct sock_txtime txtime;
1314 	} v;
1315 
1316 	int lv = sizeof(int);
1317 	int len;
1318 
1319 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1320 		return -EFAULT;
1321 	if (len < 0)
1322 		return -EINVAL;
1323 
1324 	memset(&v, 0, sizeof(v));
1325 
1326 	switch (optname) {
1327 	case SO_DEBUG:
1328 		v.val = sock_flag(sk, SOCK_DBG);
1329 		break;
1330 
1331 	case SO_DONTROUTE:
1332 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1333 		break;
1334 
1335 	case SO_BROADCAST:
1336 		v.val = sock_flag(sk, SOCK_BROADCAST);
1337 		break;
1338 
1339 	case SO_SNDBUF:
1340 		v.val = READ_ONCE(sk->sk_sndbuf);
1341 		break;
1342 
1343 	case SO_RCVBUF:
1344 		v.val = READ_ONCE(sk->sk_rcvbuf);
1345 		break;
1346 
1347 	case SO_REUSEADDR:
1348 		v.val = sk->sk_reuse;
1349 		break;
1350 
1351 	case SO_REUSEPORT:
1352 		v.val = sk->sk_reuseport;
1353 		break;
1354 
1355 	case SO_KEEPALIVE:
1356 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1357 		break;
1358 
1359 	case SO_TYPE:
1360 		v.val = sk->sk_type;
1361 		break;
1362 
1363 	case SO_PROTOCOL:
1364 		v.val = sk->sk_protocol;
1365 		break;
1366 
1367 	case SO_DOMAIN:
1368 		v.val = sk->sk_family;
1369 		break;
1370 
1371 	case SO_ERROR:
1372 		v.val = -sock_error(sk);
1373 		if (v.val == 0)
1374 			v.val = xchg(&sk->sk_err_soft, 0);
1375 		break;
1376 
1377 	case SO_OOBINLINE:
1378 		v.val = sock_flag(sk, SOCK_URGINLINE);
1379 		break;
1380 
1381 	case SO_NO_CHECK:
1382 		v.val = sk->sk_no_check_tx;
1383 		break;
1384 
1385 	case SO_PRIORITY:
1386 		v.val = sk->sk_priority;
1387 		break;
1388 
1389 	case SO_LINGER:
1390 		lv		= sizeof(v.ling);
1391 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1392 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1393 		break;
1394 
1395 	case SO_BSDCOMPAT:
1396 		break;
1397 
1398 	case SO_TIMESTAMP_OLD:
1399 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1400 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1401 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1402 		break;
1403 
1404 	case SO_TIMESTAMPNS_OLD:
1405 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1406 		break;
1407 
1408 	case SO_TIMESTAMP_NEW:
1409 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1410 		break;
1411 
1412 	case SO_TIMESTAMPNS_NEW:
1413 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1414 		break;
1415 
1416 	case SO_TIMESTAMPING_OLD:
1417 		v.val = sk->sk_tsflags;
1418 		break;
1419 
1420 	case SO_RCVTIMEO_OLD:
1421 	case SO_RCVTIMEO_NEW:
1422 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1423 		break;
1424 
1425 	case SO_SNDTIMEO_OLD:
1426 	case SO_SNDTIMEO_NEW:
1427 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1428 		break;
1429 
1430 	case SO_RCVLOWAT:
1431 		v.val = READ_ONCE(sk->sk_rcvlowat);
1432 		break;
1433 
1434 	case SO_SNDLOWAT:
1435 		v.val = 1;
1436 		break;
1437 
1438 	case SO_PASSCRED:
1439 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1440 		break;
1441 
1442 	case SO_PEERCRED:
1443 	{
1444 		struct ucred peercred;
1445 		if (len > sizeof(peercred))
1446 			len = sizeof(peercred);
1447 
1448 		spin_lock(&sk->sk_peer_lock);
1449 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1450 		spin_unlock(&sk->sk_peer_lock);
1451 
1452 		if (copy_to_sockptr(optval, &peercred, len))
1453 			return -EFAULT;
1454 		goto lenout;
1455 	}
1456 
1457 	case SO_PEERGROUPS:
1458 	{
1459 		const struct cred *cred;
1460 		int ret, n;
1461 
1462 		cred = sk_get_peer_cred(sk);
1463 		if (!cred)
1464 			return -ENODATA;
1465 
1466 		n = cred->group_info->ngroups;
1467 		if (len < n * sizeof(gid_t)) {
1468 			len = n * sizeof(gid_t);
1469 			put_cred(cred);
1470 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1471 		}
1472 		len = n * sizeof(gid_t);
1473 
1474 		ret = groups_to_user(optval, cred->group_info);
1475 		put_cred(cred);
1476 		if (ret)
1477 			return ret;
1478 		goto lenout;
1479 	}
1480 
1481 	case SO_PEERNAME:
1482 	{
1483 		char address[128];
1484 
1485 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1486 		if (lv < 0)
1487 			return -ENOTCONN;
1488 		if (lv < len)
1489 			return -EINVAL;
1490 		if (copy_to_sockptr(optval, address, len))
1491 			return -EFAULT;
1492 		goto lenout;
1493 	}
1494 
1495 	/* Dubious BSD thing... Probably nobody even uses it, but
1496 	 * the UNIX standard wants it for whatever reason... -DaveM
1497 	 */
1498 	case SO_ACCEPTCONN:
1499 		v.val = sk->sk_state == TCP_LISTEN;
1500 		break;
1501 
1502 	case SO_PASSSEC:
1503 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1504 		break;
1505 
1506 	case SO_PEERSEC:
1507 		return security_socket_getpeersec_stream(sock,
1508 							 optval, optlen, len);
1509 
1510 	case SO_MARK:
1511 		v.val = sk->sk_mark;
1512 		break;
1513 
1514 	case SO_RXQ_OVFL:
1515 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1516 		break;
1517 
1518 	case SO_WIFI_STATUS:
1519 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1520 		break;
1521 
1522 	case SO_PEEK_OFF:
1523 		if (!sock->ops->set_peek_off)
1524 			return -EOPNOTSUPP;
1525 
1526 		v.val = READ_ONCE(sk->sk_peek_off);
1527 		break;
1528 	case SO_NOFCS:
1529 		v.val = sock_flag(sk, SOCK_NOFCS);
1530 		break;
1531 
1532 	case SO_BINDTODEVICE:
1533 		return sock_getbindtodevice(sk, optval, optlen, len);
1534 
1535 	case SO_GET_FILTER:
1536 		len = sk_get_filter(sk, optval, len);
1537 		if (len < 0)
1538 			return len;
1539 
1540 		goto lenout;
1541 
1542 	case SO_LOCK_FILTER:
1543 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1544 		break;
1545 
1546 	case SO_BPF_EXTENSIONS:
1547 		v.val = bpf_tell_extensions();
1548 		break;
1549 
1550 	case SO_SELECT_ERR_QUEUE:
1551 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1552 		break;
1553 
1554 #ifdef CONFIG_NET_RX_BUSY_POLL
1555 	case SO_BUSY_POLL:
1556 		v.val = READ_ONCE(sk->sk_ll_usec);
1557 		break;
1558 #endif
1559 
1560 	case SO_MAX_PACING_RATE:
1561 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sock_setsockopt() */
1562 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1563 			lv = sizeof(v.ulval);
1564 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1565 		} else {
1566 			/* 32bit version */
1567 			v.val = min_t(unsigned long, ~0U,
1568 				      READ_ONCE(sk->sk_max_pacing_rate));
1569 		}
1570 		break;
1571 
1572 	case SO_INCOMING_CPU:
1573 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1574 		break;
1575 
1576 	case SO_MEMINFO:
1577 	{
1578 		u32 meminfo[SK_MEMINFO_VARS];
1579 
1580 		sk_get_meminfo(sk, meminfo);
1581 
1582 		len = min_t(unsigned int, len, sizeof(meminfo));
1583 		if (copy_to_sockptr(optval, &meminfo, len))
1584 			return -EFAULT;
1585 
1586 		goto lenout;
1587 	}
1588 
1589 #ifdef CONFIG_NET_RX_BUSY_POLL
1590 	case SO_INCOMING_NAPI_ID:
1591 		v.val = READ_ONCE(sk->sk_napi_id);
1592 
1593 		/* aggregate non-NAPI IDs down to 0 */
1594 		if (v.val < MIN_NAPI_ID)
1595 			v.val = 0;
1596 
1597 		break;
1598 #endif
1599 
1600 	case SO_COOKIE:
1601 		lv = sizeof(u64);
1602 		if (len < lv)
1603 			return -EINVAL;
1604 		v.val64 = sock_gen_cookie(sk);
1605 		break;
1606 
1607 	case SO_ZEROCOPY:
1608 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1609 		break;
1610 
1611 	case SO_TXTIME:
1612 		lv = sizeof(v.txtime);
1613 		v.txtime.clockid = sk->sk_clockid;
1614 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1615 				  SOF_TXTIME_DEADLINE_MODE : 0;
1616 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1617 				  SOF_TXTIME_REPORT_ERRORS : 0;
1618 		break;
1619 
1620 	case SO_BINDTOIFINDEX:
1621 		v.val = sk->sk_bound_dev_if;
1622 		break;
1623 
1624 	case SO_NETNS_COOKIE:
1625 		lv = sizeof(u64);
1626 		if (len != lv)
1627 			return -EINVAL;
1628 		v.val64 = atomic64_read(&sock_net(sk)->net_cookie);
1629 		break;
1630 
1631 	default:
1632 		/* We implement the SO_SNDLOWAT etc. to not be settable
1633 		 * (1003.1g 7).
1634 		 */
1635 		return -ENOPROTOOPT;
1636 	}
1637 
1638 	if (len > lv)
1639 		len = lv;
1640 	if (copy_to_sockptr(optval, &v, len))
1641 		return -EFAULT;
1642 lenout:
1643 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
1644 		return -EFAULT;
1645 	return 0;
1646 }
1647 
1648 int sock_getsockopt(struct socket *sock, int level, int optname,
1649 		    char __user *optval, int __user *optlen)
1650 {
1651 	return sk_getsockopt(sock->sk, level, optname,
1652 			     USER_SOCKPTR(optval),
1653 			     USER_SOCKPTR(optlen));
1654 }
1655 
1656 /*
1657  * Initialize an sk_lock.
1658  *
1659  * (We also register the sk_lock with the lock validator.)
1660  */
1661 static inline void sock_lock_init(struct sock *sk)
1662 {
1663 	if (sk->sk_kern_sock)
1664 		sock_lock_init_class_and_name(
1665 			sk,
1666 			af_family_kern_slock_key_strings[sk->sk_family],
1667 			af_family_kern_slock_keys + sk->sk_family,
1668 			af_family_kern_key_strings[sk->sk_family],
1669 			af_family_kern_keys + sk->sk_family);
1670 	else
1671 		sock_lock_init_class_and_name(
1672 			sk,
1673 			af_family_slock_key_strings[sk->sk_family],
1674 			af_family_slock_keys + sk->sk_family,
1675 			af_family_key_strings[sk->sk_family],
1676 			af_family_keys + sk->sk_family);
1677 }
1678 
1679 /*
1680  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1681  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1682  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1683  */
1684 static void sock_copy(struct sock *nsk, const struct sock *osk)
1685 {
1686 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1687 #ifdef CONFIG_SECURITY_NETWORK
1688 	void *sptr = nsk->sk_security;
1689 #endif
1690 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1691 
1692 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1693 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1694 
1695 #ifdef CONFIG_SECURITY_NETWORK
1696 	nsk->sk_security = sptr;
1697 	security_sk_clone(osk, nsk);
1698 #endif
1699 }
1700 
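/* Allocate a struct sock from the protocol's dedicated slab cache when one
 * exists, falling back to kmalloc() otherwise; run the LSM allocation hook
 * and pin the protocol's owning module.  Undone by sk_prot_free().
 */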
1701 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1702 		int family)
1703 {
1704 	struct sock *sk;
1705 	struct kmem_cache *slab;
1706 
1707 	slab = prot->slab;
1708 	if (slab != NULL) {
1709 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1710 		if (!sk)
1711 			return sk;
1712 		if (want_init_on_alloc(priority))
1713 			sk_prot_clear_nulls(sk, prot->obj_size);
1714 	} else
1715 		sk = kmalloc(prot->obj_size, priority);
1716 
1717 	if (sk != NULL) {
1718 		if (security_sk_alloc(sk, family, priority))
1719 			goto out_free;
1720 
1721 		if (!try_module_get(prot->owner))
1722 			goto out_free_sec;
1723 		sk_tx_queue_clear(sk);
1724 	}
1725 
1726 	return sk;
1727 
1728 out_free_sec:
1729 	security_sk_free(sk);
1730 out_free:
1731 	if (slab != NULL)
1732 		kmem_cache_free(slab, sk);
1733 	else
1734 		kfree(sk);
1735 	return NULL;
1736 }
1737 
1738 static void sk_prot_free(struct proto *prot, struct sock *sk)
1739 {
1740 	struct kmem_cache *slab;
1741 	struct module *owner;
1742 
1743 	owner = prot->owner;
1744 	slab = prot->slab;
1745 
1746 	cgroup_sk_free(&sk->sk_cgrp_data);
1747 	mem_cgroup_sk_free(sk);
1748 	security_sk_free(sk);
1749 	if (slab != NULL)
1750 		kmem_cache_free(slab, sk);
1751 	else
1752 		kfree(sk);
1753 	module_put(owner);
1754 }
1755 
1756 /**
1757  *	sk_alloc - All socket objects are allocated here
1758  *	@net: the applicable net namespace
1759  *	@family: protocol family
1760  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1761  *	@prot: struct proto associated with this new sock instance
1762  *	@kern: is this to be a kernel socket?
1763  */
1764 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1765 		      struct proto *prot, int kern)
1766 {
1767 	struct sock *sk;
1768 
1769 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1770 	if (sk) {
1771 		sk->sk_family = family;
1772 		/*
1773 		 * See comment in struct sock definition to understand
1774 		 * why we need sk_prot_creator -acme
1775 		 */
1776 		sk->sk_prot = sk->sk_prot_creator = prot;
1777 		sk->sk_kern_sock = kern;
1778 		sock_lock_init(sk);
1779 		sk->sk_net_refcnt = kern ? 0 : 1;
1780 		if (likely(sk->sk_net_refcnt)) {
1781 			get_net(net);
1782 			sock_inuse_add(net, 1);
1783 		}
1784 
1785 		sock_net_set(sk, net);
1786 		refcount_set(&sk->sk_wmem_alloc, 1);
1787 
1788 		mem_cgroup_sk_alloc(sk);
1789 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1790 		sock_update_classid(&sk->sk_cgrp_data);
1791 		sock_update_netprioidx(&sk->sk_cgrp_data);
1792 		sk_tx_queue_clear(sk);
1793 	}
1794 
1795 	return sk;
1796 }
1797 EXPORT_SYMBOL(sk_alloc);
1798 
1799 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1800  * grace period. This is the case for UDP sockets and TCP listeners.
1801  */
1802 static void __sk_destruct(struct rcu_head *head)
1803 {
1804 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1805 	struct sk_filter *filter;
1806 
1807 	if (sk->sk_destruct)
1808 		sk->sk_destruct(sk);
1809 
1810 	filter = rcu_dereference_check(sk->sk_filter,
1811 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1812 	if (filter) {
1813 		sk_filter_uncharge(sk, filter);
1814 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1815 	}
1816 
1817 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1818 
1819 #ifdef CONFIG_BPF_SYSCALL
1820 	bpf_sk_storage_free(sk);
1821 #endif
1822 
1823 	if (atomic_read(&sk->sk_omem_alloc))
1824 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1825 			 __func__, atomic_read(&sk->sk_omem_alloc));
1826 
1827 	if (sk->sk_frag.page) {
1828 		put_page(sk->sk_frag.page);
1829 		sk->sk_frag.page = NULL;
1830 	}
1831 
1832 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1833 	put_cred(sk->sk_peer_cred);
1834 	put_pid(sk->sk_peer_pid);
1835 
1836 	if (likely(sk->sk_net_refcnt))
1837 		put_net(sock_net(sk));
1838 	sk_prot_free(sk->sk_prot_creator, sk);
1839 }
1840 
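/* Final teardown entry point.  Destruction is deferred through an RCU grace
 * period when SOCK_RCU_FREE is set or the socket belongs to a reuseport
 * group; otherwise __sk_destruct() runs immediately.
 */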
1841 void sk_destruct(struct sock *sk)
1842 {
1843 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1844 
1845 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1846 		reuseport_detach_sock(sk);
1847 		use_call_rcu = true;
1848 	}
1849 
1850 	if (use_call_rcu)
1851 		call_rcu(&sk->sk_rcu, __sk_destruct);
1852 	else
1853 		__sk_destruct(&sk->sk_rcu);
1854 }
1855 
1856 static void __sk_free(struct sock *sk)
1857 {
1858 	if (likely(sk->sk_net_refcnt))
1859 		sock_inuse_add(sock_net(sk), -1);
1860 
1861 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1862 		sock_diag_broadcast_destroy(sk);
1863 	else
1864 		sk_destruct(sk);
1865 }
1866 
1867 void sk_free(struct sock *sk)
1868 {
1869 	/*
1870 	 * We subtract one from sk_wmem_alloc; if the result is not zero,
1871 	 * some packets are still in some tx queue and sock_wfree()
1872 	 * will call __sk_free(sk) later.
1873 	 */
1874 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1875 		__sk_free(sk);
1876 }
1877 EXPORT_SYMBOL(sk_free);
1878 
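/* Initialize the fields shared by all socket families: the receive, write
 * and error queues, the callback lock, and their per-family lockdep classes.
 */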
1879 static void sk_init_common(struct sock *sk)
1880 {
1881 	skb_queue_head_init(&sk->sk_receive_queue);
1882 	skb_queue_head_init(&sk->sk_write_queue);
1883 	skb_queue_head_init(&sk->sk_error_queue);
1884 
1885 	rwlock_init(&sk->sk_callback_lock);
1886 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1887 			af_rlock_keys + sk->sk_family,
1888 			af_family_rlock_key_strings[sk->sk_family]);
1889 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1890 			af_wlock_keys + sk->sk_family,
1891 			af_family_wlock_key_strings[sk->sk_family]);
1892 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1893 			af_elock_keys + sk->sk_family,
1894 			af_family_elock_key_strings[sk->sk_family]);
1895 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1896 			af_callback_keys + sk->sk_family,
1897 			af_family_clock_key_strings[sk->sk_family]);
1898 }
1899 
1900 /**
1901  *	sk_clone_lock - clone a socket, and lock its clone
1902  *	@sk: the socket to clone
1903  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1904  *
1905  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1906  */
1907 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1908 {
1909 	struct proto *prot = READ_ONCE(sk->sk_prot);
1910 	struct sk_filter *filter;
1911 	bool is_charged = true;
1912 	struct sock *newsk;
1913 
1914 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1915 	if (!newsk)
1916 		goto out;
1917 
1918 	sock_copy(newsk, sk);
1919 
1920 	newsk->sk_prot_creator = prot;
1921 
1922 	/* SANITY */
1923 	if (likely(newsk->sk_net_refcnt)) {
1924 		get_net(sock_net(newsk));
1925 		sock_inuse_add(sock_net(newsk), 1);
1926 	}
1927 	sk_node_init(&newsk->sk_node);
1928 	sock_lock_init(newsk);
1929 	bh_lock_sock(newsk);
1930 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1931 	newsk->sk_backlog.len = 0;
1932 
1933 	atomic_set(&newsk->sk_rmem_alloc, 0);
1934 
1935 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
1936 	refcount_set(&newsk->sk_wmem_alloc, 1);
1937 
1938 	atomic_set(&newsk->sk_omem_alloc, 0);
1939 	sk_init_common(newsk);
1940 
1941 	newsk->sk_dst_cache	= NULL;
1942 	newsk->sk_dst_pending_confirm = 0;
1943 	newsk->sk_wmem_queued	= 0;
1944 	newsk->sk_forward_alloc = 0;
1945 	atomic_set(&newsk->sk_drops, 0);
1946 	newsk->sk_send_head	= NULL;
1947 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1948 	atomic_set(&newsk->sk_zckey, 0);
1949 
1950 	sock_reset_flag(newsk, SOCK_DONE);
1951 
1952 	/* sk->sk_memcg will be populated at accept() time */
1953 	newsk->sk_memcg = NULL;
1954 
1955 	cgroup_sk_clone(&newsk->sk_cgrp_data);
1956 
1957 	rcu_read_lock();
1958 	filter = rcu_dereference(sk->sk_filter);
1959 	if (filter != NULL)
1960 		/* though it's an empty new sock, the charging may fail
1961 		 * if sysctl_optmem_max was changed between creation of
1962 		 * original socket and cloning
1963 		 */
1964 		is_charged = sk_filter_charge(newsk, filter);
1965 	RCU_INIT_POINTER(newsk->sk_filter, filter);
1966 	rcu_read_unlock();
1967 
1968 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1969 		/* We need to make sure that we don't uncharge the new
1970 		 * socket if we couldn't charge it in the first place
1971 		 * as otherwise we uncharge the parent's filter.
1972 		 */
1973 		if (!is_charged)
1974 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
1975 		sk_free_unlock_clone(newsk);
1976 		newsk = NULL;
1977 		goto out;
1978 	}
1979 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1980 
1981 	if (bpf_sk_storage_clone(sk, newsk)) {
1982 		sk_free_unlock_clone(newsk);
1983 		newsk = NULL;
1984 		goto out;
1985 	}
1986 
1987 	/* Clear sk_user_data if parent had the pointer tagged
1988 	 * as not suitable for copying when cloning.
1989 	 */
1990 	if (sk_user_data_is_nocopy(newsk))
1991 		newsk->sk_user_data = NULL;
1992 
1993 	newsk->sk_err	   = 0;
1994 	newsk->sk_err_soft = 0;
1995 	newsk->sk_priority = 0;
1996 	newsk->sk_incoming_cpu = raw_smp_processor_id();
1997 
1998 	/* Before updating sk_refcnt, we must commit prior changes to memory
1999 	 * (Documentation/RCU/rculist_nulls.rst for details)
2000 	 */
2001 	smp_wmb();
2002 	refcount_set(&newsk->sk_refcnt, 2);
2003 
2004 	/* Increment the counter in the same struct proto as the master
2005 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2006 	 * is the same as sk->sk_prot->socks, as this field was copied
2007 	 * with memcpy).
2008 	 *
2009 	 * This _changes_ the previous behaviour, where
2010 	 * tcp_create_openreq_child always was incrementing the
2011  * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2012 	 * to be taken into account in all callers. -acme
2013 	 */
2014 	sk_refcnt_debug_inc(newsk);
2015 	sk_set_socket(newsk, NULL);
2016 	sk_tx_queue_clear(newsk);
2017 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2018 
2019 	if (newsk->sk_prot->sockets_allocated)
2020 		sk_sockets_allocated_inc(newsk);
2021 
2022 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2023 		net_enable_timestamp();
2024 out:
2025 	return newsk;
2026 }
2027 EXPORT_SYMBOL_GPL(sk_clone_lock);
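/*
 * Editor's note -- a minimal usage sketch (not taken from this file): a
 * protocol cloning a socket for a new connection is expected to unlock the
 * clone itself, per the kernel-doc above; the protocol-specific init step
 * is only indicated here.
 *
 *	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *
 *	if (newsk) {
 *		... protocol-specific initialisation of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */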
2028 
2029 void sk_free_unlock_clone(struct sock *sk)
2030 {
2031 	/* It is still a raw copy of the parent, so invalidate the
2032 	 * destructor and do a plain sk_free() */
2033 	sk->sk_destruct = NULL;
2034 	bh_unlock_sock(sk);
2035 	sk_free(sk);
2036 }
2037 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2038 
2039 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2040 {
2041 	u32 max_segs = 1;
2042 
2043 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2044 	if (sk->sk_route_caps & NETIF_F_GSO)
2045 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2046 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2047 	if (sk_can_gso(sk)) {
2048 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2049 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2050 		} else {
2051 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2052 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2053 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2054 		}
2055 	}
2056 	sk->sk_gso_max_segs = max_segs;
2057 	sk_dst_set(sk, dst);
2058 }
2059 EXPORT_SYMBOL_GPL(sk_setup_caps);
2060 
2061 /*
2062  *	Simple resource managers for sockets.
2063  */
2064 
2065 
2066 /*
2067  * Write buffer destructor automatically called from kfree_skb.
2068  */
2069 void sock_wfree(struct sk_buff *skb)
2070 {
2071 	struct sock *sk = skb->sk;
2072 	unsigned int len = skb->truesize;
2073 
2074 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2075 		/*
2076 		 * Keep a reference on sk_wmem_alloc; it will be released
2077 		 * after the sk_write_space() call.
2078 		 */
2079 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2080 		sk->sk_write_space(sk);
2081 		len = 1;
2082 	}
2083 	/*
2084 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2085 	 * could not do because of in-flight packets
2086 	 */
2087 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2088 		__sk_free(sk);
2089 }
2090 EXPORT_SYMBOL(sock_wfree);
2091 
2092 /* This variant of sock_wfree() is used by TCP,
2093  * since it sets SOCK_USE_WRITE_QUEUE.
2094  */
2095 void __sock_wfree(struct sk_buff *skb)
2096 {
2097 	struct sock *sk = skb->sk;
2098 
2099 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2100 		__sk_free(sk);
2101 }
2102 
2103 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2104 {
2105 	skb_orphan(skb);
2106 	skb->sk = sk;
2107 #ifdef CONFIG_INET
2108 	if (unlikely(!sk_fullsock(sk))) {
2109 		skb->destructor = sock_edemux;
2110 		sock_hold(sk);
2111 		return;
2112 	}
2113 #endif
2114 	skb->destructor = sock_wfree;
2115 	skb_set_hash_from_sk(skb, sk);
2116 	/*
2117 	 * We used to take a refcount on sk, but the following operation
2118 	 * is enough to guarantee sk_free() won't free this sock until
2119 	 * all in-flight packets are completed.
2120 	 */
2121 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2122 }
2123 EXPORT_SYMBOL(skb_set_owner_w);
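/*
 * Editor's note -- illustrative sketch of a sender charging a freshly
 * allocated skb to the socket's write memory; once owned, kfree_skb()
 * ends up in sock_wfree() and gives the memory back.
 *
 *	skb = alloc_skb(len, sk->sk_allocation);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);	(charges skb->truesize to sk_wmem_alloc)
 */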
2124 
2125 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2126 {
2127 #ifdef CONFIG_TLS_DEVICE
2128 	/* Drivers depend on in-order delivery for crypto offload,
2129 	 * partial orphan breaks out-of-order-OK logic.
2130 	 */
2131 	if (skb->decrypted)
2132 		return false;
2133 #endif
2134 	return (skb->destructor == sock_wfree ||
2135 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2136 }
2137 
2138 /* This helper is used by netem, as it can hold packets in its
2139  * delay queue. We want to allow the owner socket to send more
2140  * packets, as if they were already TX completed by a typical driver.
2141  * But we also want to keep skb->sk set because some packet schedulers
2142  * rely on it (sch_fq for example).
2143  */
2144 void skb_orphan_partial(struct sk_buff *skb)
2145 {
2146 	if (skb_is_tcp_pure_ack(skb))
2147 		return;
2148 
2149 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2150 		return;
2151 
2152 	skb_orphan(skb);
2153 }
2154 EXPORT_SYMBOL(skb_orphan_partial);
2155 
2156 /*
2157  * Read buffer destructor automatically called from kfree_skb.
2158  */
2159 void sock_rfree(struct sk_buff *skb)
2160 {
2161 	struct sock *sk = skb->sk;
2162 	unsigned int len = skb->truesize;
2163 
2164 	atomic_sub(len, &sk->sk_rmem_alloc);
2165 	sk_mem_uncharge(sk, len);
2166 }
2167 EXPORT_SYMBOL(sock_rfree);
2168 
2169 /*
2170  * Buffer destructor for skbs that are not used directly in read or write
2171  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2172  */
2173 void sock_efree(struct sk_buff *skb)
2174 {
2175 	sock_put(skb->sk);
2176 }
2177 EXPORT_SYMBOL(sock_efree);
2178 
2179 /* Buffer destructor for prefetch/receive path where reference count may
2180  * not be held, e.g. for listen sockets.
2181  */
2182 #ifdef CONFIG_INET
2183 void sock_pfree(struct sk_buff *skb)
2184 {
2185 	if (sk_is_refcounted(skb->sk))
2186 		sock_gen_put(skb->sk);
2187 }
2188 EXPORT_SYMBOL(sock_pfree);
2189 #endif /* CONFIG_INET */
2190 
2191 kuid_t sock_i_uid(struct sock *sk)
2192 {
2193 	kuid_t uid;
2194 
2195 	read_lock_bh(&sk->sk_callback_lock);
2196 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2197 	read_unlock_bh(&sk->sk_callback_lock);
2198 	return uid;
2199 }
2200 EXPORT_SYMBOL(sock_i_uid);
2201 
2202 unsigned long __sock_i_ino(struct sock *sk)
2203 {
2204 	unsigned long ino;
2205 
2206 	read_lock(&sk->sk_callback_lock);
2207 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2208 	read_unlock(&sk->sk_callback_lock);
2209 	return ino;
2210 }
2211 EXPORT_SYMBOL(__sock_i_ino);
2212 
2213 unsigned long sock_i_ino(struct sock *sk)
2214 {
2215 	unsigned long ino;
2216 
2217 	local_bh_disable();
2218 	ino = __sock_i_ino(sk);
2219 	local_bh_enable();
2220 	return ino;
2221 }
2222 EXPORT_SYMBOL(sock_i_ino);
2223 
2224 /*
2225  * Allocate a skb from the socket's send buffer.
2226  */
2227 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2228 			     gfp_t priority)
2229 {
2230 	if (force ||
2231 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2232 		struct sk_buff *skb = alloc_skb(size, priority);
2233 
2234 		if (skb) {
2235 			skb_set_owner_w(skb, sk);
2236 			return skb;
2237 		}
2238 	}
2239 	return NULL;
2240 }
2241 EXPORT_SYMBOL(sock_wmalloc);
2242 
2243 static void sock_ofree(struct sk_buff *skb)
2244 {
2245 	struct sock *sk = skb->sk;
2246 
2247 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2248 }
2249 
2250 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2251 			     gfp_t priority)
2252 {
2253 	struct sk_buff *skb;
2254 
2255 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2256 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2257 	    READ_ONCE(sysctl_optmem_max))
2258 		return NULL;
2259 
2260 	skb = alloc_skb(size, priority);
2261 	if (!skb)
2262 		return NULL;
2263 
2264 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2265 	skb->sk = sk;
2266 	skb->destructor = sock_ofree;
2267 	return skb;
2268 }
2269 
2270 /*
2271  * Allocate a memory block from the socket's option memory buffer.
2272  */
2273 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2274 {
2275 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2276 
2277 	if ((unsigned int)size <= optmem_max &&
2278 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2279 		void *mem;
2280 		/* First do the add, to avoid the race if kmalloc
2281 		 * might sleep.
2282 		 */
2283 		atomic_add(size, &sk->sk_omem_alloc);
2284 		mem = kmalloc(size, priority);
2285 		if (mem)
2286 			return mem;
2287 		atomic_sub(size, &sk->sk_omem_alloc);
2288 	}
2289 	return NULL;
2290 }
2291 EXPORT_SYMBOL(sock_kmalloc);
2292 
2293 /* Free an option memory block. Note, we actually want the inline
2294  * here as this allows gcc to detect the nullify and fold away the
2295  * condition entirely.
2296  */
2297 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2298 				  const bool nullify)
2299 {
2300 	if (WARN_ON_ONCE(!mem))
2301 		return;
2302 	if (nullify)
2303 		kfree_sensitive(mem);
2304 	else
2305 		kfree(mem);
2306 	atomic_sub(size, &sk->sk_omem_alloc);
2307 }
2308 
2309 void sock_kfree_s(struct sock *sk, void *mem, int size)
2310 {
2311 	__sock_kfree_s(sk, mem, size, false);
2312 }
2313 EXPORT_SYMBOL(sock_kfree_s);
2314 
2315 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2316 {
2317 	__sock_kfree_s(sk, mem, size, true);
2318 }
2319 EXPORT_SYMBOL(sock_kzfree_s);
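/*
 * Editor's note -- hedged usage sketch for the option-memory helpers above;
 * "len" and the error value are placeholders, not taken from this file.
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	... use buf as per-socket option state ...
 *	sock_kzfree_s(sk, buf, len);	(or sock_kfree_s() if not sensitive)
 */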
2320 
2321 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2322    I think these locks should be removed for datagram sockets.
2323  */
2324 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2325 {
2326 	DEFINE_WAIT(wait);
2327 
2328 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2329 	for (;;) {
2330 		if (!timeo)
2331 			break;
2332 		if (signal_pending(current))
2333 			break;
2334 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2335 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2336 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2337 			break;
2338 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2339 			break;
2340 		if (READ_ONCE(sk->sk_err))
2341 			break;
2342 		timeo = schedule_timeout(timeo);
2343 	}
2344 	finish_wait(sk_sleep(sk), &wait);
2345 	return timeo;
2346 }
2347 
2348 
2349 /*
2350  *	Generic send/receive buffer handlers
2351  */
2352 
2353 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2354 				     unsigned long data_len, int noblock,
2355 				     int *errcode, int max_page_order)
2356 {
2357 	struct sk_buff *skb;
2358 	long timeo;
2359 	int err;
2360 
2361 	timeo = sock_sndtimeo(sk, noblock);
2362 	for (;;) {
2363 		err = sock_error(sk);
2364 		if (err != 0)
2365 			goto failure;
2366 
2367 		err = -EPIPE;
2368 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2369 			goto failure;
2370 
2371 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2372 			break;
2373 
2374 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2375 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2376 		err = -EAGAIN;
2377 		if (!timeo)
2378 			goto failure;
2379 		if (signal_pending(current))
2380 			goto interrupted;
2381 		timeo = sock_wait_for_wmem(sk, timeo);
2382 	}
2383 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2384 				   errcode, sk->sk_allocation);
2385 	if (skb)
2386 		skb_set_owner_w(skb, sk);
2387 	return skb;
2388 
2389 interrupted:
2390 	err = sock_intr_errno(timeo);
2391 failure:
2392 	*errcode = err;
2393 	return NULL;
2394 }
2395 EXPORT_SYMBOL(sock_alloc_send_pskb);
2396 
2397 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2398 				    int noblock, int *errcode)
2399 {
2400 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2401 }
2402 EXPORT_SYMBOL(sock_alloc_send_skb);
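/*
 * Editor's note -- a sketch of how a datagram sendmsg() path might use the
 * helper above; "hlen" and "err" are illustrative placeholders.
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;	(err is -EAGAIN, -EPIPE, sock_error(), ...)
 */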
2403 
2404 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2405 		     struct sockcm_cookie *sockc)
2406 {
2407 	u32 tsflags;
2408 
2409 	switch (cmsg->cmsg_type) {
2410 	case SO_MARK:
2411 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2412 			return -EPERM;
2413 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2414 			return -EINVAL;
2415 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2416 		break;
2417 	case SO_TIMESTAMPING_OLD:
2418 	case SO_TIMESTAMPING_NEW:
2419 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2420 			return -EINVAL;
2421 
2422 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2423 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2424 			return -EINVAL;
2425 
2426 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2427 		sockc->tsflags |= tsflags;
2428 		break;
2429 	case SCM_TXTIME:
2430 		if (!sock_flag(sk, SOCK_TXTIME))
2431 			return -EINVAL;
2432 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2433 			return -EINVAL;
2434 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2435 		break;
2436 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2437 	case SCM_RIGHTS:
2438 	case SCM_CREDENTIALS:
2439 		break;
2440 	default:
2441 		return -EINVAL;
2442 	}
2443 	return 0;
2444 }
2445 EXPORT_SYMBOL(__sock_cmsg_send);
2446 
2447 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2448 		   struct sockcm_cookie *sockc)
2449 {
2450 	struct cmsghdr *cmsg;
2451 	int ret;
2452 
2453 	for_each_cmsghdr(cmsg, msg) {
2454 		if (!CMSG_OK(msg, cmsg))
2455 			return -EINVAL;
2456 		if (cmsg->cmsg_level != SOL_SOCKET)
2457 			continue;
2458 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2459 		if (ret)
2460 			return ret;
2461 	}
2462 	return 0;
2463 }
2464 EXPORT_SYMBOL(sock_cmsg_send);
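/*
 * Editor's note -- hedged sketch of the usual caller pattern: seed a
 * sockcm_cookie from the socket defaults, then let sock_cmsg_send()
 * override it from SOL_SOCKET control messages.
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */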
2465 
2466 static void sk_enter_memory_pressure(struct sock *sk)
2467 {
2468 	if (!sk->sk_prot->enter_memory_pressure)
2469 		return;
2470 
2471 	sk->sk_prot->enter_memory_pressure(sk);
2472 }
2473 
2474 static void sk_leave_memory_pressure(struct sock *sk)
2475 {
2476 	if (sk->sk_prot->leave_memory_pressure) {
2477 		sk->sk_prot->leave_memory_pressure(sk);
2478 	} else {
2479 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2480 
2481 		if (memory_pressure && READ_ONCE(*memory_pressure))
2482 			WRITE_ONCE(*memory_pressure, 0);
2483 	}
2484 }
2485 
2486 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2487 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2488 
2489 /**
2490  * skb_page_frag_refill - check that a page_frag contains enough room
2491  * @sz: minimum size of the fragment we want to get
2492  * @pfrag: pointer to page_frag
2493  * @gfp: priority for memory allocation
2494  *
2495  * Note: While this allocator tries to use high order pages, there is
2496  * no guarantee that allocations succeed. Therefore, @sz MUST be
2497  * less than or equal to PAGE_SIZE.
2498  */
2499 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2500 {
2501 	if (pfrag->page) {
2502 		if (page_ref_count(pfrag->page) == 1) {
2503 			pfrag->offset = 0;
2504 			return true;
2505 		}
2506 		if (pfrag->offset + sz <= pfrag->size)
2507 			return true;
2508 		put_page(pfrag->page);
2509 	}
2510 
2511 	pfrag->offset = 0;
2512 	if (SKB_FRAG_PAGE_ORDER &&
2513 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2514 		/* Avoid direct reclaim but allow kswapd to wake */
2515 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2516 					  __GFP_COMP | __GFP_NOWARN |
2517 					  __GFP_NORETRY,
2518 					  SKB_FRAG_PAGE_ORDER);
2519 		if (likely(pfrag->page)) {
2520 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2521 			return true;
2522 		}
2523 	}
2524 	pfrag->page = alloc_page(gfp);
2525 	if (likely(pfrag->page)) {
2526 		pfrag->size = PAGE_SIZE;
2527 		return true;
2528 	}
2529 	return false;
2530 }
2531 EXPORT_SYMBOL(skb_page_frag_refill);
2532 
2533 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2534 {
2535 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2536 		return true;
2537 
2538 	sk_enter_memory_pressure(sk);
2539 	sk_stream_moderate_sndbuf(sk);
2540 	return false;
2541 }
2542 EXPORT_SYMBOL(sk_page_frag_refill);
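/*
 * Editor's note -- illustrative sketch: sendmsg() implementations usually
 * pair this with sk_page_frag() and then copy into the returned fragment.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	... copy up to (pfrag->size - pfrag->offset) bytes at
 *	    page_address(pfrag->page) + pfrag->offset, then advance
 *	    pfrag->offset and take a page reference for the skb frag ...
 */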
2543 
2544 static void __lock_sock(struct sock *sk)
2545 	__releases(&sk->sk_lock.slock)
2546 	__acquires(&sk->sk_lock.slock)
2547 {
2548 	DEFINE_WAIT(wait);
2549 
2550 	for (;;) {
2551 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2552 					TASK_UNINTERRUPTIBLE);
2553 		spin_unlock_bh(&sk->sk_lock.slock);
2554 		schedule();
2555 		spin_lock_bh(&sk->sk_lock.slock);
2556 		if (!sock_owned_by_user(sk))
2557 			break;
2558 	}
2559 	finish_wait(&sk->sk_lock.wq, &wait);
2560 }
2561 
2562 void __release_sock(struct sock *sk)
2563 	__releases(&sk->sk_lock.slock)
2564 	__acquires(&sk->sk_lock.slock)
2565 {
2566 	struct sk_buff *skb, *next;
2567 
2568 	while ((skb = sk->sk_backlog.head) != NULL) {
2569 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2570 
2571 		spin_unlock_bh(&sk->sk_lock.slock);
2572 
2573 		do {
2574 			next = skb->next;
2575 			prefetch(next);
2576 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2577 			skb_mark_not_on_list(skb);
2578 			sk_backlog_rcv(sk, skb);
2579 
2580 			cond_resched();
2581 
2582 			skb = next;
2583 		} while (skb != NULL);
2584 
2585 		spin_lock_bh(&sk->sk_lock.slock);
2586 	}
2587 
2588 	/*
2589 	 * Doing the zeroing here guarantees we cannot loop forever
2590 	 * while a wild producer attempts to flood us.
2591 	 */
2592 	sk->sk_backlog.len = 0;
2593 }
2594 
2595 void __sk_flush_backlog(struct sock *sk)
2596 {
2597 	spin_lock_bh(&sk->sk_lock.slock);
2598 	__release_sock(sk);
2599 	spin_unlock_bh(&sk->sk_lock.slock);
2600 }
2601 
2602 /**
2603  * sk_wait_data - wait for data to arrive at sk_receive_queue
2604  * @sk:    sock to wait on
2605  * @timeo: for how long
2606  * @skb:   last skb seen on sk_receive_queue
2607  *
2608  * Now socket state including sk->sk_err is changed only under lock,
2609  * hence we may omit checks after joining wait queue.
2610  * We check the receive queue before schedule() only as an optimization;
2611  * it is very likely that release_sock() added new data.
2612  */
2613 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2614 {
2615 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2616 	int rc;
2617 
2618 	add_wait_queue(sk_sleep(sk), &wait);
2619 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2620 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2621 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2622 	remove_wait_queue(sk_sleep(sk), &wait);
2623 	return rc;
2624 }
2625 EXPORT_SYMBOL(sk_wait_data);
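/*
 * Editor's note -- hedged sketch of a blocking recvmsg() loop built on the
 * helper above; error handling is elided.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */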
2626 
2627 /**
2628  *	__sk_mem_raise_allocated - increase memory_allocated
2629  *	@sk: socket
2630  *	@size: memory size to allocate
2631  *	@amt: pages to allocate
2632  *	@kind: allocation type
2633  *
2634  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2635  */
2636 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2637 {
2638 	struct proto *prot = sk->sk_prot;
2639 	long allocated = sk_memory_allocated_add(sk, amt);
2640 	bool charged = true;
2641 
2642 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2643 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2644 		goto suppress_allocation;
2645 
2646 	/* Under limit. */
2647 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2648 		sk_leave_memory_pressure(sk);
2649 		return 1;
2650 	}
2651 
2652 	/* Under pressure. */
2653 	if (allocated > sk_prot_mem_limits(sk, 1))
2654 		sk_enter_memory_pressure(sk);
2655 
2656 	/* Over hard limit. */
2657 	if (allocated > sk_prot_mem_limits(sk, 2))
2658 		goto suppress_allocation;
2659 
2660 	/* guarantee minimum buffer size under pressure */
2661 	if (kind == SK_MEM_RECV) {
2662 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2663 			return 1;
2664 
2665 	} else { /* SK_MEM_SEND */
2666 		int wmem0 = sk_get_wmem0(sk, prot);
2667 
2668 		if (sk->sk_type == SOCK_STREAM) {
2669 			if (sk->sk_wmem_queued < wmem0)
2670 				return 1;
2671 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2672 				return 1;
2673 		}
2674 	}
2675 
2676 	if (sk_has_memory_pressure(sk)) {
2677 		u64 alloc;
2678 
2679 		if (!sk_under_memory_pressure(sk))
2680 			return 1;
2681 		alloc = sk_sockets_allocated_read_positive(sk);
2682 		if (sk_prot_mem_limits(sk, 2) > alloc *
2683 		    sk_mem_pages(sk->sk_wmem_queued +
2684 				 atomic_read(&sk->sk_rmem_alloc) +
2685 				 sk->sk_forward_alloc))
2686 			return 1;
2687 	}
2688 
2689 suppress_allocation:
2690 
2691 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2692 		sk_stream_moderate_sndbuf(sk);
2693 
2694 		/* Fail only if socket is _under_ its sndbuf.
2695 		 * In this case we cannot block, so that we have to fail.
2696 		 */
2697 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2698 			return 1;
2699 	}
2700 
2701 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2702 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2703 
2704 	sk_memory_allocated_sub(sk, amt);
2705 
2706 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2707 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2708 
2709 	return 0;
2710 }
2711 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2712 
2713 /**
2714  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2715  *	@sk: socket
2716  *	@size: memory size to allocate
2717  *	@kind: allocation type
2718  *
2719  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2720  *	rmem allocation. This function assumes that protocols which have
2721  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2722  */
2723 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2724 {
2725 	int ret, amt = sk_mem_pages(size);
2726 
2727 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2728 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2729 	if (!ret)
2730 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2731 	return ret;
2732 }
2733 EXPORT_SYMBOL(__sk_mem_schedule);
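/*
 * Editor's note -- illustrative charging pattern, as typically done through
 * the sk_wmem_schedule()/sk_mem_charge() inlines in include/net/sock.h:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))	(may call __sk_mem_schedule)
 *		return -ENOBUFS;
 *	sk_mem_charge(sk, skb->truesize);		(consumes sk_forward_alloc)
 */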
2734 
2735 /**
2736  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2737  *	@sk: socket
2738  *	@amount: number of quanta
2739  *
2740  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2741  */
2742 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2743 {
2744 	sk_memory_allocated_sub(sk, amount);
2745 
2746 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2747 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2748 
2749 	if (sk_under_global_memory_pressure(sk) &&
2750 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2751 		sk_leave_memory_pressure(sk);
2752 }
2753 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2754 
2755 /**
2756  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2757  *	@sk: socket
2758  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2759  */
2760 void __sk_mem_reclaim(struct sock *sk, int amount)
2761 {
2762 	amount >>= SK_MEM_QUANTUM_SHIFT;
2763 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2764 	__sk_mem_reduce_allocated(sk, amount);
2765 }
2766 EXPORT_SYMBOL(__sk_mem_reclaim);
2767 
2768 int sk_set_peek_off(struct sock *sk, int val)
2769 {
2770 	WRITE_ONCE(sk->sk_peek_off, val);
2771 	return 0;
2772 }
2773 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2774 
2775 /*
2776  * Set of default routines for initialising struct proto_ops when
2777  * the protocol does not support a particular function. In certain
2778  * cases where it makes no sense for a protocol to have a "do nothing"
2779  * function, some default processing is provided.
2780  */
2781 
2782 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2783 {
2784 	return -EOPNOTSUPP;
2785 }
2786 EXPORT_SYMBOL(sock_no_bind);
2787 
2788 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2789 		    int len, int flags)
2790 {
2791 	return -EOPNOTSUPP;
2792 }
2793 EXPORT_SYMBOL(sock_no_connect);
2794 
2795 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2796 {
2797 	return -EOPNOTSUPP;
2798 }
2799 EXPORT_SYMBOL(sock_no_socketpair);
2800 
2801 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2802 		   bool kern)
2803 {
2804 	return -EOPNOTSUPP;
2805 }
2806 EXPORT_SYMBOL(sock_no_accept);
2807 
2808 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2809 		    int peer)
2810 {
2811 	return -EOPNOTSUPP;
2812 }
2813 EXPORT_SYMBOL(sock_no_getname);
2814 
2815 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2816 {
2817 	return -EOPNOTSUPP;
2818 }
2819 EXPORT_SYMBOL(sock_no_ioctl);
2820 
2821 int sock_no_listen(struct socket *sock, int backlog)
2822 {
2823 	return -EOPNOTSUPP;
2824 }
2825 EXPORT_SYMBOL(sock_no_listen);
2826 
2827 int sock_no_shutdown(struct socket *sock, int how)
2828 {
2829 	return -EOPNOTSUPP;
2830 }
2831 EXPORT_SYMBOL(sock_no_shutdown);
2832 
2833 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2834 {
2835 	return -EOPNOTSUPP;
2836 }
2837 EXPORT_SYMBOL(sock_no_sendmsg);
2838 
2839 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2840 {
2841 	return -EOPNOTSUPP;
2842 }
2843 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2844 
2845 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2846 		    int flags)
2847 {
2848 	return -EOPNOTSUPP;
2849 }
2850 EXPORT_SYMBOL(sock_no_recvmsg);
2851 
2852 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2853 {
2854 	/* Mirror missing mmap method error code */
2855 	return -ENODEV;
2856 }
2857 EXPORT_SYMBOL(sock_no_mmap);
2858 
2859 /*
2860  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2861  * various sock-based usage counts.
2862  */
2863 void __receive_sock(struct file *file)
2864 {
2865 	struct socket *sock;
2866 	int error;
2867 
2868 	/*
2869 	 * The resulting value of "error" is ignored here since we only
2870 	 * need to take action when the file is a socket and testing
2871 	 * "sock" for NULL is sufficient.
2872 	 */
2873 	sock = sock_from_file(file, &error);
2874 	if (sock) {
2875 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2876 		sock_update_classid(&sock->sk->sk_cgrp_data);
2877 	}
2878 }
2879 
2880 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2881 {
2882 	ssize_t res;
2883 	struct msghdr msg = {.msg_flags = flags};
2884 	struct kvec iov;
2885 	char *kaddr = kmap(page);
2886 	iov.iov_base = kaddr + offset;
2887 	iov.iov_len = size;
2888 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2889 	kunmap(page);
2890 	return res;
2891 }
2892 EXPORT_SYMBOL(sock_no_sendpage);
2893 
2894 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2895 				int offset, size_t size, int flags)
2896 {
2897 	ssize_t res;
2898 	struct msghdr msg = {.msg_flags = flags};
2899 	struct kvec iov;
2900 	char *kaddr = kmap(page);
2901 
2902 	iov.iov_base = kaddr + offset;
2903 	iov.iov_len = size;
2904 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2905 	kunmap(page);
2906 	return res;
2907 }
2908 EXPORT_SYMBOL(sock_no_sendpage_locked);
2909 
2910 /*
2911  *	Default Socket Callbacks
2912  */
2913 
2914 static void sock_def_wakeup(struct sock *sk)
2915 {
2916 	struct socket_wq *wq;
2917 
2918 	rcu_read_lock();
2919 	wq = rcu_dereference(sk->sk_wq);
2920 	if (skwq_has_sleeper(wq))
2921 		wake_up_interruptible_all(&wq->wait);
2922 	rcu_read_unlock();
2923 }
2924 
2925 static void sock_def_error_report(struct sock *sk)
2926 {
2927 	struct socket_wq *wq;
2928 
2929 	rcu_read_lock();
2930 	wq = rcu_dereference(sk->sk_wq);
2931 	if (skwq_has_sleeper(wq))
2932 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2933 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2934 	rcu_read_unlock();
2935 }
2936 
2937 void sock_def_readable(struct sock *sk)
2938 {
2939 	struct socket_wq *wq;
2940 
2941 	rcu_read_lock();
2942 	wq = rcu_dereference(sk->sk_wq);
2943 
2944 	if (skwq_has_sleeper(wq)) {
2945 		int done = 0;
2946 
2947 		trace_android_vh_do_wake_up_sync(&wq->wait, &done);
2948 		if (done)
2949 			goto out;
2950 
2951 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2952 						EPOLLRDNORM | EPOLLRDBAND);
2953 	}
2954 
2955 out:
2956 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2957 	rcu_read_unlock();
2958 }
2959 
2960 static void sock_def_write_space(struct sock *sk)
2961 {
2962 	struct socket_wq *wq;
2963 
2964 	rcu_read_lock();
2965 
2966 	/* Do not wake up a writer until he can make "significant"
2967 	 * progress.  --DaveM
2968 	 */
2969 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2970 		wq = rcu_dereference(sk->sk_wq);
2971 		if (skwq_has_sleeper(wq))
2972 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2973 						EPOLLWRNORM | EPOLLWRBAND);
2974 
2975 		/* Should agree with poll, otherwise some programs break */
2976 		if (sock_writeable(sk))
2977 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2978 	}
2979 
2980 	rcu_read_unlock();
2981 }
2982 
2983 static void sock_def_destruct(struct sock *sk)
2984 {
2985 }
2986 
2987 void sk_send_sigurg(struct sock *sk)
2988 {
2989 	if (sk->sk_socket && sk->sk_socket->file)
2990 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2991 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2992 }
2993 EXPORT_SYMBOL(sk_send_sigurg);
2994 
2995 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2996 		    unsigned long expires)
2997 {
2998 	if (!mod_timer(timer, expires))
2999 		sock_hold(sk);
3000 }
3001 EXPORT_SYMBOL(sk_reset_timer);
3002 
3003 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
3004 {
3005 	if (del_timer(timer))
3006 		__sock_put(sk);
3007 }
3008 EXPORT_SYMBOL(sk_stop_timer);
3009 
3010 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3011 {
3012 	if (del_timer_sync(timer))
3013 		__sock_put(sk);
3014 }
3015 EXPORT_SYMBOL(sk_stop_timer_sync);
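/*
 * Editor's note -- sketch of the reference-counted timer pairing above;
 * "delay" is a placeholder, and the timer callback is expected to call
 * sock_put() when it is done with the socket.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);	(holds sk unless already pending)
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);			(drops the hold if it was pending)
 */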
3016 
3017 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3018 {
3019 	sk_init_common(sk);
3020 	sk->sk_send_head	=	NULL;
3021 
3022 	timer_setup(&sk->sk_timer, NULL, 0);
3023 
3024 	sk->sk_allocation	=	GFP_KERNEL;
3025 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3026 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3027 	sk->sk_state		=	TCP_CLOSE;
3028 	sk_set_socket(sk, sock);
3029 
3030 	sock_set_flag(sk, SOCK_ZAPPED);
3031 
3032 	if (sock) {
3033 		sk->sk_type	=	sock->type;
3034 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3035 		sock->sk	=	sk;
3036 	} else {
3037 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3038 	}
3039 	sk->sk_uid	=	uid;
3040 
3041 	rwlock_init(&sk->sk_callback_lock);
3042 	if (sk->sk_kern_sock)
3043 		lockdep_set_class_and_name(
3044 			&sk->sk_callback_lock,
3045 			af_kern_callback_keys + sk->sk_family,
3046 			af_family_kern_clock_key_strings[sk->sk_family]);
3047 	else
3048 		lockdep_set_class_and_name(
3049 			&sk->sk_callback_lock,
3050 			af_callback_keys + sk->sk_family,
3051 			af_family_clock_key_strings[sk->sk_family]);
3052 
3053 	sk->sk_state_change	=	sock_def_wakeup;
3054 	sk->sk_data_ready	=	sock_def_readable;
3055 	sk->sk_write_space	=	sock_def_write_space;
3056 	sk->sk_error_report	=	sock_def_error_report;
3057 	sk->sk_destruct		=	sock_def_destruct;
3058 
3059 	sk->sk_frag.page	=	NULL;
3060 	sk->sk_frag.offset	=	0;
3061 	sk->sk_peek_off		=	-1;
3062 
3063 	sk->sk_peer_pid 	=	NULL;
3064 	sk->sk_peer_cred	=	NULL;
3065 	spin_lock_init(&sk->sk_peer_lock);
3066 
3067 	sk->sk_write_pending	=	0;
3068 	sk->sk_rcvlowat		=	1;
3069 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3070 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3071 
3072 	sk->sk_stamp = SK_DEFAULT_STAMP;
3073 #if BITS_PER_LONG==32
3074 	seqlock_init(&sk->sk_stamp_seq);
3075 #endif
3076 	atomic_set(&sk->sk_zckey, 0);
3077 
3078 #ifdef CONFIG_NET_RX_BUSY_POLL
3079 	sk->sk_napi_id		=	0;
3080 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3081 #endif
3082 
3083 	sk->sk_max_pacing_rate = ~0UL;
3084 	sk->sk_pacing_rate = ~0UL;
3085 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3086 	sk->sk_incoming_cpu = -1;
3087 
3088 	sk_rx_queue_clear(sk);
3089 	/*
3090 	 * Before updating sk_refcnt, we must commit prior changes to memory
3091 	 * (Documentation/RCU/rculist_nulls.rst for details)
3092 	 */
3093 	smp_wmb();
3094 	refcount_set(&sk->sk_refcnt, 1);
3095 	atomic_set(&sk->sk_drops, 0);
3096 }
3097 EXPORT_SYMBOL(sock_init_data_uid);
3098 
3099 void sock_init_data(struct socket *sock, struct sock *sk)
3100 {
3101 	kuid_t uid = sock ?
3102 		SOCK_INODE(sock)->i_uid :
3103 		make_kuid(sock_net(sk)->user_ns, 0);
3104 
3105 	sock_init_data_uid(sock, sk, uid);
3106 }
3107 EXPORT_SYMBOL(sock_init_data);
3108 
3109 void lock_sock_nested(struct sock *sk, int subclass)
3110 {
3111 	might_sleep();
3112 	spin_lock_bh(&sk->sk_lock.slock);
3113 	if (sk->sk_lock.owned)
3114 		__lock_sock(sk);
3115 	sk->sk_lock.owned = 1;
3116 	spin_unlock(&sk->sk_lock.slock);
3117 	/*
3118 	 * The sk_lock has mutex_lock() semantics here:
3119 	 */
3120 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3121 	local_bh_enable();
3122 }
3123 EXPORT_SYMBOL(lock_sock_nested);
3124 
3125 void release_sock(struct sock *sk)
3126 {
3127 	spin_lock_bh(&sk->sk_lock.slock);
3128 	if (sk->sk_backlog.tail)
3129 		__release_sock(sk);
3130 
3131 	/* Warning : release_cb() might need to release sk ownership,
3132 	 * ie call sock_release_ownership(sk) before us.
3133 	 */
3134 	if (sk->sk_prot->release_cb)
3135 		sk->sk_prot->release_cb(sk);
3136 
3137 	sock_release_ownership(sk);
3138 	if (waitqueue_active(&sk->sk_lock.wq))
3139 		wake_up(&sk->sk_lock.wq);
3140 	spin_unlock_bh(&sk->sk_lock.slock);
3141 }
3142 EXPORT_SYMBOL(release_sock);
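/*
 * Editor's note -- minimal sketch of the ownership protocol implemented by
 * lock_sock()/release_sock():
 *
 *	lock_sock(sk);
 *	... process-context work; softirq input is queued to sk->sk_backlog ...
 *	release_sock(sk);	(flushes the backlog via __release_sock())
 */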
3143 
3144 /**
3145  * lock_sock_fast - fast version of lock_sock
3146  * @sk: socket
3147  *
3148  * This version should be used for very small sections, where the process won't block.
3149  * Returns false if the fast path is taken:
3150  *
3151  *   sk_lock.slock locked, owned = 0, BH disabled
3152  *
3153  * Returns true if the slow path is taken:
3154  *
3155  *   sk_lock.slock unlocked, owned = 1, BH enabled
3156  */
3157 bool lock_sock_fast(struct sock *sk)
3158 {
3159 	might_sleep();
3160 	spin_lock_bh(&sk->sk_lock.slock);
3161 
3162 	if (!sk->sk_lock.owned)
3163 		/*
3164 		 * Note : We must disable BH
3165 		 */
3166 		return false;
3167 
3168 	__lock_sock(sk);
3169 	sk->sk_lock.owned = 1;
3170 	spin_unlock(&sk->sk_lock.slock);
3171 	/*
3172 	 * The sk_lock has mutex_lock() semantics here:
3173 	 */
3174 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3175 	local_bh_enable();
3176 	return true;
3177 }
3178 EXPORT_SYMBOL(lock_sock_fast);
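/*
 * Editor's note -- sketch of the intended pairing with unlock_sock_fast():
 *
 *	bool slow = lock_sock_fast(sk);
 *	... very short critical section ...
 *	unlock_sock_fast(sk, slow);
 */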
3179 
3180 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3181 		   bool timeval, bool time32)
3182 {
3183 	struct sock *sk = sock->sk;
3184 	struct timespec64 ts;
3185 
3186 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3187 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3188 	if (ts.tv_sec == -1)
3189 		return -ENOENT;
3190 	if (ts.tv_sec == 0) {
3191 		ktime_t kt = ktime_get_real();
3192 		sock_write_timestamp(sk, kt);
3193 		ts = ktime_to_timespec64(kt);
3194 	}
3195 
3196 	if (timeval)
3197 		ts.tv_nsec /= 1000;
3198 
3199 #ifdef CONFIG_COMPAT_32BIT_TIME
3200 	if (time32)
3201 		return put_old_timespec32(&ts, userstamp);
3202 #endif
3203 #ifdef CONFIG_SPARC64
3204 	/* beware of padding in sparc64 timeval */
3205 	if (timeval && !in_compat_syscall()) {
3206 		struct __kernel_old_timeval __user tv = {
3207 			.tv_sec = ts.tv_sec,
3208 			.tv_usec = ts.tv_nsec,
3209 		};
3210 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3211 			return -EFAULT;
3212 		return 0;
3213 	}
3214 #endif
3215 	return put_timespec64(&ts, userstamp);
3216 }
3217 EXPORT_SYMBOL(sock_gettstamp);
3218 
3219 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3220 {
3221 	if (!sock_flag(sk, flag)) {
3222 		unsigned long previous_flags = sk->sk_flags;
3223 
3224 		sock_set_flag(sk, flag);
3225 		/*
3226 		 * we just set one of the two flags which require net
3227 		 * time stamping, but time stamping might have been on
3228 		 * already because of the other one
3229 		 */
3230 		if (sock_needs_netstamp(sk) &&
3231 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3232 			net_enable_timestamp();
3233 	}
3234 }
3235 
3236 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3237 		       int level, int type)
3238 {
3239 	struct sock_exterr_skb *serr;
3240 	struct sk_buff *skb;
3241 	int copied, err;
3242 
3243 	err = -EAGAIN;
3244 	skb = sock_dequeue_err_skb(sk);
3245 	if (skb == NULL)
3246 		goto out;
3247 
3248 	copied = skb->len;
3249 	if (copied > len) {
3250 		msg->msg_flags |= MSG_TRUNC;
3251 		copied = len;
3252 	}
3253 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3254 	if (err)
3255 		goto out_free_skb;
3256 
3257 	sock_recv_timestamp(msg, sk, skb);
3258 
3259 	serr = SKB_EXT_ERR(skb);
3260 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3261 
3262 	msg->msg_flags |= MSG_ERRQUEUE;
3263 	err = copied;
3264 
3265 out_free_skb:
3266 	kfree_skb(skb);
3267 out:
3268 	return err;
3269 }
3270 EXPORT_SYMBOL(sock_recv_errqueue);
3271 
3272 /*
3273  *	Get a socket option on a socket.
3274  *
3275  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3276  *	asynchronous errors should be reported by getsockopt. We assume
3277  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3278  */
3279 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3280 			   char __user *optval, int __user *optlen)
3281 {
3282 	struct sock *sk = sock->sk;
3283 
3284 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3285 }
3286 EXPORT_SYMBOL(sock_common_getsockopt);
3287 
3288 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3289 			int flags)
3290 {
3291 	struct sock *sk = sock->sk;
3292 	int addr_len = 0;
3293 	int err;
3294 
3295 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3296 				   flags & ~MSG_DONTWAIT, &addr_len);
3297 	if (err >= 0)
3298 		msg->msg_namelen = addr_len;
3299 	return err;
3300 }
3301 EXPORT_SYMBOL(sock_common_recvmsg);
3302 
3303 /*
3304  *	Set socket options on an inet socket.
3305  */
3306 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3307 			   sockptr_t optval, unsigned int optlen)
3308 {
3309 	struct sock *sk = sock->sk;
3310 
3311 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3312 }
3313 EXPORT_SYMBOL(sock_common_setsockopt);
3314 
3315 void sk_common_release(struct sock *sk)
3316 {
3317 	if (sk->sk_prot->destroy)
3318 		sk->sk_prot->destroy(sk);
3319 
3320 	/*
3321 	 * Observation: when sk_common_release is called, processes have
3322 	 * no access to the socket, but the network stack still does.
3323 	 * Step one, detach it from networking:
3324 	 *
3325 	 * A. Remove from hash tables.
3326 	 */
3327 
3328 	sk->sk_prot->unhash(sk);
3329 
3330 	/*
3331 	 * At this point the socket cannot receive new packets, but it is
3332 	 * possible that some packets are still in flight because some CPU ran
3333 	 * the receiver and did a hash table lookup before we unhashed the
3334 	 * socket. They will reach the receive queue and be purged by the socket destructor.
3335 	 *
3336 	 * We also still have packets pending on the receive queue and, probably,
3337 	 * our own packets waiting in device queues. sock_destroy will drain the
3338 	 * receive queue, but transmitted packets will delay socket destruction
3339 	 * until the last reference is released.
3340 	 */
3341 
3342 	sock_orphan(sk);
3343 
3344 	xfrm_sk_free_policy(sk);
3345 
3346 	sk_refcnt_debug_release(sk);
3347 
3348 	sock_put(sk);
3349 }
3350 EXPORT_SYMBOL(sk_common_release);
3351 
3352 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3353 {
3354 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3355 
3356 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3357 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3358 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3359 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3360 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3361 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3362 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3363 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3364 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3365 }
3366 
3367 #ifdef CONFIG_PROC_FS
3368 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3369 struct prot_inuse {
3370 	int val[PROTO_INUSE_NR];
3371 };
3372 
3373 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3374 
3375 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3376 {
3377 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3378 }
3379 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3380 
3381 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3382 {
3383 	int cpu, idx = prot->inuse_idx;
3384 	int res = 0;
3385 
3386 	for_each_possible_cpu(cpu)
3387 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3388 
3389 	return res >= 0 ? res : 0;
3390 }
3391 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3392 
3393 static void sock_inuse_add(struct net *net, int val)
3394 {
3395 	this_cpu_add(*net->core.sock_inuse, val);
3396 }
3397 
3398 int sock_inuse_get(struct net *net)
3399 {
3400 	int cpu, res = 0;
3401 
3402 	for_each_possible_cpu(cpu)
3403 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3404 
3405 	return res;
3406 }
3407 
3408 EXPORT_SYMBOL_GPL(sock_inuse_get);
3409 
3410 static int __net_init sock_inuse_init_net(struct net *net)
3411 {
3412 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3413 	if (net->core.prot_inuse == NULL)
3414 		return -ENOMEM;
3415 
3416 	net->core.sock_inuse = alloc_percpu(int);
3417 	if (net->core.sock_inuse == NULL)
3418 		goto out;
3419 
3420 	return 0;
3421 
3422 out:
3423 	free_percpu(net->core.prot_inuse);
3424 	return -ENOMEM;
3425 }
3426 
3427 static void __net_exit sock_inuse_exit_net(struct net *net)
3428 {
3429 	free_percpu(net->core.prot_inuse);
3430 	free_percpu(net->core.sock_inuse);
3431 }
3432 
3433 static struct pernet_operations net_inuse_ops = {
3434 	.init = sock_inuse_init_net,
3435 	.exit = sock_inuse_exit_net,
3436 };
3437 
3438 static __init int net_inuse_init(void)
3439 {
3440 	if (register_pernet_subsys(&net_inuse_ops))
3441 		panic("Cannot initialize net inuse counters");
3442 
3443 	return 0;
3444 }
3445 
3446 core_initcall(net_inuse_init);
3447 
3448 static int assign_proto_idx(struct proto *prot)
3449 {
3450 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3451 
3452 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3453 		pr_err("PROTO_INUSE_NR exhausted\n");
3454 		return -ENOSPC;
3455 	}
3456 
3457 	set_bit(prot->inuse_idx, proto_inuse_idx);
3458 	return 0;
3459 }
3460 
3461 static void release_proto_idx(struct proto *prot)
3462 {
3463 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3464 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3465 }
3466 #else
3467 static inline int assign_proto_idx(struct proto *prot)
3468 {
3469 	return 0;
3470 }
3471 
3472 static inline void release_proto_idx(struct proto *prot)
3473 {
3474 }
3475 
3476 static void sock_inuse_add(struct net *net, int val)
3477 {
3478 }
3479 #endif
3480 
3481 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3482 {
3483 	if (!twsk_prot)
3484 		return;
3485 	kfree(twsk_prot->twsk_slab_name);
3486 	twsk_prot->twsk_slab_name = NULL;
3487 	kmem_cache_destroy(twsk_prot->twsk_slab);
3488 	twsk_prot->twsk_slab = NULL;
3489 }
3490 
3491 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3492 {
3493 	if (!rsk_prot)
3494 		return;
3495 	kfree(rsk_prot->slab_name);
3496 	rsk_prot->slab_name = NULL;
3497 	kmem_cache_destroy(rsk_prot->slab);
3498 	rsk_prot->slab = NULL;
3499 }
3500 
3501 static int req_prot_init(const struct proto *prot)
3502 {
3503 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3504 
3505 	if (!rsk_prot)
3506 		return 0;
3507 
3508 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3509 					prot->name);
3510 	if (!rsk_prot->slab_name)
3511 		return -ENOMEM;
3512 
3513 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3514 					   rsk_prot->obj_size, 0,
3515 					   SLAB_ACCOUNT | prot->slab_flags,
3516 					   NULL);
3517 
3518 	if (!rsk_prot->slab) {
3519 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3520 			prot->name);
3521 		return -ENOMEM;
3522 	}
3523 	return 0;
3524 }
3525 
3526 int proto_register(struct proto *prot, int alloc_slab)
3527 {
3528 	int ret = -ENOBUFS;
3529 
3530 	if (alloc_slab) {
3531 		prot->slab = kmem_cache_create_usercopy(prot->name,
3532 					prot->obj_size, 0,
3533 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3534 					prot->slab_flags,
3535 					prot->useroffset, prot->usersize,
3536 					NULL);
3537 
3538 		if (prot->slab == NULL) {
3539 			pr_crit("%s: Can't create sock SLAB cache!\n",
3540 				prot->name);
3541 			goto out;
3542 		}
3543 
3544 		if (req_prot_init(prot))
3545 			goto out_free_request_sock_slab;
3546 
3547 		if (prot->twsk_prot != NULL) {
3548 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3549 
3550 			if (prot->twsk_prot->twsk_slab_name == NULL)
3551 				goto out_free_request_sock_slab;
3552 
3553 			prot->twsk_prot->twsk_slab =
3554 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3555 						  prot->twsk_prot->twsk_obj_size,
3556 						  0,
3557 						  SLAB_ACCOUNT |
3558 						  prot->slab_flags,
3559 						  NULL);
3560 			if (prot->twsk_prot->twsk_slab == NULL)
3561 				goto out_free_timewait_sock_slab;
3562 		}
3563 	}
3564 
3565 	mutex_lock(&proto_list_mutex);
3566 	ret = assign_proto_idx(prot);
3567 	if (ret) {
3568 		mutex_unlock(&proto_list_mutex);
3569 		goto out_free_timewait_sock_slab;
3570 	}
3571 	list_add(&prot->node, &proto_list);
3572 	mutex_unlock(&proto_list_mutex);
3573 	return ret;
3574 
3575 out_free_timewait_sock_slab:
3576 	if (alloc_slab && prot->twsk_prot)
3577 		tw_prot_cleanup(prot->twsk_prot);
3578 out_free_request_sock_slab:
3579 	if (alloc_slab) {
3580 		req_prot_cleanup(prot->rsk_prot);
3581 
3582 		kmem_cache_destroy(prot->slab);
3583 		prot->slab = NULL;
3584 	}
3585 out:
3586 	return ret;
3587 }
3588 EXPORT_SYMBOL(proto_register);
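/*
 * Editor's note -- hedged sketch of a protocol registering itself; the
 * "my_proto"/"struct my_sock" names are hypothetical.
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);	(1 => allocate a slab cache)
 *	...
 *	proto_unregister(&my_proto);
 */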
3589 
3590 void proto_unregister(struct proto *prot)
3591 {
3592 	mutex_lock(&proto_list_mutex);
3593 	release_proto_idx(prot);
3594 	list_del(&prot->node);
3595 	mutex_unlock(&proto_list_mutex);
3596 
3597 	kmem_cache_destroy(prot->slab);
3598 	prot->slab = NULL;
3599 
3600 	req_prot_cleanup(prot->rsk_prot);
3601 	tw_prot_cleanup(prot->twsk_prot);
3602 }
3603 EXPORT_SYMBOL(proto_unregister);
3604 
3605 int sock_load_diag_module(int family, int protocol)
3606 {
3607 	if (!protocol) {
3608 		if (!sock_is_registered(family))
3609 			return -ENOENT;
3610 
3611 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3612 				      NETLINK_SOCK_DIAG, family);
3613 	}
3614 
3615 #ifdef CONFIG_INET
3616 	if (family == AF_INET &&
3617 	    protocol != IPPROTO_RAW &&
3618 	    protocol < MAX_INET_PROTOS &&
3619 	    !rcu_access_pointer(inet_protos[protocol]))
3620 		return -ENOENT;
3621 #endif
3622 
3623 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3624 			      NETLINK_SOCK_DIAG, family, protocol);
3625 }
3626 EXPORT_SYMBOL(sock_load_diag_module);
3627 
3628 #ifdef CONFIG_PROC_FS
3629 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3630 	__acquires(proto_list_mutex)
3631 {
3632 	mutex_lock(&proto_list_mutex);
3633 	return seq_list_start_head(&proto_list, *pos);
3634 }
3635 
3636 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3637 {
3638 	return seq_list_next(v, &proto_list, pos);
3639 }
3640 
3641 static void proto_seq_stop(struct seq_file *seq, void *v)
3642 	__releases(proto_list_mutex)
3643 {
3644 	mutex_unlock(&proto_list_mutex);
3645 }
3646 
3647 static char proto_method_implemented(const void *method)
3648 {
3649 	return method == NULL ? 'n' : 'y';
3650 }
3651 static long sock_prot_memory_allocated(struct proto *proto)
3652 {
3653 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3654 }
3655 
3656 static const char *sock_prot_memory_pressure(struct proto *proto)
3657 {
3658 	return proto->memory_pressure != NULL ?
3659 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3660 }
3661 
proto_seq_printf(struct seq_file * seq,struct proto * proto)3662 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3663 {
3664 
3665 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3666 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3667 		   proto->name,
3668 		   proto->obj_size,
3669 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3670 		   sock_prot_memory_allocated(proto),
3671 		   sock_prot_memory_pressure(proto),
3672 		   proto->max_header,
3673 		   proto->slab == NULL ? "no" : "yes",
3674 		   module_name(proto->owner),
3675 		   proto_method_implemented(proto->close),
3676 		   proto_method_implemented(proto->connect),
3677 		   proto_method_implemented(proto->disconnect),
3678 		   proto_method_implemented(proto->accept),
3679 		   proto_method_implemented(proto->ioctl),
3680 		   proto_method_implemented(proto->init),
3681 		   proto_method_implemented(proto->destroy),
3682 		   proto_method_implemented(proto->shutdown),
3683 		   proto_method_implemented(proto->setsockopt),
3684 		   proto_method_implemented(proto->getsockopt),
3685 		   proto_method_implemented(proto->sendmsg),
3686 		   proto_method_implemented(proto->recvmsg),
3687 		   proto_method_implemented(proto->sendpage),
3688 		   proto_method_implemented(proto->bind),
3689 		   proto_method_implemented(proto->backlog_rcv),
3690 		   proto_method_implemented(proto->hash),
3691 		   proto_method_implemented(proto->unhash),
3692 		   proto_method_implemented(proto->get_port),
3693 		   proto_method_implemented(proto->enter_memory_pressure));
3694 }
3695 
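/*
 * Column key for the header printed below: the single-character fields
 * report 'y'/'n' for each implemented method, in this order --
 * cl=close co=connect di=disconnect ac=accept io=ioctl in=init de=destroy
 * sh=shutdown ss=setsockopt gs=getsockopt se=sendmsg re=recvmsg sp=sendpage
 * bi=bind br=backlog_rcv ha=hash uh=unhash gp=get_port
 * em=enter_memory_pressure.  "memory" reads -1 and "press" reads "NI" for
 * protocols that do not implement memory accounting or memory-pressure
 * tracking.
 */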
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
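/*
 * sk_busy_loop_end - loop-termination predicate for socket busy polling.
 * Returns true once the busy loop should stop: either packets have landed
 * on sk_receive_queue or the socket's busy-poll time budget (see
 * sk_busy_loop_timeout()) has expired.  It is typically handed to
 * napi_busy_loop() as the loop_end callback by sk_busy_loop().
 */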
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

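/*
 * sock_bind_add - bind an additional local address to @sk.
 * Forwards to the protocol's ->bind_add() handler (SCTP, for instance,
 * provides one) and returns -EOPNOTSUPP when the protocol does not
 * support adding addresses.
 */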
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);