1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 
139 #include <trace/events/sock.h>
140 
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had the capability @cap when the
154  * socket was created and if the current process has the capability @cap in
155  * the user namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had the capability @cap when the
171  * socket was created and if the current process has the capability @cap in
172  * all user namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had the capability @cap when the
186  * socket was created and if the current process has the capability @cap over
187  * the network namespace the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194 
195 /*
196  * Each address family might have different locking rules, so we have
197  * one slock key per address family:
198  */
199 static struct lock_class_key af_family_keys[AF_MAX];
200 static struct lock_class_key af_family_slock_keys[AF_MAX];
201 
202 /*
203  * Make lock validator output more readable. (we pre-construct these
204  * strings build-time, so that runtime initialization of socket
205  * locks is fast):
206  */
207 static const char *const af_family_key_strings[AF_MAX+1] = {
208   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
209   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
210   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
211   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
212   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
213   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
214   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
215   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
216   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
217   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
218   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
219   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
220   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
221   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
222   "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
223 };
224 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
225   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
226   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
227   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
228   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
229   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
230   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
231   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
232   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
233   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
234   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
235   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
236   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
237   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
238   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
239   "slock-AF_QIPCRTR", "slock-AF_MAX"
240 };
241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
243   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
244   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
245   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
246   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
247   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
248   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
249   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
250   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
251   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
252   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
253   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
254   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
255   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
256   "clock-AF_QIPCRTR", "clock-AF_MAX"
257 };
258 
259 /*
260  * sk_callback_lock locking rules are per-address-family,
261  * so split the lock classes by using a per-AF key:
262  */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264 
265 /* Take into consideration the size of the struct sk_buff overhead in the
266  * determination of these values, since that is non-constant across
267  * platforms.  This makes socket queueing behavior and performance
268  * not depend upon such differences.
269  */
270 #define _SK_MEM_PACKETS		256
271 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
272 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
273 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
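/* Worked example (illustrative): with the macros above, SK_WMEM_MAX and
 * SK_RMEM_MAX each equal SKB_TRUESIZE(256) * 256, i.e. room for roughly 256
 * small packets of 256 bytes of payload plus per-skb metadata overhead.  The
 * exact byte value depends on sizeof(struct sk_buff) and
 * sizeof(struct skb_shared_info) on the build, which is why the defaults are
 * expressed this way rather than as fixed numbers.
 */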
274 
275 /* Run time adjustable parameters. */
276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
277 EXPORT_SYMBOL(sysctl_wmem_max);
278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
279 EXPORT_SYMBOL(sysctl_rmem_max);
280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
282 
283 /* Maximal space eaten by iovec or ancillary data plus some space */
284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
285 EXPORT_SYMBOL(sysctl_optmem_max);
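/* Illustrative arithmetic: on a 64-bit build with UIO_MAXIOV == 1024 the
 * default above works out to 8 * (2 * 1024 + 512) = 20480 bytes; the value
 * scales with sizeof(unsigned long) on other architectures.
 */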
286 
287 int sysctl_tstamp_allow_data __read_mostly = 1;
288 
289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290 EXPORT_SYMBOL_GPL(memalloc_socks);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
 298  * to meet the requirements.
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_key_slow_inc(&memalloc_socks);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_key_slow_dec(&memalloc_socks);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned long pflags = current->flags;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	current->flags |= PF_MEMALLOC;
334 	ret = sk->sk_backlog_rcv(sk, skb);
335 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
336 
337 	return ret;
338 }
339 EXPORT_SYMBOL(__sk_backlog_rcv);
340 
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
342 {
343 	struct timeval tv;
344 
345 	if (optlen < sizeof(tv))
346 		return -EINVAL;
347 	if (copy_from_user(&tv, optval, sizeof(tv)))
348 		return -EFAULT;
349 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 		return -EDOM;
351 
352 	if (tv.tv_sec < 0) {
353 		static int warned __read_mostly;
354 
355 		*timeo_p = 0;
356 		if (warned < 10 && net_ratelimit()) {
357 			warned++;
358 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 				__func__, current->comm, task_pid_nr(current));
360 		}
361 		return 0;
362 	}
363 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
364 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 		return 0;
366 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 	return 0;
369 }
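/* Illustrative example of the conversion above: a userspace
 * setsockopt(SO_RCVTIMEO) of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * HZ + 500000 / (1000000 / HZ) jiffies, with the microseconds rounded up;
 * assuming HZ == 250 (an assumption for this example) that is 625 jiffies.
 */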
370 
371 static void sock_warn_obsolete_bsdism(const char *name)
372 {
373 	static int warned;
374 	static char warncomm[TASK_COMM_LEN];
375 	if (strcmp(warncomm, current->comm) && warned < 5) {
376 		strcpy(warncomm,  current->comm);
377 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 			warncomm, name);
379 		warned++;
380 	}
381 }
382 
383 static bool sock_needs_netstamp(const struct sock *sk)
384 {
385 	switch (sk->sk_family) {
386 	case AF_UNSPEC:
387 	case AF_UNIX:
388 		return false;
389 	default:
390 		return true;
391 	}
392 }
393 
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
395 {
396 	if (sk->sk_flags & flags) {
397 		sk->sk_flags &= ~flags;
398 		if (sock_needs_netstamp(sk) &&
399 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 			net_disable_timestamp();
401 	}
402 }
403 
404 
405 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
406 {
407 	unsigned long flags;
408 	struct sk_buff_head *list = &sk->sk_receive_queue;
409 
410 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
411 		atomic_inc(&sk->sk_drops);
412 		trace_sock_rcvqueue_full(sk, skb);
413 		return -ENOMEM;
414 	}
415 
416 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
417 		atomic_inc(&sk->sk_drops);
418 		return -ENOBUFS;
419 	}
420 
421 	skb->dev = NULL;
422 	skb_set_owner_r(skb, sk);
423 
424 	/* we escape from the RCU-protected region, make sure we don't leak
425 	 * a non-refcounted dst
426 	 */
427 	skb_dst_force(skb);
428 
429 	spin_lock_irqsave(&list->lock, flags);
430 	sock_skb_set_dropcount(sk, skb);
431 	__skb_queue_tail(list, skb);
432 	spin_unlock_irqrestore(&list->lock, flags);
433 
434 	if (!sock_flag(sk, SOCK_DEAD))
435 		sk->sk_data_ready(sk);
436 	return 0;
437 }
438 EXPORT_SYMBOL(__sock_queue_rcv_skb);
439 
440 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
441 {
442 	int err;
443 
444 	err = sk_filter(sk, skb);
445 	if (err)
446 		return err;
447 
448 	return __sock_queue_rcv_skb(sk, skb);
449 }
450 EXPORT_SYMBOL(sock_queue_rcv_skb);
451 
452 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
453 		     const int nested, unsigned int trim_cap, bool refcounted)
454 {
455 	int rc = NET_RX_SUCCESS;
456 
457 	if (sk_filter_trim_cap(sk, skb, trim_cap))
458 		goto discard_and_relse;
459 
460 	skb->dev = NULL;
461 
462 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
463 		atomic_inc(&sk->sk_drops);
464 		goto discard_and_relse;
465 	}
466 	if (nested)
467 		bh_lock_sock_nested(sk);
468 	else
469 		bh_lock_sock(sk);
470 	if (!sock_owned_by_user(sk)) {
471 		/*
472 		 * trylock + unlock semantics:
473 		 */
474 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
475 
476 		rc = sk_backlog_rcv(sk, skb);
477 
478 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
479 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
480 		bh_unlock_sock(sk);
481 		atomic_inc(&sk->sk_drops);
482 		goto discard_and_relse;
483 	}
484 
485 	bh_unlock_sock(sk);
486 out:
487 	if (refcounted)
488 		sock_put(sk);
489 	return rc;
490 discard_and_relse:
491 	kfree_skb(skb);
492 	goto out;
493 }
494 EXPORT_SYMBOL(__sk_receive_skb);
495 
496 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
497 {
498 	struct dst_entry *dst = __sk_dst_get(sk);
499 
500 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501 		sk_tx_queue_clear(sk);
502 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
503 		dst_release(dst);
504 		return NULL;
505 	}
506 
507 	return dst;
508 }
509 EXPORT_SYMBOL(__sk_dst_check);
510 
511 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
512 {
513 	struct dst_entry *dst = sk_dst_get(sk);
514 
515 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
516 		sk_dst_reset(sk);
517 		dst_release(dst);
518 		return NULL;
519 	}
520 
521 	return dst;
522 }
523 EXPORT_SYMBOL(sk_dst_check);
524 
525 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
526 				int optlen)
527 {
528 	int ret = -ENOPROTOOPT;
529 #ifdef CONFIG_NETDEVICES
530 	struct net *net = sock_net(sk);
531 	char devname[IFNAMSIZ];
532 	int index;
533 
534 	/* Sorry... */
535 	ret = -EPERM;
536 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
537 		goto out;
538 
539 	ret = -EINVAL;
540 	if (optlen < 0)
541 		goto out;
542 
543 	/* Bind this socket to a particular device like "eth0",
544 	 * as specified in the passed interface name. If the
545 	 * name is "" or the option length is zero the socket
546 	 * is not bound.
547 	 */
548 	if (optlen > IFNAMSIZ - 1)
549 		optlen = IFNAMSIZ - 1;
550 	memset(devname, 0, sizeof(devname));
551 
552 	ret = -EFAULT;
553 	if (copy_from_user(devname, optval, optlen))
554 		goto out;
555 
556 	index = 0;
557 	if (devname[0] != '\0') {
558 		struct net_device *dev;
559 
560 		rcu_read_lock();
561 		dev = dev_get_by_name_rcu(net, devname);
562 		if (dev)
563 			index = dev->ifindex;
564 		rcu_read_unlock();
565 		ret = -ENODEV;
566 		if (!dev)
567 			goto out;
568 	}
569 
570 	lock_sock(sk);
571 	sk->sk_bound_dev_if = index;
572 	sk_dst_reset(sk);
573 	release_sock(sk);
574 
575 	ret = 0;
576 
577 out:
578 #endif
579 
580 	return ret;
581 }
582 
583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
584 				int __user *optlen, int len)
585 {
586 	int ret = -ENOPROTOOPT;
587 #ifdef CONFIG_NETDEVICES
588 	struct net *net = sock_net(sk);
589 	char devname[IFNAMSIZ];
590 
591 	if (sk->sk_bound_dev_if == 0) {
592 		len = 0;
593 		goto zero;
594 	}
595 
596 	ret = -EINVAL;
597 	if (len < IFNAMSIZ)
598 		goto out;
599 
600 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
601 	if (ret)
602 		goto out;
603 
604 	len = strlen(devname) + 1;
605 
606 	ret = -EFAULT;
607 	if (copy_to_user(optval, devname, len))
608 		goto out;
609 
610 zero:
611 	ret = -EFAULT;
612 	if (put_user(len, optlen))
613 		goto out;
614 
615 	ret = 0;
616 
617 out:
618 #endif
619 
620 	return ret;
621 }
622 
623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
624 {
625 	if (valbool)
626 		sock_set_flag(sk, bit);
627 	else
628 		sock_reset_flag(sk, bit);
629 }
630 
631 bool sk_mc_loop(struct sock *sk)
632 {
633 	if (dev_recursion_level())
634 		return false;
635 	if (!sk)
636 		return true;
637 	switch (sk->sk_family) {
638 	case AF_INET:
639 		return inet_sk(sk)->mc_loop;
640 #if IS_ENABLED(CONFIG_IPV6)
641 	case AF_INET6:
642 		return inet6_sk(sk)->mc_loop;
643 #endif
644 	}
645 	WARN_ON(1);
646 	return true;
647 }
648 EXPORT_SYMBOL(sk_mc_loop);
649 
650 /*
651  *	This is meant for all protocols to use and covers goings on
652  *	at the socket level. Everything here is generic.
653  */
654 
655 int sock_setsockopt(struct socket *sock, int level, int optname,
656 		    char __user *optval, unsigned int optlen)
657 {
658 	struct sock *sk = sock->sk;
659 	int val;
660 	int valbool;
661 	struct linger ling;
662 	int ret = 0;
663 
664 	/*
665 	 *	Options without arguments
666 	 */
667 
668 	if (optname == SO_BINDTODEVICE)
669 		return sock_setbindtodevice(sk, optval, optlen);
670 
671 	if (optlen < sizeof(int))
672 		return -EINVAL;
673 
674 	if (get_user(val, (int __user *)optval))
675 		return -EFAULT;
676 
677 	valbool = val ? 1 : 0;
678 
679 	lock_sock(sk);
680 
681 	switch (optname) {
682 	case SO_DEBUG:
683 		if (val && !capable(CAP_NET_ADMIN))
684 			ret = -EACCES;
685 		else
686 			sock_valbool_flag(sk, SOCK_DBG, valbool);
687 		break;
688 	case SO_REUSEADDR:
689 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
690 		break;
691 	case SO_REUSEPORT:
692 		sk->sk_reuseport = valbool;
693 		break;
694 	case SO_TYPE:
695 	case SO_PROTOCOL:
696 	case SO_DOMAIN:
697 	case SO_ERROR:
698 		ret = -ENOPROTOOPT;
699 		break;
700 	case SO_DONTROUTE:
701 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
702 		break;
703 	case SO_BROADCAST:
704 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 		break;
706 	case SO_SNDBUF:
707 		/* Don't error on this; BSD doesn't, and if you think
708 		 * about it this is right. Otherwise apps have to
709 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 		 * are treated in BSD as hints.
711 		 */
712 		val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 		/* Wake up sending tasks if we upped the value. */
717 		sk->sk_write_space(sk);
718 		break;
719 
720 	case SO_SNDBUFFORCE:
721 		if (!capable(CAP_NET_ADMIN)) {
722 			ret = -EPERM;
723 			break;
724 		}
725 		goto set_sndbuf;
726 
727 	case SO_RCVBUF:
728 		/* Don't error on this; BSD doesn't, and if you think
729 		 * about it this is right. Otherwise apps have to
730 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 		 * are treated in BSD as hints.
732 		 */
733 		val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
736 		/*
737 		 * We double it on the way in to account for
738 		 * "struct sk_buff" etc. overhead.   Applications
739 		 * assume that the SO_RCVBUF setting they make will
740 		 * allow that much actual data to be received on that
741 		 * socket.
742 		 *
743 		 * Applications are unaware that "struct sk_buff" and
744 		 * other overheads allocate from the receive buffer
745 		 * during socket buffer allocation.
746 		 *
747 		 * And after considering the possible alternatives,
748 		 * returning the value we actually used in getsockopt
749 		 * is the most desirable behavior.
750 		 */
751 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 		break;
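		/* Illustrative effect of the doubling above: a setsockopt(SO_RCVBUF)
		 * of 65536 (assuming sysctl_rmem_max allows it) is stored as
		 * sk_rcvbuf == 131072, and that doubled value is what a later
		 * getsockopt(SO_RCVBUF) reports back.
		 */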
753 
754 	case SO_RCVBUFFORCE:
755 		if (!capable(CAP_NET_ADMIN)) {
756 			ret = -EPERM;
757 			break;
758 		}
759 		goto set_rcvbuf;
760 
761 	case SO_KEEPALIVE:
762 #ifdef CONFIG_INET
763 		if (sk->sk_protocol == IPPROTO_TCP &&
764 		    sk->sk_type == SOCK_STREAM)
765 			tcp_set_keepalive(sk, valbool);
766 #endif
767 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
768 		break;
769 
770 	case SO_OOBINLINE:
771 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
772 		break;
773 
774 	case SO_NO_CHECK:
775 		sk->sk_no_check_tx = valbool;
776 		break;
777 
778 	case SO_PRIORITY:
779 		if ((val >= 0 && val <= 6) ||
780 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
781 			sk->sk_priority = val;
782 		else
783 			ret = -EPERM;
784 		break;
785 
786 	case SO_LINGER:
787 		if (optlen < sizeof(ling)) {
788 			ret = -EINVAL;	/* 1003.1g */
789 			break;
790 		}
791 		if (copy_from_user(&ling, optval, sizeof(ling))) {
792 			ret = -EFAULT;
793 			break;
794 		}
795 		if (!ling.l_onoff)
796 			sock_reset_flag(sk, SOCK_LINGER);
797 		else {
798 #if (BITS_PER_LONG == 32)
799 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
800 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
801 			else
802 #endif
803 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
804 			sock_set_flag(sk, SOCK_LINGER);
805 		}
806 		break;
807 
808 	case SO_BSDCOMPAT:
809 		sock_warn_obsolete_bsdism("setsockopt");
810 		break;
811 
812 	case SO_PASSCRED:
813 		if (valbool)
814 			set_bit(SOCK_PASSCRED, &sock->flags);
815 		else
816 			clear_bit(SOCK_PASSCRED, &sock->flags);
817 		break;
818 
819 	case SO_TIMESTAMP:
820 	case SO_TIMESTAMPNS:
821 		if (valbool)  {
822 			if (optname == SO_TIMESTAMP)
823 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
824 			else
825 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
826 			sock_set_flag(sk, SOCK_RCVTSTAMP);
827 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 		} else {
829 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
830 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
831 		}
832 		break;
833 
834 	case SO_TIMESTAMPING:
835 		if (val & ~SOF_TIMESTAMPING_MASK) {
836 			ret = -EINVAL;
837 			break;
838 		}
839 
840 		if (val & SOF_TIMESTAMPING_OPT_ID &&
841 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
842 			if (sk->sk_protocol == IPPROTO_TCP &&
843 			    sk->sk_type == SOCK_STREAM) {
844 				if ((1 << sk->sk_state) &
845 				    (TCPF_CLOSE | TCPF_LISTEN)) {
846 					ret = -EINVAL;
847 					break;
848 				}
849 				sk->sk_tskey = tcp_sk(sk)->snd_una;
850 			} else {
851 				sk->sk_tskey = 0;
852 			}
853 		}
854 		sk->sk_tsflags = val;
855 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
856 			sock_enable_timestamp(sk,
857 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
858 		else
859 			sock_disable_timestamp(sk,
860 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
861 		break;
862 
863 	case SO_RCVLOWAT:
864 		if (val < 0)
865 			val = INT_MAX;
866 		sk->sk_rcvlowat = val ? : 1;
867 		break;
868 
869 	case SO_RCVTIMEO:
870 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
871 		break;
872 
873 	case SO_SNDTIMEO:
874 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
875 		break;
876 
877 	case SO_ATTACH_FILTER:
878 		ret = -EINVAL;
879 		if (optlen == sizeof(struct sock_fprog)) {
880 			struct sock_fprog fprog;
881 
882 			ret = -EFAULT;
883 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
884 				break;
885 
886 			ret = sk_attach_filter(&fprog, sk);
887 		}
888 		break;
889 
890 	case SO_ATTACH_BPF:
891 		ret = -EINVAL;
892 		if (optlen == sizeof(u32)) {
893 			u32 ufd;
894 
895 			ret = -EFAULT;
896 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
897 				break;
898 
899 			ret = sk_attach_bpf(ufd, sk);
900 		}
901 		break;
902 
903 	case SO_ATTACH_REUSEPORT_CBPF:
904 		ret = -EINVAL;
905 		if (optlen == sizeof(struct sock_fprog)) {
906 			struct sock_fprog fprog;
907 
908 			ret = -EFAULT;
909 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
910 				break;
911 
912 			ret = sk_reuseport_attach_filter(&fprog, sk);
913 		}
914 		break;
915 
916 	case SO_ATTACH_REUSEPORT_EBPF:
917 		ret = -EINVAL;
918 		if (optlen == sizeof(u32)) {
919 			u32 ufd;
920 
921 			ret = -EFAULT;
922 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
923 				break;
924 
925 			ret = sk_reuseport_attach_bpf(ufd, sk);
926 		}
927 		break;
928 
929 	case SO_DETACH_FILTER:
930 		ret = sk_detach_filter(sk);
931 		break;
932 
933 	case SO_LOCK_FILTER:
934 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
935 			ret = -EPERM;
936 		else
937 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
938 		break;
939 
940 	case SO_PASSSEC:
941 		if (valbool)
942 			set_bit(SOCK_PASSSEC, &sock->flags);
943 		else
944 			clear_bit(SOCK_PASSSEC, &sock->flags);
945 		break;
946 	case SO_MARK:
947 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
948 			ret = -EPERM;
949 		else
950 			sk->sk_mark = val;
951 		break;
952 
953 	case SO_RXQ_OVFL:
954 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
955 		break;
956 
957 	case SO_WIFI_STATUS:
958 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
959 		break;
960 
961 	case SO_PEEK_OFF:
962 		if (sock->ops->set_peek_off)
963 			ret = sock->ops->set_peek_off(sk, val);
964 		else
965 			ret = -EOPNOTSUPP;
966 		break;
967 
968 	case SO_NOFCS:
969 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
970 		break;
971 
972 	case SO_SELECT_ERR_QUEUE:
973 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
974 		break;
975 
976 #ifdef CONFIG_NET_RX_BUSY_POLL
977 	case SO_BUSY_POLL:
978 		/* allow unprivileged users to decrease the value */
979 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
980 			ret = -EPERM;
981 		else {
982 			if (val < 0)
983 				ret = -EINVAL;
984 			else
985 				sk->sk_ll_usec = val;
986 		}
987 		break;
988 #endif
989 
990 	case SO_MAX_PACING_RATE:
991 		sk->sk_max_pacing_rate = val;
992 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
993 					 sk->sk_max_pacing_rate);
994 		break;
995 
996 	case SO_INCOMING_CPU:
997 		sk->sk_incoming_cpu = val;
998 		break;
999 
1000 	case SO_CNX_ADVICE:
1001 		if (val == 1)
1002 			dst_negative_advice(sk);
1003 		break;
1004 	default:
1005 		ret = -ENOPROTOOPT;
1006 		break;
1007 	}
1008 	release_sock(sk);
1009 	return ret;
1010 }
1011 EXPORT_SYMBOL(sock_setsockopt);
1012 
1013 
1014 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1015 			  struct ucred *ucred)
1016 {
1017 	ucred->pid = pid_vnr(pid);
1018 	ucred->uid = ucred->gid = -1;
1019 	if (cred) {
1020 		struct user_namespace *current_ns = current_user_ns();
1021 
1022 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1023 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1024 	}
1025 }
1026 
1027 int sock_getsockopt(struct socket *sock, int level, int optname,
1028 		    char __user *optval, int __user *optlen)
1029 {
1030 	struct sock *sk = sock->sk;
1031 
1032 	union {
1033 		int val;
1034 		u64 val64;
1035 		struct linger ling;
1036 		struct timeval tm;
1037 	} v;
1038 
1039 	int lv = sizeof(int);
1040 	int len;
1041 
1042 	if (get_user(len, optlen))
1043 		return -EFAULT;
1044 	if (len < 0)
1045 		return -EINVAL;
1046 
1047 	memset(&v, 0, sizeof(v));
1048 
1049 	switch (optname) {
1050 	case SO_DEBUG:
1051 		v.val = sock_flag(sk, SOCK_DBG);
1052 		break;
1053 
1054 	case SO_DONTROUTE:
1055 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1056 		break;
1057 
1058 	case SO_BROADCAST:
1059 		v.val = sock_flag(sk, SOCK_BROADCAST);
1060 		break;
1061 
1062 	case SO_SNDBUF:
1063 		v.val = sk->sk_sndbuf;
1064 		break;
1065 
1066 	case SO_RCVBUF:
1067 		v.val = sk->sk_rcvbuf;
1068 		break;
1069 
1070 	case SO_REUSEADDR:
1071 		v.val = sk->sk_reuse;
1072 		break;
1073 
1074 	case SO_REUSEPORT:
1075 		v.val = sk->sk_reuseport;
1076 		break;
1077 
1078 	case SO_KEEPALIVE:
1079 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1080 		break;
1081 
1082 	case SO_TYPE:
1083 		v.val = sk->sk_type;
1084 		break;
1085 
1086 	case SO_PROTOCOL:
1087 		v.val = sk->sk_protocol;
1088 		break;
1089 
1090 	case SO_DOMAIN:
1091 		v.val = sk->sk_family;
1092 		break;
1093 
1094 	case SO_ERROR:
1095 		v.val = -sock_error(sk);
1096 		if (v.val == 0)
1097 			v.val = xchg(&sk->sk_err_soft, 0);
1098 		break;
1099 
1100 	case SO_OOBINLINE:
1101 		v.val = sock_flag(sk, SOCK_URGINLINE);
1102 		break;
1103 
1104 	case SO_NO_CHECK:
1105 		v.val = sk->sk_no_check_tx;
1106 		break;
1107 
1108 	case SO_PRIORITY:
1109 		v.val = sk->sk_priority;
1110 		break;
1111 
1112 	case SO_LINGER:
1113 		lv		= sizeof(v.ling);
1114 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1115 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1116 		break;
1117 
1118 	case SO_BSDCOMPAT:
1119 		sock_warn_obsolete_bsdism("getsockopt");
1120 		break;
1121 
1122 	case SO_TIMESTAMP:
1123 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1124 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1125 		break;
1126 
1127 	case SO_TIMESTAMPNS:
1128 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1129 		break;
1130 
1131 	case SO_TIMESTAMPING:
1132 		v.val = sk->sk_tsflags;
1133 		break;
1134 
1135 	case SO_RCVTIMEO:
1136 		lv = sizeof(struct timeval);
1137 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1138 			v.tm.tv_sec = 0;
1139 			v.tm.tv_usec = 0;
1140 		} else {
1141 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1142 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1143 		}
1144 		break;
1145 
1146 	case SO_SNDTIMEO:
1147 		lv = sizeof(struct timeval);
1148 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1149 			v.tm.tv_sec = 0;
1150 			v.tm.tv_usec = 0;
1151 		} else {
1152 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1153 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1154 		}
1155 		break;
1156 
1157 	case SO_RCVLOWAT:
1158 		v.val = sk->sk_rcvlowat;
1159 		break;
1160 
1161 	case SO_SNDLOWAT:
1162 		v.val = 1;
1163 		break;
1164 
1165 	case SO_PASSCRED:
1166 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1167 		break;
1168 
1169 	case SO_PEERCRED:
1170 	{
1171 		struct ucred peercred;
1172 		if (len > sizeof(peercred))
1173 			len = sizeof(peercred);
1174 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1175 		if (copy_to_user(optval, &peercred, len))
1176 			return -EFAULT;
1177 		goto lenout;
1178 	}
1179 
1180 	case SO_PEERNAME:
1181 	{
1182 		char address[128];
1183 
1184 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1185 			return -ENOTCONN;
1186 		if (lv < len)
1187 			return -EINVAL;
1188 		if (copy_to_user(optval, address, len))
1189 			return -EFAULT;
1190 		goto lenout;
1191 	}
1192 
1193 	/* Dubious BSD thing... Probably nobody even uses it, but
1194 	 * the UNIX standard wants it for whatever reason... -DaveM
1195 	 */
1196 	case SO_ACCEPTCONN:
1197 		v.val = sk->sk_state == TCP_LISTEN;
1198 		break;
1199 
1200 	case SO_PASSSEC:
1201 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1202 		break;
1203 
1204 	case SO_PEERSEC:
1205 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1206 
1207 	case SO_MARK:
1208 		v.val = sk->sk_mark;
1209 		break;
1210 
1211 	case SO_RXQ_OVFL:
1212 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1213 		break;
1214 
1215 	case SO_WIFI_STATUS:
1216 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1217 		break;
1218 
1219 	case SO_PEEK_OFF:
1220 		if (!sock->ops->set_peek_off)
1221 			return -EOPNOTSUPP;
1222 
1223 		v.val = sk->sk_peek_off;
1224 		break;
1225 	case SO_NOFCS:
1226 		v.val = sock_flag(sk, SOCK_NOFCS);
1227 		break;
1228 
1229 	case SO_BINDTODEVICE:
1230 		return sock_getbindtodevice(sk, optval, optlen, len);
1231 
1232 	case SO_GET_FILTER:
1233 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1234 		if (len < 0)
1235 			return len;
1236 
1237 		goto lenout;
1238 
1239 	case SO_LOCK_FILTER:
1240 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1241 		break;
1242 
1243 	case SO_BPF_EXTENSIONS:
1244 		v.val = bpf_tell_extensions();
1245 		break;
1246 
1247 	case SO_SELECT_ERR_QUEUE:
1248 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1249 		break;
1250 
1251 #ifdef CONFIG_NET_RX_BUSY_POLL
1252 	case SO_BUSY_POLL:
1253 		v.val = sk->sk_ll_usec;
1254 		break;
1255 #endif
1256 
1257 	case SO_MAX_PACING_RATE:
1258 		v.val = sk->sk_max_pacing_rate;
1259 		break;
1260 
1261 	case SO_INCOMING_CPU:
1262 		v.val = sk->sk_incoming_cpu;
1263 		break;
1264 
1265 
1266 	case SO_COOKIE:
1267 		lv = sizeof(u64);
1268 		if (len < lv)
1269 			return -EINVAL;
1270 		v.val64 = sock_gen_cookie(sk);
1271 		break;
1272 	default:
1273 		/* We implement the SO_SNDLOWAT etc to not be settable
1274 		 * (1003.1g 7).
1275 		 */
1276 		return -ENOPROTOOPT;
1277 	}
1278 
1279 	if (len > lv)
1280 		len = lv;
1281 	if (copy_to_user(optval, &v, len))
1282 		return -EFAULT;
1283 lenout:
1284 	if (put_user(len, optlen))
1285 		return -EFAULT;
1286 	return 0;
1287 }
1288 
1289 /*
1290  * Initialize an sk_lock.
1291  *
1292  * (We also register the sk_lock with the lock validator.)
1293  */
1294 static inline void sock_lock_init(struct sock *sk)
1295 {
1296 	sock_lock_init_class_and_name(sk,
1297 			af_family_slock_key_strings[sk->sk_family],
1298 			af_family_slock_keys + sk->sk_family,
1299 			af_family_key_strings[sk->sk_family],
1300 			af_family_keys + sk->sk_family);
1301 }
1302 
1303 /*
1304  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1305  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1306  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1307  */
1308 static void sock_copy(struct sock *nsk, const struct sock *osk)
1309 {
1310 #ifdef CONFIG_SECURITY_NETWORK
1311 	void *sptr = nsk->sk_security;
1312 #endif
1313 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1314 
1315 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1316 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1317 
1318 #ifdef CONFIG_SECURITY_NETWORK
1319 	nsk->sk_security = sptr;
1320 	security_sk_clone(osk, nsk);
1321 #endif
1322 }
1323 
1324 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1325 		int family)
1326 {
1327 	struct sock *sk;
1328 	struct kmem_cache *slab;
1329 
1330 	slab = prot->slab;
1331 	if (slab != NULL) {
1332 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1333 		if (!sk)
1334 			return sk;
1335 		if (priority & __GFP_ZERO)
1336 			sk_prot_clear_nulls(sk, prot->obj_size);
1337 	} else
1338 		sk = kmalloc(prot->obj_size, priority);
1339 
1340 	if (sk != NULL) {
1341 		kmemcheck_annotate_bitfield(sk, flags);
1342 
1343 		if (security_sk_alloc(sk, family, priority))
1344 			goto out_free;
1345 
1346 		if (!try_module_get(prot->owner))
1347 			goto out_free_sec;
1348 		sk_tx_queue_clear(sk);
1349 	}
1350 
1351 	return sk;
1352 
1353 out_free_sec:
1354 	security_sk_free(sk);
1355 out_free:
1356 	if (slab != NULL)
1357 		kmem_cache_free(slab, sk);
1358 	else
1359 		kfree(sk);
1360 	return NULL;
1361 }
1362 
1363 static void sk_prot_free(struct proto *prot, struct sock *sk)
1364 {
1365 	struct kmem_cache *slab;
1366 	struct module *owner;
1367 
1368 	owner = prot->owner;
1369 	slab = prot->slab;
1370 
1371 	cgroup_sk_free(&sk->sk_cgrp_data);
1372 	mem_cgroup_sk_free(sk);
1373 	security_sk_free(sk);
1374 	if (slab != NULL)
1375 		kmem_cache_free(slab, sk);
1376 	else
1377 		kfree(sk);
1378 	module_put(owner);
1379 }
1380 
1381 /**
1382  *	sk_alloc - All socket objects are allocated here
1383  *	@net: the applicable net namespace
1384  *	@family: protocol family
1385  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1386  *	@prot: struct proto associated with this new sock instance
1387  *	@kern: is this to be a kernel socket?
1388  */
1389 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1390 		      struct proto *prot, int kern)
1391 {
1392 	struct sock *sk;
1393 
1394 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1395 	if (sk) {
1396 		sk->sk_family = family;
1397 		/*
1398 		 * See comment in struct sock definition to understand
1399 		 * why we need sk_prot_creator -acme
1400 		 */
1401 		sk->sk_prot = sk->sk_prot_creator = prot;
1402 		sock_lock_init(sk);
1403 		sk->sk_net_refcnt = kern ? 0 : 1;
1404 		if (likely(sk->sk_net_refcnt))
1405 			get_net(net);
1406 		sock_net_set(sk, net);
1407 		atomic_set(&sk->sk_wmem_alloc, 1);
1408 
1409 		mem_cgroup_sk_alloc(sk);
1410 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1411 		sock_update_classid(&sk->sk_cgrp_data);
1412 		sock_update_netprioidx(&sk->sk_cgrp_data);
1413 	}
1414 
1415 	return sk;
1416 }
1417 EXPORT_SYMBOL(sk_alloc);
1418 
1419 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1420  * grace period. This is the case for UDP sockets and TCP listeners.
1421  */
1422 static void __sk_destruct(struct rcu_head *head)
1423 {
1424 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1425 	struct sk_filter *filter;
1426 
1427 	if (sk->sk_destruct)
1428 		sk->sk_destruct(sk);
1429 
1430 	filter = rcu_dereference_check(sk->sk_filter,
1431 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1432 	if (filter) {
1433 		sk_filter_uncharge(sk, filter);
1434 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1435 	}
1436 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1437 		reuseport_detach_sock(sk);
1438 
1439 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1440 
1441 	if (atomic_read(&sk->sk_omem_alloc))
1442 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1443 			 __func__, atomic_read(&sk->sk_omem_alloc));
1444 
1445 	if (sk->sk_frag.page) {
1446 		put_page(sk->sk_frag.page);
1447 		sk->sk_frag.page = NULL;
1448 	}
1449 
1450 	if (sk->sk_peer_cred)
1451 		put_cred(sk->sk_peer_cred);
1452 	put_pid(sk->sk_peer_pid);
1453 	if (likely(sk->sk_net_refcnt))
1454 		put_net(sock_net(sk));
1455 	sk_prot_free(sk->sk_prot_creator, sk);
1456 }
1457 
1458 void sk_destruct(struct sock *sk)
1459 {
1460 	if (sock_flag(sk, SOCK_RCU_FREE))
1461 		call_rcu(&sk->sk_rcu, __sk_destruct);
1462 	else
1463 		__sk_destruct(&sk->sk_rcu);
1464 }
1465 
1466 static void __sk_free(struct sock *sk)
1467 {
1468 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1469 		sock_diag_broadcast_destroy(sk);
1470 	else
1471 		sk_destruct(sk);
1472 }
1473 
1474 void sk_free(struct sock *sk)
1475 {
1476 	/*
1477 	 * We subtract one from sk_wmem_alloc and can know if
1478 	 * some packets are still in some tx queue.
1479 	 * If not null, sock_wfree() will call __sk_free(sk) later
1480 	 */
1481 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1482 		__sk_free(sk);
1483 }
1484 EXPORT_SYMBOL(sk_free);
1485 
1486 /**
1487  *	sk_clone_lock - clone a socket, and lock its clone
1488  *	@sk: the socket to clone
1489  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490  *
1491  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492  */
1493 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494 {
1495 	struct sock *newsk;
1496 	bool is_charged = true;
1497 
1498 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1499 	if (newsk != NULL) {
1500 		struct sk_filter *filter;
1501 
1502 		sock_copy(newsk, sk);
1503 
1504 		newsk->sk_prot_creator = sk->sk_prot;
1505 
1506 		/* SANITY */
1507 		if (likely(newsk->sk_net_refcnt))
1508 			get_net(sock_net(newsk));
1509 		sk_node_init(&newsk->sk_node);
1510 		sock_lock_init(newsk);
1511 		bh_lock_sock(newsk);
1512 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1513 		newsk->sk_backlog.len = 0;
1514 
1515 		atomic_set(&newsk->sk_rmem_alloc, 0);
1516 		/*
1517 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1518 		 */
1519 		atomic_set(&newsk->sk_wmem_alloc, 1);
1520 		atomic_set(&newsk->sk_omem_alloc, 0);
1521 		skb_queue_head_init(&newsk->sk_receive_queue);
1522 		skb_queue_head_init(&newsk->sk_write_queue);
1523 
1524 		rwlock_init(&newsk->sk_callback_lock);
1525 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1526 				af_callback_keys + newsk->sk_family,
1527 				af_family_clock_key_strings[newsk->sk_family]);
1528 
1529 		newsk->sk_dst_cache	= NULL;
1530 		newsk->sk_wmem_queued	= 0;
1531 		newsk->sk_forward_alloc = 0;
1532 		atomic_set(&newsk->sk_drops, 0);
1533 		newsk->sk_send_head	= NULL;
1534 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1535 
1536 		sock_reset_flag(newsk, SOCK_DONE);
1537 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1538 		skb_queue_head_init(&newsk->sk_error_queue);
1539 
1540 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1541 		if (filter != NULL)
1542 			/* though it's an empty new sock, the charging may fail
1543 			 * if sysctl_optmem_max was changed between creation of
1544 			 * the original socket and the cloning
1545 			 */
1546 			is_charged = sk_filter_charge(newsk, filter);
1547 
1548 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1549 			/* We need to make sure that we don't uncharge the new
1550 			 * socket if we couldn't charge it in the first place
1551 			 * as otherwise we uncharge the parent's filter.
1552 			 */
1553 			if (!is_charged)
1554 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1555 			/* It is still a raw copy of the parent, so invalidate
1556 			 * the destructor and do a plain sk_free() */
1557 			newsk->sk_destruct = NULL;
1558 			bh_unlock_sock(newsk);
1559 			sk_free(newsk);
1560 			newsk = NULL;
1561 			goto out;
1562 		}
1563 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1564 
1565 		newsk->sk_err	   = 0;
1566 		newsk->sk_err_soft = 0;
1567 		newsk->sk_priority = 0;
1568 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1569 		atomic64_set(&newsk->sk_cookie, 0);
1570 
1571 		mem_cgroup_sk_alloc(newsk);
1572 		/*
1573 		 * Before updating sk_refcnt, we must commit prior changes to memory
1574 		 * (Documentation/RCU/rculist_nulls.txt for details)
1575 		 */
1576 		smp_wmb();
1577 		atomic_set(&newsk->sk_refcnt, 2);
1578 
1579 		/*
1580 		 * Increment the counter in the same struct proto as the master
1581 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1582 		 * is the same as sk->sk_prot->socks, as this field was copied
1583 		 * with memcpy).
1584 		 *
1585 		 * This _changes_ the previous behaviour, where
1586 		 * tcp_create_openreq_child always was incrementing the
1587 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1588 		 * to be taken into account in all callers. -acme
1589 		 */
1590 		sk_refcnt_debug_inc(newsk);
1591 		sk_set_socket(newsk, NULL);
1592 		newsk->sk_wq = NULL;
1593 
1594 		if (newsk->sk_prot->sockets_allocated)
1595 			sk_sockets_allocated_inc(newsk);
1596 
1597 		if (sock_needs_netstamp(sk) &&
1598 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1599 			net_enable_timestamp();
1600 	}
1601 out:
1602 	return newsk;
1603 }
1604 EXPORT_SYMBOL_GPL(sk_clone_lock);
1605 
1606 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1607 {
1608 	u32 max_segs = 1;
1609 
1610 	sk_dst_set(sk, dst);
1611 	sk->sk_route_caps = dst->dev->features;
1612 	if (sk->sk_route_caps & NETIF_F_GSO)
1613 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1614 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1615 	if (sk_can_gso(sk)) {
1616 		if (dst->header_len) {
1617 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1618 		} else {
1619 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1620 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1621 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1622 		}
1623 	}
1624 	sk->sk_gso_max_segs = max_segs;
1625 }
1626 EXPORT_SYMBOL_GPL(sk_setup_caps);
1627 
1628 /*
1629  *	Simple resource managers for sockets.
1630  */
1631 
1632 
1633 /*
1634  * Write buffer destructor automatically called from kfree_skb.
1635  */
1636 void sock_wfree(struct sk_buff *skb)
1637 {
1638 	struct sock *sk = skb->sk;
1639 	unsigned int len = skb->truesize;
1640 
1641 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1642 		/*
1643 		 * Keep a reference on sk_wmem_alloc, this will be released
1644 		 * after sk_write_space() call
1645 		 */
1646 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1647 		sk->sk_write_space(sk);
1648 		len = 1;
1649 	}
1650 	/*
1651 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1652 	 * could not do because of in-flight packets
1653 	 */
1654 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1655 		__sk_free(sk);
1656 }
1657 EXPORT_SYMBOL(sock_wfree);
1658 
1659 /* This variant of sock_wfree() is used by TCP,
1660  * since it sets SOCK_USE_WRITE_QUEUE.
1661  */
1662 void __sock_wfree(struct sk_buff *skb)
1663 {
1664 	struct sock *sk = skb->sk;
1665 
1666 	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1667 		__sk_free(sk);
1668 }
1669 
1670 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1671 {
1672 	skb_orphan(skb);
1673 	skb->sk = sk;
1674 #ifdef CONFIG_INET
1675 	if (unlikely(!sk_fullsock(sk))) {
1676 		skb->destructor = sock_edemux;
1677 		sock_hold(sk);
1678 		return;
1679 	}
1680 #endif
1681 	skb->destructor = sock_wfree;
1682 	skb_set_hash_from_sk(skb, sk);
1683 	/*
1684 	 * We used to take a refcount on sk, but the following operation
1685 	 * is enough to guarantee sk_free() won't free this sock until
1686 	 * all in-flight packets are completed
1687 	 */
1688 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1689 }
1690 EXPORT_SYMBOL(skb_set_owner_w);
1691 
1692 /* This helper is used by netem, as it can hold packets in its
1693  * delay queue. We want to allow the owner socket to send more
1694  * packets, as if they were already TX completed by a typical driver.
1695  * But we also want to keep skb->sk set because some packet schedulers
1696  * rely on it (sch_fq for example).
1697  */
1698 void skb_orphan_partial(struct sk_buff *skb)
1699 {
1700 	if (skb_is_tcp_pure_ack(skb))
1701 		return;
1702 
1703 	if (skb->destructor == sock_wfree
1704 #ifdef CONFIG_INET
1705 	    || skb->destructor == tcp_wfree
1706 #endif
1707 		) {
1708 		struct sock *sk = skb->sk;
1709 
1710 		if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1711 			atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1712 			skb->destructor = sock_efree;
1713 		}
1714 	} else {
1715 		skb_orphan(skb);
1716 	}
1717 }
1718 EXPORT_SYMBOL(skb_orphan_partial);
1719 
1720 /*
1721  * Read buffer destructor automatically called from kfree_skb.
1722  */
1723 void sock_rfree(struct sk_buff *skb)
1724 {
1725 	struct sock *sk = skb->sk;
1726 	unsigned int len = skb->truesize;
1727 
1728 	atomic_sub(len, &sk->sk_rmem_alloc);
1729 	sk_mem_uncharge(sk, len);
1730 }
1731 EXPORT_SYMBOL(sock_rfree);
1732 
1733 /*
1734  * Buffer destructor for skbs that are not used directly in read or write
1735  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1736  */
1737 void sock_efree(struct sk_buff *skb)
1738 {
1739 	sock_put(skb->sk);
1740 }
1741 EXPORT_SYMBOL(sock_efree);
1742 
1743 kuid_t sock_i_uid(struct sock *sk)
1744 {
1745 	kuid_t uid;
1746 
1747 	read_lock_bh(&sk->sk_callback_lock);
1748 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1749 	read_unlock_bh(&sk->sk_callback_lock);
1750 	return uid;
1751 }
1752 EXPORT_SYMBOL(sock_i_uid);
1753 
1754 unsigned long sock_i_ino(struct sock *sk)
1755 {
1756 	unsigned long ino;
1757 
1758 	read_lock_bh(&sk->sk_callback_lock);
1759 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1760 	read_unlock_bh(&sk->sk_callback_lock);
1761 	return ino;
1762 }
1763 EXPORT_SYMBOL(sock_i_ino);
1764 
1765 /*
1766  * Allocate a skb from the socket's send buffer.
1767  */
1768 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1769 			     gfp_t priority)
1770 {
1771 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1772 		struct sk_buff *skb = alloc_skb(size, priority);
1773 		if (skb) {
1774 			skb_set_owner_w(skb, sk);
1775 			return skb;
1776 		}
1777 	}
1778 	return NULL;
1779 }
1780 EXPORT_SYMBOL(sock_wmalloc);
1781 
1782 /*
1783  * Allocate a memory block from the socket's option memory buffer.
1784  */
1785 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1786 {
1787 	if ((unsigned int)size <= sysctl_optmem_max &&
1788 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1789 		void *mem;
1790 		/* First do the add, to avoid the race if kmalloc
1791 		 * might sleep.
1792 		 */
1793 		atomic_add(size, &sk->sk_omem_alloc);
1794 		mem = kmalloc(size, priority);
1795 		if (mem)
1796 			return mem;
1797 		atomic_sub(size, &sk->sk_omem_alloc);
1798 	}
1799 	return NULL;
1800 }
1801 EXPORT_SYMBOL(sock_kmalloc);
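/* Illustrative pairing (sketch, not a caller from this file): protocol code
 * that needs a small per-socket option buffer would typically do
 *
 *	opt = sock_kmalloc(sk, size, GFP_KERNEL);
 *	...
 *	sock_kfree_s(sk, opt, size);
 *
 * passing the same size to both calls so sk_omem_alloc stays balanced.
 */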
1802 
1803 /* Free an option memory block. Note, we actually want the inline
1804  * here as this allows gcc to detect the nullify and fold away the
1805  * condition entirely.
1806  */
1807 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1808 				  const bool nullify)
1809 {
1810 	if (WARN_ON_ONCE(!mem))
1811 		return;
1812 	if (nullify)
1813 		kzfree(mem);
1814 	else
1815 		kfree(mem);
1816 	atomic_sub(size, &sk->sk_omem_alloc);
1817 }
1818 
1819 void sock_kfree_s(struct sock *sk, void *mem, int size)
1820 {
1821 	__sock_kfree_s(sk, mem, size, false);
1822 }
1823 EXPORT_SYMBOL(sock_kfree_s);
1824 
1825 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1826 {
1827 	__sock_kfree_s(sk, mem, size, true);
1828 }
1829 EXPORT_SYMBOL(sock_kzfree_s);
1830 
1831 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1832    I think these locks should be removed for datagram sockets.
1833  */
1834 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1835 {
1836 	DEFINE_WAIT(wait);
1837 
1838 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1839 	for (;;) {
1840 		if (!timeo)
1841 			break;
1842 		if (signal_pending(current))
1843 			break;
1844 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1845 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1846 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1847 			break;
1848 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1849 			break;
1850 		if (sk->sk_err)
1851 			break;
1852 		timeo = schedule_timeout(timeo);
1853 	}
1854 	finish_wait(sk_sleep(sk), &wait);
1855 	return timeo;
1856 }
1857 
1858 
1859 /*
1860  *	Generic send/receive buffer handlers
1861  */
1862 
1863 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1864 				     unsigned long data_len, int noblock,
1865 				     int *errcode, int max_page_order)
1866 {
1867 	struct sk_buff *skb;
1868 	long timeo;
1869 	int err;
1870 
1871 	timeo = sock_sndtimeo(sk, noblock);
1872 	for (;;) {
1873 		err = sock_error(sk);
1874 		if (err != 0)
1875 			goto failure;
1876 
1877 		err = -EPIPE;
1878 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1879 			goto failure;
1880 
1881 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1882 			break;
1883 
1884 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1885 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1886 		err = -EAGAIN;
1887 		if (!timeo)
1888 			goto failure;
1889 		if (signal_pending(current))
1890 			goto interrupted;
1891 		timeo = sock_wait_for_wmem(sk, timeo);
1892 	}
1893 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1894 				   errcode, sk->sk_allocation);
1895 	if (skb)
1896 		skb_set_owner_w(skb, sk);
1897 	return skb;
1898 
1899 interrupted:
1900 	err = sock_intr_errno(timeo);
1901 failure:
1902 	*errcode = err;
1903 	return NULL;
1904 }
1905 EXPORT_SYMBOL(sock_alloc_send_pskb);
1906 
1907 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1908 				    int noblock, int *errcode)
1909 {
1910 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1911 }
1912 EXPORT_SYMBOL(sock_alloc_send_skb);
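/*
 * Illustrative datagram send path (hypothetical caller): block until send
 * buffer space is available, honouring MSG_DONTWAIT and the send timeout:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 */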
1913 
1914 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1915 		     struct sockcm_cookie *sockc)
1916 {
1917 	u32 tsflags;
1918 
1919 	switch (cmsg->cmsg_type) {
1920 	case SO_MARK:
1921 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1922 			return -EPERM;
1923 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1924 			return -EINVAL;
1925 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1926 		break;
1927 	case SO_TIMESTAMPING:
1928 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1929 			return -EINVAL;
1930 
1931 		tsflags = *(u32 *)CMSG_DATA(cmsg);
1932 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1933 			return -EINVAL;
1934 
1935 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1936 		sockc->tsflags |= tsflags;
1937 		break;
1938 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1939 	case SCM_RIGHTS:
1940 	case SCM_CREDENTIALS:
1941 		break;
1942 	default:
1943 		return -EINVAL;
1944 	}
1945 	return 0;
1946 }
1947 EXPORT_SYMBOL(__sock_cmsg_send);
1948 
1949 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1950 		   struct sockcm_cookie *sockc)
1951 {
1952 	struct cmsghdr *cmsg;
1953 	int ret;
1954 
1955 	for_each_cmsghdr(cmsg, msg) {
1956 		if (!CMSG_OK(msg, cmsg))
1957 			return -EINVAL;
1958 		if (cmsg->cmsg_level != SOL_SOCKET)
1959 			continue;
1960 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1961 		if (ret)
1962 			return ret;
1963 	}
1964 	return 0;
1965 }
1966 EXPORT_SYMBOL(sock_cmsg_send);
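/*
 * Illustrative use in a sendmsg() implementation (hypothetical caller):
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 *
 * The resulting sockc.mark and sockc.tsflags are then applied to the
 * outgoing skb.
 */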
1967 
1968 /* On 32bit arches, an skb frag is limited to 2^15 */
1969 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1970 
1971 /**
1972  * skb_page_frag_refill - check that a page_frag contains enough room
1973  * @sz: minimum size of the fragment we want to get
1974  * @pfrag: pointer to page_frag
1975  * @gfp: priority for memory allocation
1976  *
1977  * Note: While this allocator tries to use high order pages, there is
1978  * no guarantee that allocations succeed. Therefore, @sz MUST be
1979  * less than or equal to PAGE_SIZE.
1980  */
1981 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1982 {
1983 	if (pfrag->page) {
1984 		if (page_ref_count(pfrag->page) == 1) {
1985 			pfrag->offset = 0;
1986 			return true;
1987 		}
1988 		if (pfrag->offset + sz <= pfrag->size)
1989 			return true;
1990 		put_page(pfrag->page);
1991 	}
1992 
1993 	pfrag->offset = 0;
1994 	if (SKB_FRAG_PAGE_ORDER) {
1995 		/* Avoid direct reclaim but allow kswapd to wake */
1996 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1997 					  __GFP_COMP | __GFP_NOWARN |
1998 					  __GFP_NORETRY,
1999 					  SKB_FRAG_PAGE_ORDER);
2000 		if (likely(pfrag->page)) {
2001 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2002 			return true;
2003 		}
2004 	}
2005 	pfrag->page = alloc_page(gfp);
2006 	if (likely(pfrag->page)) {
2007 		pfrag->size = PAGE_SIZE;
2008 		return true;
2009 	}
2010 	return false;
2011 }
2012 EXPORT_SYMBOL(skb_page_frag_refill);
2013 
2014 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2015 {
2016 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2017 		return true;
2018 
2019 	sk_enter_memory_pressure(sk);
2020 	sk_stream_moderate_sndbuf(sk);
2021 	return false;
2022 }
2023 EXPORT_SYMBOL(sk_page_frag_refill);
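/*
 * Illustrative fill pattern (roughly what stream protocols do with the
 * per-socket page_frag): refill, copy at the current offset, then advance:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */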
2024 
2025 static void __lock_sock(struct sock *sk)
2026 	__releases(&sk->sk_lock.slock)
2027 	__acquires(&sk->sk_lock.slock)
2028 {
2029 	DEFINE_WAIT(wait);
2030 
2031 	for (;;) {
2032 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2033 					TASK_UNINTERRUPTIBLE);
2034 		spin_unlock_bh(&sk->sk_lock.slock);
2035 		schedule();
2036 		spin_lock_bh(&sk->sk_lock.slock);
2037 		if (!sock_owned_by_user(sk))
2038 			break;
2039 	}
2040 	finish_wait(&sk->sk_lock.wq, &wait);
2041 }
2042 
2043 static void __release_sock(struct sock *sk)
2044 	__releases(&sk->sk_lock.slock)
2045 	__acquires(&sk->sk_lock.slock)
2046 {
2047 	struct sk_buff *skb, *next;
2048 
2049 	while ((skb = sk->sk_backlog.head) != NULL) {
2050 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2051 
2052 		spin_unlock_bh(&sk->sk_lock.slock);
2053 
2054 		do {
2055 			next = skb->next;
2056 			prefetch(next);
2057 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2058 			skb->next = NULL;
2059 			sk_backlog_rcv(sk, skb);
2060 
2061 			cond_resched();
2062 
2063 			skb = next;
2064 		} while (skb != NULL);
2065 
2066 		spin_lock_bh(&sk->sk_lock.slock);
2067 	}
2068 
2069 	/*
2070 	 * Doing the zeroing here guarantees we cannot loop forever
2071 	 * while a wild producer attempts to flood us.
2072 	 */
2073 	sk->sk_backlog.len = 0;
2074 }
2075 
2076 void __sk_flush_backlog(struct sock *sk)
2077 {
2078 	spin_lock_bh(&sk->sk_lock.slock);
2079 	__release_sock(sk);
2080 	spin_unlock_bh(&sk->sk_lock.slock);
2081 }
2082 
2083 /**
2084  * sk_wait_data - wait for data to arrive at sk_receive_queue
2085  * @sk:    sock to wait on
2086  * @timeo: for how long
2087  * @skb:   last skb seen on sk_receive_queue
2088  *
2089  * Socket state, including sk->sk_err, is changed only under the socket
2090  * lock, hence we may omit checks after joining the wait queue.
2091  * We check the receive queue before schedule() only as an optimization;
2092  * it is very likely that release_sock() added new data.
2093  */
2094 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2095 {
2096 	int rc;
2097 	DEFINE_WAIT(wait);
2098 
2099 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2100 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2101 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2102 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2103 	finish_wait(sk_sleep(sk), &wait);
2104 	return rc;
2105 }
2106 EXPORT_SYMBOL(sk_wait_data);
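/*
 * Illustrative receive loop (hypothetical caller, socket lock held):
 * sk_wait_event() drops and retakes the socket lock while sleeping, so the
 * backlog is processed before the queue is inspected again:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */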
2107 
2108 /**
2109  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2110  *	@sk: socket
2111  *	@size: memory size to allocate
2112  *	@kind: allocation type
2113  *
2114  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2115  *	rmem allocation. This function assumes that protocols which have
2116  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2117  */
2118 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2119 {
2120 	struct proto *prot = sk->sk_prot;
2121 	int amt = sk_mem_pages(size);
2122 	long allocated;
2123 
2124 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2125 
2126 	allocated = sk_memory_allocated_add(sk, amt);
2127 
2128 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2129 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2130 		goto suppress_allocation;
2131 
2132 	/* Under limit. */
2133 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2134 		sk_leave_memory_pressure(sk);
2135 		return 1;
2136 	}
2137 
2138 	/* Under pressure. */
2139 	if (allocated > sk_prot_mem_limits(sk, 1))
2140 		sk_enter_memory_pressure(sk);
2141 
2142 	/* Over hard limit. */
2143 	if (allocated > sk_prot_mem_limits(sk, 2))
2144 		goto suppress_allocation;
2145 
2146 	/* guarantee minimum buffer size under pressure */
2147 	if (kind == SK_MEM_RECV) {
2148 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2149 			return 1;
2150 
2151 	} else { /* SK_MEM_SEND */
2152 		if (sk->sk_type == SOCK_STREAM) {
2153 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2154 				return 1;
2155 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2156 			   prot->sysctl_wmem[0])
2157 				return 1;
2158 	}
2159 
2160 	if (sk_has_memory_pressure(sk)) {
2161 		int alloc;
2162 
2163 		if (!sk_under_memory_pressure(sk))
2164 			return 1;
2165 		alloc = sk_sockets_allocated_read_positive(sk);
2166 		if (sk_prot_mem_limits(sk, 2) > alloc *
2167 		    sk_mem_pages(sk->sk_wmem_queued +
2168 				 atomic_read(&sk->sk_rmem_alloc) +
2169 				 sk->sk_forward_alloc))
2170 			return 1;
2171 	}
2172 
2173 suppress_allocation:
2174 
2175 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2176 		sk_stream_moderate_sndbuf(sk);
2177 
2178 		/* Fail only if socket is _under_ its sndbuf.
2179 		 * In this case we cannot block, so we have to fail.
2180 		 */
2181 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2182 			return 1;
2183 	}
2184 
2185 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2186 
2187 	/* Alas. Undo changes. */
2188 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2189 
2190 	sk_memory_allocated_sub(sk, amt);
2191 
2192 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2193 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2194 
2195 	return 0;
2196 }
2197 EXPORT_SYMBOL(__sk_mem_schedule);
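/*
 * Callers normally go through the sk_wmem_schedule()/sk_rmem_schedule()
 * helpers in include/net/sock.h, which fall back to __sk_mem_schedule() only
 * when the request does not fit in the already-charged sk_forward_alloc.
 * Rough shape of the write-side wrapper:
 *
 *	if (size <= sk->sk_forward_alloc)
 *		return 1;
 *	return __sk_mem_schedule(sk, size, SK_MEM_SEND);
 */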
2198 
2199 /**
2200  *	__sk_mem_reclaim - reclaim memory_allocated
2201  *	@sk: socket
2202  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2203  */
2204 void __sk_mem_reclaim(struct sock *sk, int amount)
2205 {
2206 	amount >>= SK_MEM_QUANTUM_SHIFT;
2207 	sk_memory_allocated_sub(sk, amount);
2208 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2209 
2210 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2211 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2212 
2213 	if (sk_under_memory_pressure(sk) &&
2214 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2215 		sk_leave_memory_pressure(sk);
2216 }
2217 EXPORT_SYMBOL(__sk_mem_reclaim);
2218 
2219 int sk_set_peek_off(struct sock *sk, int val)
2220 {
2221 	if (val < 0)
2222 		return -EINVAL;
2223 
2224 	sk->sk_peek_off = val;
2225 	return 0;
2226 }
2227 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2228 
2229 /*
2230  * Set of default routines for initialising struct proto_ops when
2231  * the protocol does not support a particular function. In certain
2232  * cases where it makes no sense for a protocol to have a "do nothing"
2233  * function, some default processing is provided.
2234  */
2235 
2236 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2237 {
2238 	return -EOPNOTSUPP;
2239 }
2240 EXPORT_SYMBOL(sock_no_bind);
2241 
2242 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2243 		    int len, int flags)
2244 {
2245 	return -EOPNOTSUPP;
2246 }
2247 EXPORT_SYMBOL(sock_no_connect);
2248 
2249 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2250 {
2251 	return -EOPNOTSUPP;
2252 }
2253 EXPORT_SYMBOL(sock_no_socketpair);
2254 
2255 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2256 {
2257 	return -EOPNOTSUPP;
2258 }
2259 EXPORT_SYMBOL(sock_no_accept);
2260 
2261 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2262 		    int *len, int peer)
2263 {
2264 	return -EOPNOTSUPP;
2265 }
2266 EXPORT_SYMBOL(sock_no_getname);
2267 
2268 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2269 {
2270 	return 0;
2271 }
2272 EXPORT_SYMBOL(sock_no_poll);
2273 
2274 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2275 {
2276 	return -EOPNOTSUPP;
2277 }
2278 EXPORT_SYMBOL(sock_no_ioctl);
2279 
2280 int sock_no_listen(struct socket *sock, int backlog)
2281 {
2282 	return -EOPNOTSUPP;
2283 }
2284 EXPORT_SYMBOL(sock_no_listen);
2285 
2286 int sock_no_shutdown(struct socket *sock, int how)
2287 {
2288 	return -EOPNOTSUPP;
2289 }
2290 EXPORT_SYMBOL(sock_no_shutdown);
2291 
2292 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2293 		    char __user *optval, unsigned int optlen)
2294 {
2295 	return -EOPNOTSUPP;
2296 }
2297 EXPORT_SYMBOL(sock_no_setsockopt);
2298 
2299 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2300 		    char __user *optval, int __user *optlen)
2301 {
2302 	return -EOPNOTSUPP;
2303 }
2304 EXPORT_SYMBOL(sock_no_getsockopt);
2305 
2306 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2307 {
2308 	return -EOPNOTSUPP;
2309 }
2310 EXPORT_SYMBOL(sock_no_sendmsg);
2311 
2312 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2313 		    int flags)
2314 {
2315 	return -EOPNOTSUPP;
2316 }
2317 EXPORT_SYMBOL(sock_no_recvmsg);
2318 
2319 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2320 {
2321 	/* Mirror missing mmap method error code */
2322 	return -ENODEV;
2323 }
2324 EXPORT_SYMBOL(sock_no_mmap);
2325 
2326 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2327 {
2328 	ssize_t res;
2329 	struct msghdr msg = {.msg_flags = flags};
2330 	struct kvec iov;
2331 	char *kaddr = kmap(page);
2332 	iov.iov_base = kaddr + offset;
2333 	iov.iov_len = size;
2334 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2335 	kunmap(page);
2336 	return res;
2337 }
2338 EXPORT_SYMBOL(sock_no_sendpage);
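/*
 * Illustrative use of the sock_no_* stubs (hypothetical protocol): a
 * proto_ops table simply plugs them in for operations it does not support:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.mmap		= sock_no_mmap,
 *		.socketpair	= sock_no_socketpair,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */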
2339 
2340 /*
2341  *	Default Socket Callbacks
2342  */
2343 
2344 static void sock_def_wakeup(struct sock *sk)
2345 {
2346 	struct socket_wq *wq;
2347 
2348 	rcu_read_lock();
2349 	wq = rcu_dereference(sk->sk_wq);
2350 	if (skwq_has_sleeper(wq))
2351 		wake_up_interruptible_all(&wq->wait);
2352 	rcu_read_unlock();
2353 }
2354 
2355 static void sock_def_error_report(struct sock *sk)
2356 {
2357 	struct socket_wq *wq;
2358 
2359 	rcu_read_lock();
2360 	wq = rcu_dereference(sk->sk_wq);
2361 	if (skwq_has_sleeper(wq))
2362 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2363 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2364 	rcu_read_unlock();
2365 }
2366 
2367 static void sock_def_readable(struct sock *sk)
2368 {
2369 	struct socket_wq *wq;
2370 
2371 	rcu_read_lock();
2372 	wq = rcu_dereference(sk->sk_wq);
2373 	if (skwq_has_sleeper(wq))
2374 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2375 						POLLRDNORM | POLLRDBAND);
2376 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2377 	rcu_read_unlock();
2378 }
2379 
2380 static void sock_def_write_space(struct sock *sk)
2381 {
2382 	struct socket_wq *wq;
2383 
2384 	rcu_read_lock();
2385 
2386 	/* Do not wake up a writer until he can make "significant"
2387 	 * progress.  --DaveM
2388 	 */
2389 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2390 		wq = rcu_dereference(sk->sk_wq);
2391 		if (skwq_has_sleeper(wq))
2392 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2393 						POLLWRNORM | POLLWRBAND);
2394 
2395 		/* Should agree with poll, otherwise some programs break */
2396 		if (sock_writeable(sk))
2397 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2398 	}
2399 
2400 	rcu_read_unlock();
2401 }
2402 
2403 static void sock_def_destruct(struct sock *sk)
2404 {
2405 }
2406 
2407 void sk_send_sigurg(struct sock *sk)
2408 {
2409 	if (sk->sk_socket && sk->sk_socket->file)
2410 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2411 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2412 }
2413 EXPORT_SYMBOL(sk_send_sigurg);
2414 
2415 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2416 		    unsigned long expires)
2417 {
2418 	if (!mod_timer(timer, expires))
2419 		sock_hold(sk);
2420 }
2421 EXPORT_SYMBOL(sk_reset_timer);
2422 
2423 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
2424 {
2425 	if (del_timer(timer))
2426 		__sock_put(sk);
2427 }
2428 EXPORT_SYMBOL(sk_stop_timer);
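/*
 * Reference-counting note: mod_timer() returns 0 when the timer was not
 * already pending, so sk_reset_timer() takes a socket reference exactly once
 * per armed timer; sk_stop_timer() drops it only if del_timer() actually
 * removed a pending timer. Illustrative use with the generic sk_timer:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */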
2429 
2430 void sock_init_data(struct socket *sock, struct sock *sk)
2431 {
2432 	skb_queue_head_init(&sk->sk_receive_queue);
2433 	skb_queue_head_init(&sk->sk_write_queue);
2434 	skb_queue_head_init(&sk->sk_error_queue);
2435 
2436 	sk->sk_send_head	=	NULL;
2437 
2438 	init_timer(&sk->sk_timer);
2439 
2440 	sk->sk_allocation	=	GFP_KERNEL;
2441 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2442 	sk->sk_sndbuf		=	sysctl_wmem_default;
2443 	sk->sk_state		=	TCP_CLOSE;
2444 	sk_set_socket(sk, sock);
2445 
2446 	sock_set_flag(sk, SOCK_ZAPPED);
2447 
2448 	if (sock) {
2449 		sk->sk_type	=	sock->type;
2450 		sk->sk_wq	=	sock->wq;
2451 		sock->sk	=	sk;
2452 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2453 	} else {
2454 		sk->sk_wq	=	NULL;
2455 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2456 	}
2457 
2458 	rwlock_init(&sk->sk_callback_lock);
2459 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2460 			af_callback_keys + sk->sk_family,
2461 			af_family_clock_key_strings[sk->sk_family]);
2462 
2463 	sk->sk_state_change	=	sock_def_wakeup;
2464 	sk->sk_data_ready	=	sock_def_readable;
2465 	sk->sk_write_space	=	sock_def_write_space;
2466 	sk->sk_error_report	=	sock_def_error_report;
2467 	sk->sk_destruct		=	sock_def_destruct;
2468 
2469 	sk->sk_frag.page	=	NULL;
2470 	sk->sk_frag.offset	=	0;
2471 	sk->sk_peek_off		=	-1;
2472 
2473 	sk->sk_peer_pid 	=	NULL;
2474 	sk->sk_peer_cred	=	NULL;
2475 	sk->sk_write_pending	=	0;
2476 	sk->sk_rcvlowat		=	1;
2477 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2478 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2479 
2480 	sk->sk_stamp = ktime_set(-1L, 0);
2481 
2482 #ifdef CONFIG_NET_RX_BUSY_POLL
2483 	sk->sk_napi_id		=	0;
2484 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2485 #endif
2486 
2487 	sk->sk_max_pacing_rate = ~0U;
2488 	sk->sk_pacing_rate = ~0U;
2489 	sk->sk_incoming_cpu = -1;
2490 	/*
2491 	 * Before updating sk_refcnt, we must commit prior changes to memory
2492 	 * (Documentation/RCU/rculist_nulls.txt for details)
2493 	 */
2494 	smp_wmb();
2495 	atomic_set(&sk->sk_refcnt, 1);
2496 	atomic_set(&sk->sk_drops, 0);
2497 }
2498 EXPORT_SYMBOL(sock_init_data);
2499 
2500 void lock_sock_nested(struct sock *sk, int subclass)
2501 {
2502 	might_sleep();
2503 	spin_lock_bh(&sk->sk_lock.slock);
2504 	if (sk->sk_lock.owned)
2505 		__lock_sock(sk);
2506 	sk->sk_lock.owned = 1;
2507 	spin_unlock(&sk->sk_lock.slock);
2508 	/*
2509 	 * The sk_lock has mutex_lock() semantics here:
2510 	 */
2511 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2512 	local_bh_enable();
2513 }
2514 EXPORT_SYMBOL(lock_sock_nested);
2515 
2516 void release_sock(struct sock *sk)
2517 {
2518 	spin_lock_bh(&sk->sk_lock.slock);
2519 	if (sk->sk_backlog.tail)
2520 		__release_sock(sk);
2521 
2522 	/* Warning: release_cb() might need to release sk ownership,
2523 	 * i.e. call sock_release_ownership(sk) before us.
2524 	 */
2525 	if (sk->sk_prot->release_cb)
2526 		sk->sk_prot->release_cb(sk);
2527 
2528 	sock_release_ownership(sk);
2529 	if (waitqueue_active(&sk->sk_lock.wq))
2530 		wake_up(&sk->sk_lock.wq);
2531 	spin_unlock_bh(&sk->sk_lock.slock);
2532 }
2533 EXPORT_SYMBOL(release_sock);
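/*
 * Illustrative process-context usage (hypothetical caller): own the socket,
 * mutate its state, then let release_sock() run the backlog that softirq
 * context queued while we held ownership:
 *
 *	lock_sock(sk);
 *	... modify sk state, walk queues, call sk->sk_prot methods ...
 *	release_sock(sk);
 */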
2534 
2535 /**
2536  * lock_sock_fast - fast version of lock_sock
2537  * @sk: socket
2538  *
2539  * This version should be used for very small sections, where the process won't block.
2540  * Returns false if the fast path is taken:
2541  *   sk_lock.slock locked, owned = 0, BH disabled
2542  * Returns true if the slow path is taken:
2543  *   sk_lock.slock unlocked, owned = 1, BH enabled
2544  */
2545 bool lock_sock_fast(struct sock *sk)
2546 {
2547 	might_sleep();
2548 	spin_lock_bh(&sk->sk_lock.slock);
2549 
2550 	if (!sk->sk_lock.owned)
2551 		/*
2552 		 * Note: the fast path returns with BH still disabled.
2553 		 */
2554 		return false;
2555 
2556 	__lock_sock(sk);
2557 	sk->sk_lock.owned = 1;
2558 	spin_unlock(&sk->sk_lock.slock);
2559 	/*
2560 	 * The sk_lock has mutex_lock() semantics here:
2561 	 */
2562 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2563 	local_bh_enable();
2564 	return true;
2565 }
2566 EXPORT_SYMBOL(lock_sock_fast);
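/*
 * Illustrative pairing (hypothetical caller): the return value must be fed
 * back to unlock_sock_fast(), which either spin_unlock_bh()s (fast path) or
 * does a full release_sock() (slow path):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking section ...
 *	unlock_sock_fast(sk, slow);
 */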
2567 
2568 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2569 {
2570 	struct timeval tv;
2571 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2572 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2573 	tv = ktime_to_timeval(sk->sk_stamp);
2574 	if (tv.tv_sec == -1)
2575 		return -ENOENT;
2576 	if (tv.tv_sec == 0) {
2577 		sk->sk_stamp = ktime_get_real();
2578 		tv = ktime_to_timeval(sk->sk_stamp);
2579 	}
2580 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2581 }
2582 EXPORT_SYMBOL(sock_get_timestamp);
2583 
2584 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2585 {
2586 	struct timespec ts;
2587 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2588 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2589 	ts = ktime_to_timespec(sk->sk_stamp);
2590 	if (ts.tv_sec == -1)
2591 		return -ENOENT;
2592 	if (ts.tv_sec == 0) {
2593 		sk->sk_stamp = ktime_get_real();
2594 		ts = ktime_to_timespec(sk->sk_stamp);
2595 	}
2596 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2597 }
2598 EXPORT_SYMBOL(sock_get_timestampns);
2599 
2600 void sock_enable_timestamp(struct sock *sk, int flag)
2601 {
2602 	if (!sock_flag(sk, flag)) {
2603 		unsigned long previous_flags = sk->sk_flags;
2604 
2605 		sock_set_flag(sk, flag);
2606 		/*
2607 		 * We just set one of the two flags which require net
2608 		 * time stamping, but time stamping might already have been
2609 		 * on because of the other one.
2610 		 */
2611 		if (sock_needs_netstamp(sk) &&
2612 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2613 			net_enable_timestamp();
2614 	}
2615 }
2616 
2617 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2618 		       int level, int type)
2619 {
2620 	struct sock_exterr_skb *serr;
2621 	struct sk_buff *skb;
2622 	int copied, err;
2623 
2624 	err = -EAGAIN;
2625 	skb = sock_dequeue_err_skb(sk);
2626 	if (skb == NULL)
2627 		goto out;
2628 
2629 	copied = skb->len;
2630 	if (copied > len) {
2631 		msg->msg_flags |= MSG_TRUNC;
2632 		copied = len;
2633 	}
2634 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2635 	if (err)
2636 		goto out_free_skb;
2637 
2638 	sock_recv_timestamp(msg, sk, skb);
2639 
2640 	serr = SKB_EXT_ERR(skb);
2641 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2642 
2643 	msg->msg_flags |= MSG_ERRQUEUE;
2644 	err = copied;
2645 
2646 out_free_skb:
2647 	kfree_skb(skb);
2648 out:
2649 	return err;
2650 }
2651 EXPORT_SYMBOL(sock_recv_errqueue);
2652 
2653 /*
2654  *	Get a socket option on a socket.
2655  *
2656  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2657  *	asynchronous errors should be reported by getsockopt. We assume
2658  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2659  */
2660 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2661 			   char __user *optval, int __user *optlen)
2662 {
2663 	struct sock *sk = sock->sk;
2664 
2665 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2666 }
2667 EXPORT_SYMBOL(sock_common_getsockopt);
2668 
2669 #ifdef CONFIG_COMPAT
2670 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2671 				  char __user *optval, int __user *optlen)
2672 {
2673 	struct sock *sk = sock->sk;
2674 
2675 	if (sk->sk_prot->compat_getsockopt != NULL)
2676 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2677 						      optval, optlen);
2678 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2679 }
2680 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2681 #endif
2682 
2683 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2684 			int flags)
2685 {
2686 	struct sock *sk = sock->sk;
2687 	int addr_len = 0;
2688 	int err;
2689 
2690 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2691 				   flags & ~MSG_DONTWAIT, &addr_len);
2692 	if (err >= 0)
2693 		msg->msg_namelen = addr_len;
2694 	return err;
2695 }
2696 EXPORT_SYMBOL(sock_common_recvmsg);
2697 
2698 /*
2699  *	Set socket options on an inet socket.
2700  */
2701 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2702 			   char __user *optval, unsigned int optlen)
2703 {
2704 	struct sock *sk = sock->sk;
2705 
2706 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2707 }
2708 EXPORT_SYMBOL(sock_common_setsockopt);
2709 
2710 #ifdef CONFIG_COMPAT
2711 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2712 				  char __user *optval, unsigned int optlen)
2713 {
2714 	struct sock *sk = sock->sk;
2715 
2716 	if (sk->sk_prot->compat_setsockopt != NULL)
2717 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2718 						      optval, optlen);
2719 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2720 }
2721 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2722 #endif
2723 
2724 void sk_common_release(struct sock *sk)
2725 {
2726 	if (sk->sk_prot->destroy)
2727 		sk->sk_prot->destroy(sk);
2728 
2729 	/*
2730 	 * Observation: when sk_common_release() is called, processes have
2731 	 * no access to the socket, but the network stack still does.
2732 	 * Step one: detach it from networking:
2733 	 *
2734 	 * A. Remove from hash tables.
2735 	 */
2736 
2737 	sk->sk_prot->unhash(sk);
2738 
2739 	/*
2740 	 * At this point the socket cannot receive new packets, but it is
2741 	 * possible that some packets are in flight because some CPU runs the
2742 	 * receiver and did the hash table lookup before we unhashed the socket.
2743 	 * They will reach the receive queue and be purged by the socket destructor.
2744 	 *
2745 	 * Also, we still have packets pending on the receive queue and, probably,
2746 	 * our own packets waiting in device queues. sock_destroy will drain the
2747 	 * receive queue, but transmitted packets will delay socket destruction
2748 	 * until the last reference is released.
2749 	 */
2750 
2751 	sock_orphan(sk);
2752 
2753 	xfrm_sk_free_policy(sk);
2754 
2755 	sk_refcnt_debug_release(sk);
2756 
2757 	sock_put(sk);
2758 }
2759 EXPORT_SYMBOL(sk_common_release);
2760 
2761 #ifdef CONFIG_PROC_FS
2762 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2763 struct prot_inuse {
2764 	int val[PROTO_INUSE_NR];
2765 };
2766 
2767 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2768 
2769 #ifdef CONFIG_NET_NS
2770 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2771 {
2772 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2773 }
2774 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2775 
2776 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2777 {
2778 	int cpu, idx = prot->inuse_idx;
2779 	int res = 0;
2780 
2781 	for_each_possible_cpu(cpu)
2782 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2783 
2784 	return res >= 0 ? res : 0;
2785 }
2786 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
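/*
 * Illustrative bookkeeping (roughly what protocol hash/unhash paths do):
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 *
 * The per-cpu counters are summed back together by sock_prot_inuse_get()
 * when /proc/net/protocols is read.
 */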
2787 
2788 static int __net_init sock_inuse_init_net(struct net *net)
2789 {
2790 	net->core.inuse = alloc_percpu(struct prot_inuse);
2791 	return net->core.inuse ? 0 : -ENOMEM;
2792 }
2793 
2794 static void __net_exit sock_inuse_exit_net(struct net *net)
2795 {
2796 	free_percpu(net->core.inuse);
2797 }
2798 
2799 static struct pernet_operations net_inuse_ops = {
2800 	.init = sock_inuse_init_net,
2801 	.exit = sock_inuse_exit_net,
2802 };
2803 
2804 static __init int net_inuse_init(void)
2805 {
2806 	if (register_pernet_subsys(&net_inuse_ops))
2807 		panic("Cannot initialize net inuse counters");
2808 
2809 	return 0;
2810 }
2811 
2812 core_initcall(net_inuse_init);
2813 #else
2814 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2815 
2816 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2817 {
2818 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2819 }
2820 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2821 
2822 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2823 {
2824 	int cpu, idx = prot->inuse_idx;
2825 	int res = 0;
2826 
2827 	for_each_possible_cpu(cpu)
2828 		res += per_cpu(prot_inuse, cpu).val[idx];
2829 
2830 	return res >= 0 ? res : 0;
2831 }
2832 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2833 #endif
2834 
2835 static void assign_proto_idx(struct proto *prot)
2836 {
2837 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2838 
2839 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2840 		pr_err("PROTO_INUSE_NR exhausted\n");
2841 		return;
2842 	}
2843 
2844 	set_bit(prot->inuse_idx, proto_inuse_idx);
2845 }
2846 
2847 static void release_proto_idx(struct proto *prot)
2848 {
2849 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2850 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2851 }
2852 #else
2853 static inline void assign_proto_idx(struct proto *prot)
2854 {
2855 }
2856 
2857 static inline void release_proto_idx(struct proto *prot)
2858 {
2859 }
2860 #endif
2861 
2862 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2863 {
2864 	if (!rsk_prot)
2865 		return;
2866 	kfree(rsk_prot->slab_name);
2867 	rsk_prot->slab_name = NULL;
2868 	kmem_cache_destroy(rsk_prot->slab);
2869 	rsk_prot->slab = NULL;
2870 }
2871 
2872 static int req_prot_init(const struct proto *prot)
2873 {
2874 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2875 
2876 	if (!rsk_prot)
2877 		return 0;
2878 
2879 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2880 					prot->name);
2881 	if (!rsk_prot->slab_name)
2882 		return -ENOMEM;
2883 
2884 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2885 					   rsk_prot->obj_size, 0,
2886 					   prot->slab_flags, NULL);
2887 
2888 	if (!rsk_prot->slab) {
2889 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2890 			prot->name);
2891 		return -ENOMEM;
2892 	}
2893 	return 0;
2894 }
2895 
2896 int proto_register(struct proto *prot, int alloc_slab)
2897 {
2898 	if (alloc_slab) {
2899 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2900 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2901 					NULL);
2902 
2903 		if (prot->slab == NULL) {
2904 			pr_crit("%s: Can't create sock SLAB cache!\n",
2905 				prot->name);
2906 			goto out;
2907 		}
2908 
2909 		if (req_prot_init(prot))
2910 			goto out_free_request_sock_slab;
2911 
2912 		if (prot->twsk_prot != NULL) {
2913 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2914 
2915 			if (prot->twsk_prot->twsk_slab_name == NULL)
2916 				goto out_free_request_sock_slab;
2917 
2918 			prot->twsk_prot->twsk_slab =
2919 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2920 						  prot->twsk_prot->twsk_obj_size,
2921 						  0,
2922 						  prot->slab_flags,
2923 						  NULL);
2924 			if (prot->twsk_prot->twsk_slab == NULL)
2925 				goto out_free_timewait_sock_slab_name;
2926 		}
2927 	}
2928 
2929 	mutex_lock(&proto_list_mutex);
2930 	list_add(&prot->node, &proto_list);
2931 	assign_proto_idx(prot);
2932 	mutex_unlock(&proto_list_mutex);
2933 	return 0;
2934 
2935 out_free_timewait_sock_slab_name:
2936 	kfree(prot->twsk_prot->twsk_slab_name);
2937 out_free_request_sock_slab:
2938 	req_prot_cleanup(prot->rsk_prot);
2939 
2940 	kmem_cache_destroy(prot->slab);
2941 	prot->slab = NULL;
2942 out:
2943 	return -ENOBUFS;
2944 }
2945 EXPORT_SYMBOL(proto_register);
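/*
 * Illustrative registration (hypothetical protocol module): obj_size tells
 * sk_alloc() how much to allocate per socket of this type, and alloc_slab=1
 * requests a dedicated kmem cache:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 */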
2946 
2947 void proto_unregister(struct proto *prot)
2948 {
2949 	mutex_lock(&proto_list_mutex);
2950 	release_proto_idx(prot);
2951 	list_del(&prot->node);
2952 	mutex_unlock(&proto_list_mutex);
2953 
2954 	kmem_cache_destroy(prot->slab);
2955 	prot->slab = NULL;
2956 
2957 	req_prot_cleanup(prot->rsk_prot);
2958 
2959 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2960 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2961 		kfree(prot->twsk_prot->twsk_slab_name);
2962 		prot->twsk_prot->twsk_slab = NULL;
2963 	}
2964 }
2965 EXPORT_SYMBOL(proto_unregister);
2966 
2967 #ifdef CONFIG_PROC_FS
2968 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2969 	__acquires(proto_list_mutex)
2970 {
2971 	mutex_lock(&proto_list_mutex);
2972 	return seq_list_start_head(&proto_list, *pos);
2973 }
2974 
2975 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2976 {
2977 	return seq_list_next(v, &proto_list, pos);
2978 }
2979 
2980 static void proto_seq_stop(struct seq_file *seq, void *v)
2981 	__releases(proto_list_mutex)
2982 {
2983 	mutex_unlock(&proto_list_mutex);
2984 }
2985 
2986 static char proto_method_implemented(const void *method)
2987 {
2988 	return method == NULL ? 'n' : 'y';
2989 }
2990 static long sock_prot_memory_allocated(struct proto *proto)
2991 {
2992 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2993 }
2994 
2995 static char *sock_prot_memory_pressure(struct proto *proto)
2996 {
2997 	return proto->memory_pressure != NULL ?
2998 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2999 }
3000 
3001 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3002 {
3003 
3004 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3005 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3006 		   proto->name,
3007 		   proto->obj_size,
3008 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3009 		   sock_prot_memory_allocated(proto),
3010 		   sock_prot_memory_pressure(proto),
3011 		   proto->max_header,
3012 		   proto->slab == NULL ? "no" : "yes",
3013 		   module_name(proto->owner),
3014 		   proto_method_implemented(proto->close),
3015 		   proto_method_implemented(proto->connect),
3016 		   proto_method_implemented(proto->disconnect),
3017 		   proto_method_implemented(proto->accept),
3018 		   proto_method_implemented(proto->ioctl),
3019 		   proto_method_implemented(proto->init),
3020 		   proto_method_implemented(proto->destroy),
3021 		   proto_method_implemented(proto->shutdown),
3022 		   proto_method_implemented(proto->setsockopt),
3023 		   proto_method_implemented(proto->getsockopt),
3024 		   proto_method_implemented(proto->sendmsg),
3025 		   proto_method_implemented(proto->recvmsg),
3026 		   proto_method_implemented(proto->sendpage),
3027 		   proto_method_implemented(proto->bind),
3028 		   proto_method_implemented(proto->backlog_rcv),
3029 		   proto_method_implemented(proto->hash),
3030 		   proto_method_implemented(proto->unhash),
3031 		   proto_method_implemented(proto->get_port),
3032 		   proto_method_implemented(proto->enter_memory_pressure));
3033 }
3034 
3035 static int proto_seq_show(struct seq_file *seq, void *v)
3036 {
3037 	if (v == &proto_list)
3038 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3039 			   "protocol",
3040 			   "size",
3041 			   "sockets",
3042 			   "memory",
3043 			   "press",
3044 			   "maxhdr",
3045 			   "slab",
3046 			   "module",
3047 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3048 	else
3049 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3050 	return 0;
3051 }
3052 
3053 static const struct seq_operations proto_seq_ops = {
3054 	.start  = proto_seq_start,
3055 	.next   = proto_seq_next,
3056 	.stop   = proto_seq_stop,
3057 	.show   = proto_seq_show,
3058 };
3059 
3060 static int proto_seq_open(struct inode *inode, struct file *file)
3061 {
3062 	return seq_open_net(inode, file, &proto_seq_ops,
3063 			    sizeof(struct seq_net_private));
3064 }
3065 
3066 static const struct file_operations proto_seq_fops = {
3067 	.owner		= THIS_MODULE,
3068 	.open		= proto_seq_open,
3069 	.read		= seq_read,
3070 	.llseek		= seq_lseek,
3071 	.release	= seq_release_net,
3072 };
3073 
3074 static __net_init int proto_init_net(struct net *net)
3075 {
3076 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3077 		return -ENOMEM;
3078 
3079 	return 0;
3080 }
3081 
3082 static __net_exit void proto_exit_net(struct net *net)
3083 {
3084 	remove_proc_entry("protocols", net->proc_net);
3085 }
3086 
3087 
3088 static __net_initdata struct pernet_operations proto_net_ops = {
3089 	.init = proto_init_net,
3090 	.exit = proto_exit_net,
3091 };
3092 
3093 static int __init proto_init(void)
3094 {
3095 	return register_pernet_subsys(&proto_net_ops);
3096 }
3097 
3098 subsys_initcall(proto_init);
3099 
3100 #endif /* PROC_FS */
3101