1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 /**
149  * sk_ns_capable - General socket capability test
150  * @sk: Socket to use a capability on or through
151  * @user_ns: The user namespace of the capability to use
152  * @cap: The capability to use
153  *
154  * Test to see if the opener of the socket had the capability @cap when
155  * the socket was created and that the current process has the capability
156  * @cap in the user namespace @user_ns.
157  */
158 bool sk_ns_capable(const struct sock *sk,
159 		   struct user_namespace *user_ns, int cap)
160 {
161 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 		ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165 
166 /**
167  * sk_capable - Socket global capability test
168  * @sk: Socket to use a capability on or through
169  * @cap: The global capability to use
170  *
171  * Test to see if the opener of the socket had the capability @cap when
172  * the socket was created and that the current process has the capability
173  * @cap in all user namespaces.
174  */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177 	return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180 
181 /**
182  * sk_net_capable - Network namespace socket capability test
183  * @sk: Socket to use a capability on or through
184  * @cap: The capability to use
185  *
186  * Test to see if the opener of the socket had the capability @cap when the
187  * socket was created and that the current process has the capability @cap
188  * over the network namespace the socket is a member of.
189  */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
195 
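/* Illustrative sketch (not part of the original file): a protocol-private
 * setsockopt handler might gate a privileged option on the helpers above.
 * MY_PRIV_OPT and do_priv_thing() are hypothetical names; only
 * sk_net_capable() and CAP_NET_ADMIN are real.
 *
 *	static int my_setsockopt(struct sock *sk, int optname, int val)
 *	{
 *		if (optname == MY_PRIV_OPT) {
 *			if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *				return -EPERM;
 *			return do_priv_thing(sk, val);
 *		}
 *		return -ENOPROTOOPT;
 *	}
 */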
196 /*
197  * Each address family might have different locking rules, so we have
198  * one slock key per address family and separate keys for internal and
199  * userspace sockets.
200  */
201 static struct lock_class_key af_family_keys[AF_MAX];
202 static struct lock_class_key af_family_kern_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205 
206 /*
207  * Make lock validator output more readable. (we pre-construct these
208  * strings at build time, so that runtime initialization of socket
209  * locks is fast):
210  */
211 
212 #define _sock_locks(x)						  \
213   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
214   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
215   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
216   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
217   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
218   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
219   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
220   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
221   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
222   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
223   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
224   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
225   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
226   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
227   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
228 
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 	_sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 	_sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 	_sock_locks("clock-")
237 };
238 
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 	_sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
250   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
251   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
252   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
253   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
254   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
255   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
256   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
257   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
258   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
259   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
260   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
261   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
262   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
263   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
264 };
265 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
266   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
267   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
268   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
269   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
270   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
271   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
272   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
273   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
274   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
275   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
276   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
277   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
278   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
279   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
280   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
281 };
282 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
283   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
284   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
285   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
286   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
287   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
288   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
289   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
290   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
291   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
292   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
293   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
294   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
295   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
296   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
297   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
298 };
299 
300 /*
301  * sk_callback_lock and sk queues locking rules are per-address-family,
302  * so split the lock classes by using a per-AF key:
303  */
304 static struct lock_class_key af_callback_keys[AF_MAX];
305 static struct lock_class_key af_rlock_keys[AF_MAX];
306 static struct lock_class_key af_wlock_keys[AF_MAX];
307 static struct lock_class_key af_elock_keys[AF_MAX];
308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
309 
310 /* Run time adjustable parameters. */
311 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
312 EXPORT_SYMBOL(sysctl_wmem_max);
313 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
314 EXPORT_SYMBOL(sysctl_rmem_max);
315 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
316 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
317 
318 /* Maximal space eaten by iovec or ancillary data plus some space */
319 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
320 EXPORT_SYMBOL(sysctl_optmem_max);
321 
322 int sysctl_tstamp_allow_data __read_mostly = 1;
323 
324 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
325 EXPORT_SYMBOL_GPL(memalloc_socks);
326 
327 /**
328  * sk_set_memalloc - sets %SOCK_MEMALLOC
329  * @sk: socket to set it on
330  *
331  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
332  * It's the responsibility of the admin to adjust min_free_kbytes
333  * to meet the requirements
334  */
335 void sk_set_memalloc(struct sock *sk)
336 {
337 	sock_set_flag(sk, SOCK_MEMALLOC);
338 	sk->sk_allocation |= __GFP_MEMALLOC;
339 	static_key_slow_inc(&memalloc_socks);
340 }
341 EXPORT_SYMBOL_GPL(sk_set_memalloc);
342 
343 void sk_clear_memalloc(struct sock *sk)
344 {
345 	sock_reset_flag(sk, SOCK_MEMALLOC);
346 	sk->sk_allocation &= ~__GFP_MEMALLOC;
347 	static_key_slow_dec(&memalloc_socks);
348 
349 	/*
350 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
351 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
352 	 * it has rmem allocations due to the last swapfile being deactivated
353 	 * but there is a risk that the socket is unusable due to exceeding
354 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
355 	 */
356 	sk_mem_reclaim(sk);
357 }
358 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
359 
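/* Illustrative sketch (assumption, not from this file): a swap-over-network
 * transport (e.g. NBD or NFS) typically brackets its transport socket with
 * the helpers above so that writeback of swap pages may dip into the memory
 * reserves. The my_transport structure and its ->sock member are hypothetical.
 *
 *	static void my_transport_start_swap(struct my_transport *t)
 *	{
 *		sk_set_memalloc(t->sock->sk);
 *	}
 *
 *	static void my_transport_stop_swap(struct my_transport *t)
 *	{
 *		sk_clear_memalloc(t->sock->sk);
 *	}
 */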
360 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
361 {
362 	int ret;
363 	unsigned int noreclaim_flag;
364 
365 	/* these should have been dropped before queueing */
366 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
367 
368 	noreclaim_flag = memalloc_noreclaim_save();
369 	ret = sk->sk_backlog_rcv(sk, skb);
370 	memalloc_noreclaim_restore(noreclaim_flag);
371 
372 	return ret;
373 }
374 EXPORT_SYMBOL(__sk_backlog_rcv);
375 
376 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
377 {
378 	struct timeval tv;
379 
380 	if (optlen < sizeof(tv))
381 		return -EINVAL;
382 	if (copy_from_user(&tv, optval, sizeof(tv)))
383 		return -EFAULT;
384 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
385 		return -EDOM;
386 
387 	if (tv.tv_sec < 0) {
388 		static int warned __read_mostly;
389 
390 		*timeo_p = 0;
391 		if (warned < 10 && net_ratelimit()) {
392 			warned++;
393 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
394 				__func__, current->comm, task_pid_nr(current));
395 		}
396 		return 0;
397 	}
398 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
399 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
400 		return 0;
401 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
402 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
403 	return 0;
404 }
405 
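/* Worked example (illustrative): with HZ == 1000, a user value of
 * { .tv_sec = 2, .tv_usec = 500000 } passed via SO_RCVTIMEO becomes
 *
 *	*timeo_p = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	         = 2000 + 500 = 2500 jiffies
 *
 * while { 0, 0 } means "wait forever" and is stored as MAX_SCHEDULE_TIMEOUT.
 */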
406 static void sock_warn_obsolete_bsdism(const char *name)
407 {
408 	static int warned;
409 	static char warncomm[TASK_COMM_LEN];
410 	if (strcmp(warncomm, current->comm) && warned < 5) {
411 		strcpy(warncomm,  current->comm);
412 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
413 			warncomm, name);
414 		warned++;
415 	}
416 }
417 
418 static bool sock_needs_netstamp(const struct sock *sk)
419 {
420 	switch (sk->sk_family) {
421 	case AF_UNSPEC:
422 	case AF_UNIX:
423 		return false;
424 	default:
425 		return true;
426 	}
427 }
428 
429 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
430 {
431 	if (sk->sk_flags & flags) {
432 		sk->sk_flags &= ~flags;
433 		if (sock_needs_netstamp(sk) &&
434 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
435 			net_disable_timestamp();
436 	}
437 }
438 
439 
440 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
441 {
442 	unsigned long flags;
443 	struct sk_buff_head *list = &sk->sk_receive_queue;
444 
445 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
446 		atomic_inc(&sk->sk_drops);
447 		trace_sock_rcvqueue_full(sk, skb);
448 		return -ENOMEM;
449 	}
450 
451 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
452 		atomic_inc(&sk->sk_drops);
453 		return -ENOBUFS;
454 	}
455 
456 	skb->dev = NULL;
457 	skb_set_owner_r(skb, sk);
458 
459 	/* We escape from the RCU-protected region here, so make sure we
460 	 * don't leak a non-refcounted dst.
461 	 */
462 	skb_dst_force(skb);
463 
464 	spin_lock_irqsave(&list->lock, flags);
465 	sock_skb_set_dropcount(sk, skb);
466 	__skb_queue_tail(list, skb);
467 	spin_unlock_irqrestore(&list->lock, flags);
468 
469 	if (!sock_flag(sk, SOCK_DEAD))
470 		sk->sk_data_ready(sk);
471 	return 0;
472 }
473 EXPORT_SYMBOL(__sock_queue_rcv_skb);
474 
475 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
476 {
477 	int err;
478 
479 	err = sk_filter(sk, skb);
480 	if (err)
481 		return err;
482 
483 	return __sock_queue_rcv_skb(sk, skb);
484 }
485 EXPORT_SYMBOL(sock_queue_rcv_skb);
486 
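/* Illustrative sketch (assumption): a simple datagram protocol's receive path
 * would typically hand a matched skb to the owning socket like this, freeing
 * the skb itself when queueing fails. my_proto_rcv_one() is a hypothetical
 * name.
 *
 *	static int my_proto_rcv_one(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);
 *		return err;
 *	}
 */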
487 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
488 		     const int nested, unsigned int trim_cap, bool refcounted)
489 {
490 	int rc = NET_RX_SUCCESS;
491 
492 	if (sk_filter_trim_cap(sk, skb, trim_cap))
493 		goto discard_and_relse;
494 
495 	skb->dev = NULL;
496 
497 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
498 		atomic_inc(&sk->sk_drops);
499 		goto discard_and_relse;
500 	}
501 	if (nested)
502 		bh_lock_sock_nested(sk);
503 	else
504 		bh_lock_sock(sk);
505 	if (!sock_owned_by_user(sk)) {
506 		/*
507 		 * trylock + unlock semantics:
508 		 */
509 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
510 
511 		rc = sk_backlog_rcv(sk, skb);
512 
513 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
514 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
515 		bh_unlock_sock(sk);
516 		atomic_inc(&sk->sk_drops);
517 		goto discard_and_relse;
518 	}
519 
520 	bh_unlock_sock(sk);
521 out:
522 	if (refcounted)
523 		sock_put(sk);
524 	return rc;
525 discard_and_relse:
526 	kfree_skb(skb);
527 	goto out;
528 }
529 EXPORT_SYMBOL(__sk_receive_skb);
530 
531 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
532 {
533 	struct dst_entry *dst = __sk_dst_get(sk);
534 
535 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
536 		sk_tx_queue_clear(sk);
537 		sk->sk_dst_pending_confirm = 0;
538 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
539 		dst_release(dst);
540 		return NULL;
541 	}
542 
543 	return dst;
544 }
545 EXPORT_SYMBOL(__sk_dst_check);
546 
547 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
548 {
549 	struct dst_entry *dst = sk_dst_get(sk);
550 
551 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
552 		sk_dst_reset(sk);
553 		dst_release(dst);
554 		return NULL;
555 	}
556 
557 	return dst;
558 }
559 EXPORT_SYMBOL(sk_dst_check);
560 
561 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
562 				int optlen)
563 {
564 	int ret = -ENOPROTOOPT;
565 #ifdef CONFIG_NETDEVICES
566 	struct net *net = sock_net(sk);
567 	char devname[IFNAMSIZ];
568 	int index;
569 
570 	/* Sorry... */
571 	ret = -EPERM;
572 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
573 		goto out;
574 
575 	ret = -EINVAL;
576 	if (optlen < 0)
577 		goto out;
578 
579 	/* Bind this socket to a particular device like "eth0",
580 	 * as specified in the passed interface name. If the
581 	 * name is "" or the option length is zero the socket
582 	 * is not bound.
583 	 */
584 	if (optlen > IFNAMSIZ - 1)
585 		optlen = IFNAMSIZ - 1;
586 	memset(devname, 0, sizeof(devname));
587 
588 	ret = -EFAULT;
589 	if (copy_from_user(devname, optval, optlen))
590 		goto out;
591 
592 	index = 0;
593 	if (devname[0] != '\0') {
594 		struct net_device *dev;
595 
596 		rcu_read_lock();
597 		dev = dev_get_by_name_rcu(net, devname);
598 		if (dev)
599 			index = dev->ifindex;
600 		rcu_read_unlock();
601 		ret = -ENODEV;
602 		if (!dev)
603 			goto out;
604 	}
605 
606 	lock_sock(sk);
607 	sk->sk_bound_dev_if = index;
608 	sk_dst_reset(sk);
609 	release_sock(sk);
610 
611 	ret = 0;
612 
613 out:
614 #endif
615 
616 	return ret;
617 }
618 
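/* Illustrative userspace sketch (not kernel code): the handler above is what
 * services a call such as
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * which requires CAP_NET_RAW in the socket's network namespace; passing an
 * empty name (or a zero option length) removes the binding again.
 */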
619 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
620 				int __user *optlen, int len)
621 {
622 	int ret = -ENOPROTOOPT;
623 #ifdef CONFIG_NETDEVICES
624 	struct net *net = sock_net(sk);
625 	char devname[IFNAMSIZ];
626 
627 	if (sk->sk_bound_dev_if == 0) {
628 		len = 0;
629 		goto zero;
630 	}
631 
632 	ret = -EINVAL;
633 	if (len < IFNAMSIZ)
634 		goto out;
635 
636 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
637 	if (ret)
638 		goto out;
639 
640 	len = strlen(devname) + 1;
641 
642 	ret = -EFAULT;
643 	if (copy_to_user(optval, devname, len))
644 		goto out;
645 
646 zero:
647 	ret = -EFAULT;
648 	if (put_user(len, optlen))
649 		goto out;
650 
651 	ret = 0;
652 
653 out:
654 #endif
655 
656 	return ret;
657 }
658 
659 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
660 {
661 	if (valbool)
662 		sock_set_flag(sk, bit);
663 	else
664 		sock_reset_flag(sk, bit);
665 }
666 
667 bool sk_mc_loop(struct sock *sk)
668 {
669 	if (dev_recursion_level())
670 		return false;
671 	if (!sk)
672 		return true;
673 	switch (sk->sk_family) {
674 	case AF_INET:
675 		return inet_sk(sk)->mc_loop;
676 #if IS_ENABLED(CONFIG_IPV6)
677 	case AF_INET6:
678 		return inet6_sk(sk)->mc_loop;
679 #endif
680 	}
681 	WARN_ON(1);
682 	return true;
683 }
684 EXPORT_SYMBOL(sk_mc_loop);
685 
686 /*
687  *	This is meant for all protocols to use and covers goings on
688  *	at the socket level. Everything here is generic.
689  */
690 
691 int sock_setsockopt(struct socket *sock, int level, int optname,
692 		    char __user *optval, unsigned int optlen)
693 {
694 	struct sock *sk = sock->sk;
695 	int val;
696 	int valbool;
697 	struct linger ling;
698 	int ret = 0;
699 
700 	/*
701 	 *	Options without arguments
702 	 */
703 
704 	if (optname == SO_BINDTODEVICE)
705 		return sock_setbindtodevice(sk, optval, optlen);
706 
707 	if (optlen < sizeof(int))
708 		return -EINVAL;
709 
710 	if (get_user(val, (int __user *)optval))
711 		return -EFAULT;
712 
713 	valbool = val ? 1 : 0;
714 
715 	lock_sock(sk);
716 
717 	switch (optname) {
718 	case SO_DEBUG:
719 		if (val && !capable(CAP_NET_ADMIN))
720 			ret = -EACCES;
721 		else
722 			sock_valbool_flag(sk, SOCK_DBG, valbool);
723 		break;
724 	case SO_REUSEADDR:
725 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
726 		break;
727 	case SO_REUSEPORT:
728 		sk->sk_reuseport = valbool;
729 		break;
730 	case SO_TYPE:
731 	case SO_PROTOCOL:
732 	case SO_DOMAIN:
733 	case SO_ERROR:
734 		ret = -ENOPROTOOPT;
735 		break;
736 	case SO_DONTROUTE:
737 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
738 		sk_dst_reset(sk);
739 		break;
740 	case SO_BROADCAST:
741 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
742 		break;
743 	case SO_SNDBUF:
744 		/* Don't return an error on this; BSD doesn't, and if you
745 		 * think about it, this is right. Otherwise apps have to
746 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
747 		 * are treated in BSD as hints.
748 		 */
749 		val = min_t(u32, val, sysctl_wmem_max);
750 set_sndbuf:
751 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
752 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
753 		/* Wake up sending tasks if we upped the value. */
754 		sk->sk_write_space(sk);
755 		break;
756 
757 	case SO_SNDBUFFORCE:
758 		if (!capable(CAP_NET_ADMIN)) {
759 			ret = -EPERM;
760 			break;
761 		}
762 		goto set_sndbuf;
763 
764 	case SO_RCVBUF:
765 		/* Don't return an error on this; BSD doesn't, and if you
766 		 * think about it, this is right. Otherwise apps have to
767 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
768 		 * are treated in BSD as hints.
769 		 */
770 		val = min_t(u32, val, sysctl_rmem_max);
771 set_rcvbuf:
772 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
773 		/*
774 		 * We double it on the way in to account for
775 		 * "struct sk_buff" etc. overhead.   Applications
776 		 * assume that the SO_RCVBUF setting they make will
777 		 * allow that much actual data to be received on that
778 		 * socket.
779 		 *
780 		 * Applications are unaware that "struct sk_buff" and
781 		 * other overheads allocate from the receive buffer
782 		 * during socket buffer allocation.
783 		 *
784 		 * And after considering the possible alternatives,
785 		 * returning the value we actually used in getsockopt
786 		 * is the most desirable behavior.
787 		 */
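		/* Worked example (illustrative): setsockopt(SO_RCVBUF) with
		 * val = 65536 (and sysctl_rmem_max at least that large)
		 * stores sk_rcvbuf = 131072 here, and 131072 is also what a
		 * later getsockopt(SO_RCVBUF) reports.
		 */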
788 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
789 		break;
790 
791 	case SO_RCVBUFFORCE:
792 		if (!capable(CAP_NET_ADMIN)) {
793 			ret = -EPERM;
794 			break;
795 		}
796 		goto set_rcvbuf;
797 
798 	case SO_KEEPALIVE:
799 		if (sk->sk_prot->keepalive)
800 			sk->sk_prot->keepalive(sk, valbool);
801 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
802 		break;
803 
804 	case SO_OOBINLINE:
805 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
806 		break;
807 
808 	case SO_NO_CHECK:
809 		sk->sk_no_check_tx = valbool;
810 		break;
811 
812 	case SO_PRIORITY:
813 		if ((val >= 0 && val <= 6) ||
814 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
815 			sk->sk_priority = val;
816 		else
817 			ret = -EPERM;
818 		break;
819 
820 	case SO_LINGER:
821 		if (optlen < sizeof(ling)) {
822 			ret = -EINVAL;	/* 1003.1g */
823 			break;
824 		}
825 		if (copy_from_user(&ling, optval, sizeof(ling))) {
826 			ret = -EFAULT;
827 			break;
828 		}
829 		if (!ling.l_onoff)
830 			sock_reset_flag(sk, SOCK_LINGER);
831 		else {
832 #if (BITS_PER_LONG == 32)
833 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
834 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
835 			else
836 #endif
837 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
838 			sock_set_flag(sk, SOCK_LINGER);
839 		}
840 		break;
841 
842 	case SO_BSDCOMPAT:
843 		sock_warn_obsolete_bsdism("setsockopt");
844 		break;
845 
846 	case SO_PASSCRED:
847 		if (valbool)
848 			set_bit(SOCK_PASSCRED, &sock->flags);
849 		else
850 			clear_bit(SOCK_PASSCRED, &sock->flags);
851 		break;
852 
853 	case SO_TIMESTAMP:
854 	case SO_TIMESTAMPNS:
855 		if (valbool)  {
856 			if (optname == SO_TIMESTAMP)
857 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
858 			else
859 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
860 			sock_set_flag(sk, SOCK_RCVTSTAMP);
861 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
862 		} else {
863 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
864 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
865 		}
866 		break;
867 
868 	case SO_TIMESTAMPING:
869 		if (val & ~SOF_TIMESTAMPING_MASK) {
870 			ret = -EINVAL;
871 			break;
872 		}
873 
874 		if (val & SOF_TIMESTAMPING_OPT_ID &&
875 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
876 			if (sk->sk_protocol == IPPROTO_TCP &&
877 			    sk->sk_type == SOCK_STREAM) {
878 				if ((1 << sk->sk_state) &
879 				    (TCPF_CLOSE | TCPF_LISTEN)) {
880 					ret = -EINVAL;
881 					break;
882 				}
883 				sk->sk_tskey = tcp_sk(sk)->snd_una;
884 			} else {
885 				sk->sk_tskey = 0;
886 			}
887 		}
888 
889 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
890 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
891 			ret = -EINVAL;
892 			break;
893 		}
894 
895 		sk->sk_tsflags = val;
896 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
897 			sock_enable_timestamp(sk,
898 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
899 		else
900 			sock_disable_timestamp(sk,
901 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
902 		break;
903 
904 	case SO_RCVLOWAT:
905 		if (val < 0)
906 			val = INT_MAX;
907 		sk->sk_rcvlowat = val ? : 1;
908 		break;
909 
910 	case SO_RCVTIMEO:
911 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
912 		break;
913 
914 	case SO_SNDTIMEO:
915 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
916 		break;
917 
918 	case SO_ATTACH_FILTER:
919 		ret = -EINVAL;
920 		if (optlen == sizeof(struct sock_fprog)) {
921 			struct sock_fprog fprog;
922 
923 			ret = -EFAULT;
924 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
925 				break;
926 
927 			ret = sk_attach_filter(&fprog, sk);
928 		}
929 		break;
930 
931 	case SO_ATTACH_BPF:
932 		ret = -EINVAL;
933 		if (optlen == sizeof(u32)) {
934 			u32 ufd;
935 
936 			ret = -EFAULT;
937 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
938 				break;
939 
940 			ret = sk_attach_bpf(ufd, sk);
941 		}
942 		break;
943 
944 	case SO_ATTACH_REUSEPORT_CBPF:
945 		ret = -EINVAL;
946 		if (optlen == sizeof(struct sock_fprog)) {
947 			struct sock_fprog fprog;
948 
949 			ret = -EFAULT;
950 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
951 				break;
952 
953 			ret = sk_reuseport_attach_filter(&fprog, sk);
954 		}
955 		break;
956 
957 	case SO_ATTACH_REUSEPORT_EBPF:
958 		ret = -EINVAL;
959 		if (optlen == sizeof(u32)) {
960 			u32 ufd;
961 
962 			ret = -EFAULT;
963 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
964 				break;
965 
966 			ret = sk_reuseport_attach_bpf(ufd, sk);
967 		}
968 		break;
969 
970 	case SO_DETACH_FILTER:
971 		ret = sk_detach_filter(sk);
972 		break;
973 
974 	case SO_LOCK_FILTER:
975 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
976 			ret = -EPERM;
977 		else
978 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
979 		break;
980 
981 	case SO_PASSSEC:
982 		if (valbool)
983 			set_bit(SOCK_PASSSEC, &sock->flags);
984 		else
985 			clear_bit(SOCK_PASSSEC, &sock->flags);
986 		break;
987 	case SO_MARK:
988 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
989 			ret = -EPERM;
990 		else
991 			sk->sk_mark = val;
992 		break;
993 
994 	case SO_RXQ_OVFL:
995 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
996 		break;
997 
998 	case SO_WIFI_STATUS:
999 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1000 		break;
1001 
1002 	case SO_PEEK_OFF:
1003 		if (sock->ops->set_peek_off)
1004 			ret = sock->ops->set_peek_off(sk, val);
1005 		else
1006 			ret = -EOPNOTSUPP;
1007 		break;
1008 
1009 	case SO_NOFCS:
1010 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1011 		break;
1012 
1013 	case SO_SELECT_ERR_QUEUE:
1014 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1015 		break;
1016 
1017 #ifdef CONFIG_NET_RX_BUSY_POLL
1018 	case SO_BUSY_POLL:
1019 		/* allow unprivileged users to decrease the value */
1020 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1021 			ret = -EPERM;
1022 		else {
1023 			if (val < 0)
1024 				ret = -EINVAL;
1025 			else
1026 				sk->sk_ll_usec = val;
1027 		}
1028 		break;
1029 #endif
1030 
1031 	case SO_MAX_PACING_RATE:
1032 		if (val != ~0U)
1033 			cmpxchg(&sk->sk_pacing_status,
1034 				SK_PACING_NONE,
1035 				SK_PACING_NEEDED);
1036 		sk->sk_max_pacing_rate = val;
1037 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1038 					 sk->sk_max_pacing_rate);
1039 		break;
1040 
1041 	case SO_INCOMING_CPU:
1042 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1043 		break;
1044 
1045 	case SO_CNX_ADVICE:
1046 		if (val == 1)
1047 			dst_negative_advice(sk);
1048 		break;
1049 
1050 	case SO_ZEROCOPY:
1051 		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1052 			ret = -ENOTSUPP;
1053 		else if (sk->sk_protocol != IPPROTO_TCP)
1054 			ret = -ENOTSUPP;
1055 		else if (sk->sk_state != TCP_CLOSE)
1056 			ret = -EBUSY;
1057 		else if (val < 0 || val > 1)
1058 			ret = -EINVAL;
1059 		else
1060 			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1061 		break;
1062 
1063 	default:
1064 		ret = -ENOPROTOOPT;
1065 		break;
1066 	}
1067 	release_sock(sk);
1068 	return ret;
1069 }
1070 EXPORT_SYMBOL(sock_setsockopt);
1071 
1072 
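/* Illustrative userspace sketch (not kernel code): the switch above is what
 * ultimately services SOL_SOCKET-level calls such as
 *
 *	int one = 1, buf = 65536;
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF,      &buf, sizeof(buf));
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &one, sizeof(one));
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,    &tv,  sizeof(tv));
 *
 * for any address family, independent of the per-protocol setsockopt handlers.
 */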
1073 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1074 			  struct ucred *ucred)
1075 {
1076 	ucred->pid = pid_vnr(pid);
1077 	ucred->uid = ucred->gid = -1;
1078 	if (cred) {
1079 		struct user_namespace *current_ns = current_user_ns();
1080 
1081 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1082 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1083 	}
1084 }
1085 
1086 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1087 {
1088 	struct user_namespace *user_ns = current_user_ns();
1089 	int i;
1090 
1091 	for (i = 0; i < src->ngroups; i++)
1092 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1093 			return -EFAULT;
1094 
1095 	return 0;
1096 }
1097 
1098 int sock_getsockopt(struct socket *sock, int level, int optname,
1099 		    char __user *optval, int __user *optlen)
1100 {
1101 	struct sock *sk = sock->sk;
1102 
1103 	union {
1104 		int val;
1105 		u64 val64;
1106 		struct linger ling;
1107 		struct timeval tm;
1108 	} v;
1109 
1110 	int lv = sizeof(int);
1111 	int len;
1112 
1113 	if (get_user(len, optlen))
1114 		return -EFAULT;
1115 	if (len < 0)
1116 		return -EINVAL;
1117 
1118 	memset(&v, 0, sizeof(v));
1119 
1120 	switch (optname) {
1121 	case SO_DEBUG:
1122 		v.val = sock_flag(sk, SOCK_DBG);
1123 		break;
1124 
1125 	case SO_DONTROUTE:
1126 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1127 		break;
1128 
1129 	case SO_BROADCAST:
1130 		v.val = sock_flag(sk, SOCK_BROADCAST);
1131 		break;
1132 
1133 	case SO_SNDBUF:
1134 		v.val = sk->sk_sndbuf;
1135 		break;
1136 
1137 	case SO_RCVBUF:
1138 		v.val = sk->sk_rcvbuf;
1139 		break;
1140 
1141 	case SO_REUSEADDR:
1142 		v.val = sk->sk_reuse;
1143 		break;
1144 
1145 	case SO_REUSEPORT:
1146 		v.val = sk->sk_reuseport;
1147 		break;
1148 
1149 	case SO_KEEPALIVE:
1150 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1151 		break;
1152 
1153 	case SO_TYPE:
1154 		v.val = sk->sk_type;
1155 		break;
1156 
1157 	case SO_PROTOCOL:
1158 		v.val = sk->sk_protocol;
1159 		break;
1160 
1161 	case SO_DOMAIN:
1162 		v.val = sk->sk_family;
1163 		break;
1164 
1165 	case SO_ERROR:
1166 		v.val = -sock_error(sk);
1167 		if (v.val == 0)
1168 			v.val = xchg(&sk->sk_err_soft, 0);
1169 		break;
1170 
1171 	case SO_OOBINLINE:
1172 		v.val = sock_flag(sk, SOCK_URGINLINE);
1173 		break;
1174 
1175 	case SO_NO_CHECK:
1176 		v.val = sk->sk_no_check_tx;
1177 		break;
1178 
1179 	case SO_PRIORITY:
1180 		v.val = sk->sk_priority;
1181 		break;
1182 
1183 	case SO_LINGER:
1184 		lv		= sizeof(v.ling);
1185 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1186 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1187 		break;
1188 
1189 	case SO_BSDCOMPAT:
1190 		sock_warn_obsolete_bsdism("getsockopt");
1191 		break;
1192 
1193 	case SO_TIMESTAMP:
1194 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1195 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1196 		break;
1197 
1198 	case SO_TIMESTAMPNS:
1199 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1200 		break;
1201 
1202 	case SO_TIMESTAMPING:
1203 		v.val = sk->sk_tsflags;
1204 		break;
1205 
1206 	case SO_RCVTIMEO:
1207 		lv = sizeof(struct timeval);
1208 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1209 			v.tm.tv_sec = 0;
1210 			v.tm.tv_usec = 0;
1211 		} else {
1212 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1213 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1214 		}
1215 		break;
1216 
1217 	case SO_SNDTIMEO:
1218 		lv = sizeof(struct timeval);
1219 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1220 			v.tm.tv_sec = 0;
1221 			v.tm.tv_usec = 0;
1222 		} else {
1223 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1224 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1225 		}
1226 		break;
1227 
1228 	case SO_RCVLOWAT:
1229 		v.val = sk->sk_rcvlowat;
1230 		break;
1231 
1232 	case SO_SNDLOWAT:
1233 		v.val = 1;
1234 		break;
1235 
1236 	case SO_PASSCRED:
1237 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1238 		break;
1239 
1240 	case SO_PEERCRED:
1241 	{
1242 		struct ucred peercred;
1243 		if (len > sizeof(peercred))
1244 			len = sizeof(peercred);
1245 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1246 		if (copy_to_user(optval, &peercred, len))
1247 			return -EFAULT;
1248 		goto lenout;
1249 	}
1250 
1251 	case SO_PEERGROUPS:
1252 	{
1253 		int ret, n;
1254 
1255 		if (!sk->sk_peer_cred)
1256 			return -ENODATA;
1257 
1258 		n = sk->sk_peer_cred->group_info->ngroups;
1259 		if (len < n * sizeof(gid_t)) {
1260 			len = n * sizeof(gid_t);
1261 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1262 		}
1263 		len = n * sizeof(gid_t);
1264 
1265 		ret = groups_to_user((gid_t __user *)optval,
1266 				     sk->sk_peer_cred->group_info);
1267 		if (ret)
1268 			return ret;
1269 		goto lenout;
1270 	}
1271 
1272 	case SO_PEERNAME:
1273 	{
1274 		char address[128];
1275 
1276 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1277 			return -ENOTCONN;
1278 		if (lv < len)
1279 			return -EINVAL;
1280 		if (copy_to_user(optval, address, len))
1281 			return -EFAULT;
1282 		goto lenout;
1283 	}
1284 
1285 	/* Dubious BSD thing... Probably nobody even uses it, but
1286 	 * the UNIX standard wants it for whatever reason... -DaveM
1287 	 */
1288 	case SO_ACCEPTCONN:
1289 		v.val = sk->sk_state == TCP_LISTEN;
1290 		break;
1291 
1292 	case SO_PASSSEC:
1293 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1294 		break;
1295 
1296 	case SO_PEERSEC:
1297 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1298 
1299 	case SO_MARK:
1300 		v.val = sk->sk_mark;
1301 		break;
1302 
1303 	case SO_RXQ_OVFL:
1304 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1305 		break;
1306 
1307 	case SO_WIFI_STATUS:
1308 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1309 		break;
1310 
1311 	case SO_PEEK_OFF:
1312 		if (!sock->ops->set_peek_off)
1313 			return -EOPNOTSUPP;
1314 
1315 		v.val = sk->sk_peek_off;
1316 		break;
1317 	case SO_NOFCS:
1318 		v.val = sock_flag(sk, SOCK_NOFCS);
1319 		break;
1320 
1321 	case SO_BINDTODEVICE:
1322 		return sock_getbindtodevice(sk, optval, optlen, len);
1323 
1324 	case SO_GET_FILTER:
1325 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1326 		if (len < 0)
1327 			return len;
1328 
1329 		goto lenout;
1330 
1331 	case SO_LOCK_FILTER:
1332 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1333 		break;
1334 
1335 	case SO_BPF_EXTENSIONS:
1336 		v.val = bpf_tell_extensions();
1337 		break;
1338 
1339 	case SO_SELECT_ERR_QUEUE:
1340 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1341 		break;
1342 
1343 #ifdef CONFIG_NET_RX_BUSY_POLL
1344 	case SO_BUSY_POLL:
1345 		v.val = sk->sk_ll_usec;
1346 		break;
1347 #endif
1348 
1349 	case SO_MAX_PACING_RATE:
1350 		v.val = sk->sk_max_pacing_rate;
1351 		break;
1352 
1353 	case SO_INCOMING_CPU:
1354 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1355 		break;
1356 
1357 	case SO_MEMINFO:
1358 	{
1359 		u32 meminfo[SK_MEMINFO_VARS];
1360 
1361 		sk_get_meminfo(sk, meminfo);
1362 
1363 		len = min_t(unsigned int, len, sizeof(meminfo));
1364 		if (copy_to_user(optval, &meminfo, len))
1365 			return -EFAULT;
1366 
1367 		goto lenout;
1368 	}
1369 
1370 #ifdef CONFIG_NET_RX_BUSY_POLL
1371 	case SO_INCOMING_NAPI_ID:
1372 		v.val = READ_ONCE(sk->sk_napi_id);
1373 
1374 		/* aggregate non-NAPI IDs down to 0 */
1375 		if (v.val < MIN_NAPI_ID)
1376 			v.val = 0;
1377 
1378 		break;
1379 #endif
1380 
1381 	case SO_COOKIE:
1382 		lv = sizeof(u64);
1383 		if (len < lv)
1384 			return -EINVAL;
1385 		v.val64 = sock_gen_cookie(sk);
1386 		break;
1387 
1388 	case SO_ZEROCOPY:
1389 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1390 		break;
1391 
1392 	default:
1393 		/* We implement the SO_SNDLOWAT etc to not be settable
1394 		 * (1003.1g 7).
1395 		 */
1396 		return -ENOPROTOOPT;
1397 	}
1398 
1399 	if (len > lv)
1400 		len = lv;
1401 	if (copy_to_user(optval, &v, len))
1402 		return -EFAULT;
1403 lenout:
1404 	if (put_user(len, optlen))
1405 		return -EFAULT;
1406 	return 0;
1407 }
1408 
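/* Illustrative userspace sketch (not kernel code): the SO_PEERCRED branch
 * above services calls such as (with _GNU_SOURCE for struct ucred)
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 *
 * with the pid/uid/gid translated into the caller's namespaces by
 * cred_to_ucred() above.
 */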
1409 /*
1410  * Initialize an sk_lock.
1411  *
1412  * (We also register the sk_lock with the lock validator.)
1413  */
1414 static inline void sock_lock_init(struct sock *sk)
1415 {
1416 	if (sk->sk_kern_sock)
1417 		sock_lock_init_class_and_name(
1418 			sk,
1419 			af_family_kern_slock_key_strings[sk->sk_family],
1420 			af_family_kern_slock_keys + sk->sk_family,
1421 			af_family_kern_key_strings[sk->sk_family],
1422 			af_family_kern_keys + sk->sk_family);
1423 	else
1424 		sock_lock_init_class_and_name(
1425 			sk,
1426 			af_family_slock_key_strings[sk->sk_family],
1427 			af_family_slock_keys + sk->sk_family,
1428 			af_family_key_strings[sk->sk_family],
1429 			af_family_keys + sk->sk_family);
1430 }
1431 
1432 /*
1433  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1434  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1435  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1436  */
1437 static void sock_copy(struct sock *nsk, const struct sock *osk)
1438 {
1439 #ifdef CONFIG_SECURITY_NETWORK
1440 	void *sptr = nsk->sk_security;
1441 #endif
1442 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1443 
1444 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1445 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1446 
1447 #ifdef CONFIG_SECURITY_NETWORK
1448 	nsk->sk_security = sptr;
1449 	security_sk_clone(osk, nsk);
1450 #endif
1451 }
1452 
1453 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1454 		int family)
1455 {
1456 	struct sock *sk;
1457 	struct kmem_cache *slab;
1458 
1459 	slab = prot->slab;
1460 	if (slab != NULL) {
1461 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1462 		if (!sk)
1463 			return sk;
1464 		if (priority & __GFP_ZERO)
1465 			sk_prot_clear_nulls(sk, prot->obj_size);
1466 	} else
1467 		sk = kmalloc(prot->obj_size, priority);
1468 
1469 	if (sk != NULL) {
1470 		if (security_sk_alloc(sk, family, priority))
1471 			goto out_free;
1472 
1473 		if (!try_module_get(prot->owner))
1474 			goto out_free_sec;
1475 		sk_tx_queue_clear(sk);
1476 	}
1477 
1478 	return sk;
1479 
1480 out_free_sec:
1481 	security_sk_free(sk);
1482 out_free:
1483 	if (slab != NULL)
1484 		kmem_cache_free(slab, sk);
1485 	else
1486 		kfree(sk);
1487 	return NULL;
1488 }
1489 
1490 static void sk_prot_free(struct proto *prot, struct sock *sk)
1491 {
1492 	struct kmem_cache *slab;
1493 	struct module *owner;
1494 
1495 	owner = prot->owner;
1496 	slab = prot->slab;
1497 
1498 	cgroup_sk_free(&sk->sk_cgrp_data);
1499 	mem_cgroup_sk_free(sk);
1500 	security_sk_free(sk);
1501 	if (slab != NULL)
1502 		kmem_cache_free(slab, sk);
1503 	else
1504 		kfree(sk);
1505 	module_put(owner);
1506 }
1507 
1508 /**
1509  *	sk_alloc - All socket objects are allocated here
1510  *	@net: the applicable net namespace
1511  *	@family: protocol family
1512  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1513  *	@prot: struct proto associated with this new sock instance
1514  *	@kern: is this to be a kernel socket?
1515  */
1516 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1517 		      struct proto *prot, int kern)
1518 {
1519 	struct sock *sk;
1520 
1521 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1522 	if (sk) {
1523 		sk->sk_family = family;
1524 		/*
1525 		 * See comment in struct sock definition to understand
1526 		 * why we need sk_prot_creator -acme
1527 		 */
1528 		sk->sk_prot = sk->sk_prot_creator = prot;
1529 		sk->sk_kern_sock = kern;
1530 		sock_lock_init(sk);
1531 		sk->sk_net_refcnt = kern ? 0 : 1;
1532 		if (likely(sk->sk_net_refcnt))
1533 			get_net(net);
1534 		sock_net_set(sk, net);
1535 		refcount_set(&sk->sk_wmem_alloc, 1);
1536 
1537 		mem_cgroup_sk_alloc(sk);
1538 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1539 		sock_update_classid(&sk->sk_cgrp_data);
1540 		sock_update_netprioidx(&sk->sk_cgrp_data);
1541 	}
1542 
1543 	return sk;
1544 }
1545 EXPORT_SYMBOL(sk_alloc);
1546 
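/* Illustrative sketch (assumption): a protocol family's ->create() hook
 * usually pairs sk_alloc() with sock_init_data(), roughly:
 *
 *	static int my_family_create(struct net *net, struct socket *sock,
 *				    int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYFAMILY, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 *
 * PF_MYFAMILY and my_proto are hypothetical; the sk_alloc() call itself
 * matches the signature documented above.
 */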
1547 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1548  * grace period. This is the case for UDP sockets and TCP listeners.
1549  */
1550 static void __sk_destruct(struct rcu_head *head)
1551 {
1552 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1553 	struct sk_filter *filter;
1554 
1555 	if (sk->sk_destruct)
1556 		sk->sk_destruct(sk);
1557 
1558 	filter = rcu_dereference_check(sk->sk_filter,
1559 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1560 	if (filter) {
1561 		sk_filter_uncharge(sk, filter);
1562 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1563 	}
1564 
1565 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1566 
1567 	if (atomic_read(&sk->sk_omem_alloc))
1568 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1569 			 __func__, atomic_read(&sk->sk_omem_alloc));
1570 
1571 	if (sk->sk_frag.page) {
1572 		put_page(sk->sk_frag.page);
1573 		sk->sk_frag.page = NULL;
1574 	}
1575 
1576 	if (sk->sk_peer_cred)
1577 		put_cred(sk->sk_peer_cred);
1578 	put_pid(sk->sk_peer_pid);
1579 	if (likely(sk->sk_net_refcnt))
1580 		put_net(sock_net(sk));
1581 	sk_prot_free(sk->sk_prot_creator, sk);
1582 }
1583 
1584 void sk_destruct(struct sock *sk)
1585 {
1586 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1587 
1588 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1589 		reuseport_detach_sock(sk);
1590 		use_call_rcu = true;
1591 	}
1592 
1593 	if (use_call_rcu)
1594 		call_rcu(&sk->sk_rcu, __sk_destruct);
1595 	else
1596 		__sk_destruct(&sk->sk_rcu);
1597 }
1598 
1599 static void __sk_free(struct sock *sk)
1600 {
1601 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1602 		sock_diag_broadcast_destroy(sk);
1603 	else
1604 		sk_destruct(sk);
1605 }
1606 
1607 void sk_free(struct sock *sk)
1608 {
1609 	/*
1610 	 * We subtract one from sk_wmem_alloc, which tells us whether
1611 	 * some packets are still in some tx queue.
1612 	 * If it does not drop to zero, sock_wfree() will call __sk_free(sk) later.
1613 	 */
1614 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1615 		__sk_free(sk);
1616 }
1617 EXPORT_SYMBOL(sk_free);
1618 
1619 static void sk_init_common(struct sock *sk)
1620 {
1621 	skb_queue_head_init(&sk->sk_receive_queue);
1622 	skb_queue_head_init(&sk->sk_write_queue);
1623 	skb_queue_head_init(&sk->sk_error_queue);
1624 
1625 	rwlock_init(&sk->sk_callback_lock);
1626 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1627 			af_rlock_keys + sk->sk_family,
1628 			af_family_rlock_key_strings[sk->sk_family]);
1629 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1630 			af_wlock_keys + sk->sk_family,
1631 			af_family_wlock_key_strings[sk->sk_family]);
1632 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1633 			af_elock_keys + sk->sk_family,
1634 			af_family_elock_key_strings[sk->sk_family]);
1635 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1636 			af_callback_keys + sk->sk_family,
1637 			af_family_clock_key_strings[sk->sk_family]);
1638 }
1639 
1640 /**
1641  *	sk_clone_lock - clone a socket, and lock its clone
1642  *	@sk: the socket to clone
1643  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1644  *
1645  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1646  */
1647 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1648 {
1649 	struct sock *newsk;
1650 	bool is_charged = true;
1651 
1652 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1653 	if (newsk != NULL) {
1654 		struct sk_filter *filter;
1655 
1656 		sock_copy(newsk, sk);
1657 
1658 		newsk->sk_prot_creator = sk->sk_prot;
1659 
1660 		/* SANITY */
1661 		if (likely(newsk->sk_net_refcnt))
1662 			get_net(sock_net(newsk));
1663 		sk_node_init(&newsk->sk_node);
1664 		sock_lock_init(newsk);
1665 		bh_lock_sock(newsk);
1666 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1667 		newsk->sk_backlog.len = 0;
1668 
1669 		atomic_set(&newsk->sk_rmem_alloc, 0);
1670 		/*
1671 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1672 		 */
1673 		refcount_set(&newsk->sk_wmem_alloc, 1);
1674 		atomic_set(&newsk->sk_omem_alloc, 0);
1675 		sk_init_common(newsk);
1676 
1677 		newsk->sk_dst_cache	= NULL;
1678 		newsk->sk_dst_pending_confirm = 0;
1679 		newsk->sk_wmem_queued	= 0;
1680 		newsk->sk_forward_alloc = 0;
1681 		atomic_set(&newsk->sk_drops, 0);
1682 		newsk->sk_send_head	= NULL;
1683 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1684 		atomic_set(&newsk->sk_zckey, 0);
1685 
1686 		sock_reset_flag(newsk, SOCK_DONE);
1687 
1688 		/* sk->sk_memcg will be populated at accept() time */
1689 		newsk->sk_memcg = NULL;
1690 
1691 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1692 
1693 		rcu_read_lock();
1694 		filter = rcu_dereference(sk->sk_filter);
1695 		if (filter != NULL)
1696 			/* though it's an empty new sock, the charging may fail
1697 			 * if sysctl_optmem_max was changed between creation of
1698 			 * original socket and cloning
1699 			 */
1700 			is_charged = sk_filter_charge(newsk, filter);
1701 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1702 		rcu_read_unlock();
1703 
1704 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1705 			/* We need to make sure that we don't uncharge the new
1706 			 * socket if we couldn't charge it in the first place
1707 			 * as otherwise we uncharge the parent's filter.
1708 			 */
1709 			if (!is_charged)
1710 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1711 			sk_free_unlock_clone(newsk);
1712 			newsk = NULL;
1713 			goto out;
1714 		}
1715 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1716 
1717 		newsk->sk_err	   = 0;
1718 		newsk->sk_err_soft = 0;
1719 		newsk->sk_priority = 0;
1720 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1721 		atomic64_set(&newsk->sk_cookie, 0);
1722 
1723 		/*
1724 		 * Before updating sk_refcnt, we must commit prior changes to memory
1725 		 * (Documentation/RCU/rculist_nulls.txt for details)
1726 		 */
1727 		smp_wmb();
1728 		refcount_set(&newsk->sk_refcnt, 2);
1729 
1730 		/*
1731 		 * Increment the counter in the same struct proto as the master
1732 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1733 		 * is the same as sk->sk_prot->socks, as this field was copied
1734 		 * with memcpy).
1735 		 *
1736 		 * This _changes_ the previous behaviour, where
1737 		 * tcp_create_openreq_child always incremented the
1738 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1739 		 * to be taken into account in all callers. -acme
1740 		 */
1741 		sk_refcnt_debug_inc(newsk);
1742 		sk_set_socket(newsk, NULL);
1743 		newsk->sk_wq = NULL;
1744 
1745 		if (newsk->sk_prot->sockets_allocated)
1746 			sk_sockets_allocated_inc(newsk);
1747 
1748 		if (sock_needs_netstamp(sk) &&
1749 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1750 			net_enable_timestamp();
1751 	}
1752 out:
1753 	return newsk;
1754 }
1755 EXPORT_SYMBOL_GPL(sk_clone_lock);
1756 
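/* Illustrative sketch (assumption): connection-accept paths (for example
 * inet_csk_clone_lock() for TCP) use this roughly as
 *
 *	struct sock *newsk = sk_clone_lock(listener_sk, GFP_ATOMIC);
 *
 *	if (newsk) {
 *		... protocol-specific initialisation of newsk ...
 *		bh_unlock_sock(newsk);	(caller must unlock, even on error)
 *	}
 *
 * matching the kernel-doc note above that the clone is returned locked.
 */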
1757 void sk_free_unlock_clone(struct sock *sk)
1758 {
1759 	/* It is still a raw copy of the parent, so invalidate
1760 	 * the destructor and do a plain sk_free() */
1761 	sk->sk_destruct = NULL;
1762 	bh_unlock_sock(sk);
1763 	sk_free(sk);
1764 }
1765 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1766 
1767 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1768 {
1769 	u32 max_segs = 1;
1770 
1771 	sk_dst_set(sk, dst);
1772 	sk->sk_route_caps = dst->dev->features;
1773 	if (sk->sk_route_caps & NETIF_F_GSO)
1774 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1775 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1776 	if (sk_can_gso(sk)) {
1777 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1778 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1779 		} else {
1780 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1781 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1782 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1783 		}
1784 	}
1785 	sk->sk_gso_max_segs = max_segs;
1786 }
1787 EXPORT_SYMBOL_GPL(sk_setup_caps);
1788 
1789 /*
1790  *	Simple resource managers for sockets.
1791  */
1792 
1793 
1794 /*
1795  * Write buffer destructor automatically called from kfree_skb.
1796  */
1797 void sock_wfree(struct sk_buff *skb)
1798 {
1799 	struct sock *sk = skb->sk;
1800 	unsigned int len = skb->truesize;
1801 
1802 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1803 		/*
1804 		 * Keep a reference on sk_wmem_alloc, this will be released
1805 		 * after sk_write_space() call
1806 		 */
1807 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1808 		sk->sk_write_space(sk);
1809 		len = 1;
1810 	}
1811 	/*
1812 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1813 	 * could not do because of in-flight packets
1814 	 */
1815 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1816 		__sk_free(sk);
1817 }
1818 EXPORT_SYMBOL(sock_wfree);
1819 
1820 /* This variant of sock_wfree() is used by TCP,
1821  * since it sets SOCK_USE_WRITE_QUEUE.
1822  */
1823 void __sock_wfree(struct sk_buff *skb)
1824 {
1825 	struct sock *sk = skb->sk;
1826 
1827 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1828 		__sk_free(sk);
1829 }
1830 
1831 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1832 {
1833 	skb_orphan(skb);
1834 	skb->sk = sk;
1835 #ifdef CONFIG_INET
1836 	if (unlikely(!sk_fullsock(sk))) {
1837 		skb->destructor = sock_edemux;
1838 		sock_hold(sk);
1839 		return;
1840 	}
1841 #endif
1842 	skb->destructor = sock_wfree;
1843 	skb_set_hash_from_sk(skb, sk);
1844 	/*
1845 	 * We used to take a refcount on sk, but the following operation
1846 	 * is enough to guarantee sk_free() won't free this sock until
1847 	 * all in-flight packets are completed
1848 	 */
1849 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1850 }
1851 EXPORT_SYMBOL(skb_set_owner_w);
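/*
 * Usage sketch (illustrative only, not part of this file): a protocol's
 * output path typically charges every skb it builds to the owning socket,
 * so that kfree_skb() eventually runs sock_wfree(), releases the charge
 * and wakes up writers. The surrounding names below are assumptions made
 * for the example:
 *
 *	struct sk_buff *skb = alloc_skb(len, GFP_KERNEL);
 *
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_set_owner_w(skb, sk);	// adds skb->truesize to sk_wmem_alloc
 *	// ... build headers/payload and hand the skb to the lower layers;
 *	// the destructor set above undoes the accounting on free.
 */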
1852 
1853 /* This helper is used by netem, as it can hold packets in its
1854  * delay queue. We want to allow the owner socket to send more
1855  * packets, as if they were already TX completed by a typical driver.
1856  * But we also want to keep skb->sk set because some packet schedulers
1857  * rely on it (sch_fq for example).
1858  */
1859 void skb_orphan_partial(struct sk_buff *skb)
1860 {
1861 	if (skb_is_tcp_pure_ack(skb))
1862 		return;
1863 
1864 	if (skb->destructor == sock_wfree
1865 #ifdef CONFIG_INET
1866 	    || skb->destructor == tcp_wfree
1867 #endif
1868 		) {
1869 		struct sock *sk = skb->sk;
1870 
1871 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1872 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1873 			skb->destructor = sock_efree;
1874 		}
1875 	} else {
1876 		skb_orphan(skb);
1877 	}
1878 }
1879 EXPORT_SYMBOL(skb_orphan_partial);
1880 
1881 /*
1882  * Read buffer destructor automatically called from kfree_skb.
1883  */
1884 void sock_rfree(struct sk_buff *skb)
1885 {
1886 	struct sock *sk = skb->sk;
1887 	unsigned int len = skb->truesize;
1888 
1889 	atomic_sub(len, &sk->sk_rmem_alloc);
1890 	sk_mem_uncharge(sk, len);
1891 }
1892 EXPORT_SYMBOL(sock_rfree);
1893 
1894 /*
1895  * Buffer destructor for skbs that are not used directly in read or write
1896  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1897  */
1898 void sock_efree(struct sk_buff *skb)
1899 {
1900 	sock_put(skb->sk);
1901 }
1902 EXPORT_SYMBOL(sock_efree);
1903 
1904 kuid_t sock_i_uid(struct sock *sk)
1905 {
1906 	kuid_t uid;
1907 
1908 	read_lock_bh(&sk->sk_callback_lock);
1909 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1910 	read_unlock_bh(&sk->sk_callback_lock);
1911 	return uid;
1912 }
1913 EXPORT_SYMBOL(sock_i_uid);
1914 
1915 unsigned long sock_i_ino(struct sock *sk)
1916 {
1917 	unsigned long ino;
1918 
1919 	read_lock_bh(&sk->sk_callback_lock);
1920 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1921 	read_unlock_bh(&sk->sk_callback_lock);
1922 	return ino;
1923 }
1924 EXPORT_SYMBOL(sock_i_ino);
1925 
1926 /*
1927  * Allocate a skb from the socket's send buffer.
1928  */
1929 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1930 			     gfp_t priority)
1931 {
1932 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1933 		struct sk_buff *skb = alloc_skb(size, priority);
1934 		if (skb) {
1935 			skb_set_owner_w(skb, sk);
1936 			return skb;
1937 		}
1938 	}
1939 	return NULL;
1940 }
1941 EXPORT_SYMBOL(sock_wmalloc);
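/*
 * Usage sketch (illustrative only): callers that want send-buffer
 * accounting without the blocking logic of sock_alloc_send_skb() can call
 * sock_wmalloc() directly; force = 1 bypasses the sk_sndbuf check. The
 * allocation flags and error code below are assumptions for the example:
 *
 *	struct sk_buff *skb = sock_wmalloc(sk, len + headroom, 0, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOBUFS;	// sndbuf full or allocation failed
 *	skb_reserve(skb, headroom);
 */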
1942 
1943 static void sock_ofree(struct sk_buff *skb)
1944 {
1945 	struct sock *sk = skb->sk;
1946 
1947 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1948 }
1949 
1950 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1951 			     gfp_t priority)
1952 {
1953 	struct sk_buff *skb;
1954 
1955 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1956 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1957 	    sysctl_optmem_max)
1958 		return NULL;
1959 
1960 	skb = alloc_skb(size, priority);
1961 	if (!skb)
1962 		return NULL;
1963 
1964 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1965 	skb->sk = sk;
1966 	skb->destructor = sock_ofree;
1967 	return skb;
1968 }
1969 
1970 /*
1971  * Allocate a memory block from the socket's option memory buffer.
1972  */
1973 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1974 {
1975 	if ((unsigned int)size <= sysctl_optmem_max &&
1976 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1977 		void *mem;
1978 		/* First do the add, to avoid the race if kmalloc
1979 		 * might sleep.
1980 		 */
1981 		atomic_add(size, &sk->sk_omem_alloc);
1982 		mem = kmalloc(size, priority);
1983 		if (mem)
1984 			return mem;
1985 		atomic_sub(size, &sk->sk_omem_alloc);
1986 	}
1987 	return NULL;
1988 }
1989 EXPORT_SYMBOL(sock_kmalloc);
1990 
1991 /* Free an option memory block. Note, we actually want the inline
1992  * here as this allows gcc to detect the nullify and fold away the
1993  * condition entirely.
1994  */
1995 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1996 				  const bool nullify)
1997 {
1998 	if (WARN_ON_ONCE(!mem))
1999 		return;
2000 	if (nullify)
2001 		kzfree(mem);
2002 	else
2003 		kfree(mem);
2004 	atomic_sub(size, &sk->sk_omem_alloc);
2005 }
2006 
2007 void sock_kfree_s(struct sock *sk, void *mem, int size)
2008 {
2009 	__sock_kfree_s(sk, mem, size, false);
2010 }
2011 EXPORT_SYMBOL(sock_kfree_s);
2012 
2013 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2014 {
2015 	__sock_kfree_s(sk, mem, size, true);
2016 }
2017 EXPORT_SYMBOL(sock_kzfree_s);
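/*
 * Usage sketch (illustrative only): option memory must be freed with the
 * same size it was allocated with so sk_omem_alloc stays balanced. A
 * hypothetical setsockopt() handler copying user data might look like:
 *
 *	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	if (copy_from_user(buf, optval, optlen)) {
 *		sock_kfree_s(sk, buf, optlen);
 *		return -EFAULT;
 *	}
 *	// ... use buf; free it with sock_kfree_s(sk, buf, optlen) when done,
 *	// or with sock_kzfree_s() if it held sensitive key material.
 */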
2018 
2019 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2020    I think these locks should be removed for datagram sockets.
2021  */
2022 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2023 {
2024 	DEFINE_WAIT(wait);
2025 
2026 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2027 	for (;;) {
2028 		if (!timeo)
2029 			break;
2030 		if (signal_pending(current))
2031 			break;
2032 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2033 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2034 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2035 			break;
2036 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2037 			break;
2038 		if (sk->sk_err)
2039 			break;
2040 		timeo = schedule_timeout(timeo);
2041 	}
2042 	finish_wait(sk_sleep(sk), &wait);
2043 	return timeo;
2044 }
2045 
2046 
2047 /*
2048  *	Generic send/receive buffer handlers
2049  */
2050 
2051 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2052 				     unsigned long data_len, int noblock,
2053 				     int *errcode, int max_page_order)
2054 {
2055 	struct sk_buff *skb;
2056 	long timeo;
2057 	int err;
2058 
2059 	timeo = sock_sndtimeo(sk, noblock);
2060 	for (;;) {
2061 		err = sock_error(sk);
2062 		if (err != 0)
2063 			goto failure;
2064 
2065 		err = -EPIPE;
2066 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2067 			goto failure;
2068 
2069 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2070 			break;
2071 
2072 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2073 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2074 		err = -EAGAIN;
2075 		if (!timeo)
2076 			goto failure;
2077 		if (signal_pending(current))
2078 			goto interrupted;
2079 		timeo = sock_wait_for_wmem(sk, timeo);
2080 	}
2081 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2082 				   errcode, sk->sk_allocation);
2083 	if (skb)
2084 		skb_set_owner_w(skb, sk);
2085 	return skb;
2086 
2087 interrupted:
2088 	err = sock_intr_errno(timeo);
2089 failure:
2090 	*errcode = err;
2091 	return NULL;
2092 }
2093 EXPORT_SYMBOL(sock_alloc_send_pskb);
2094 
2095 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2096 				    int noblock, int *errcode)
2097 {
2098 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2099 }
2100 EXPORT_SYMBOL(sock_alloc_send_skb);
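/*
 * Usage sketch (illustrative only): a datagram sendmsg() implementation
 * typically lets this helper sleep until send-buffer space is available,
 * honouring MSG_DONTWAIT. Names other than the helpers are assumptions:
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;		// -EAGAIN, -EPIPE, -EINTR, ...
 *	skb_reserve(skb, hlen);
 */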
2101 
2102 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2103 		     struct sockcm_cookie *sockc)
2104 {
2105 	u32 tsflags;
2106 
2107 	switch (cmsg->cmsg_type) {
2108 	case SO_MARK:
2109 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2110 			return -EPERM;
2111 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2112 			return -EINVAL;
2113 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2114 		break;
2115 	case SO_TIMESTAMPING:
2116 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2117 			return -EINVAL;
2118 
2119 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2120 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2121 			return -EINVAL;
2122 
2123 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2124 		sockc->tsflags |= tsflags;
2125 		break;
2126 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2127 	case SCM_RIGHTS:
2128 	case SCM_CREDENTIALS:
2129 		break;
2130 	default:
2131 		return -EINVAL;
2132 	}
2133 	return 0;
2134 }
2135 EXPORT_SYMBOL(__sock_cmsg_send);
2136 
2137 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2138 		   struct sockcm_cookie *sockc)
2139 {
2140 	struct cmsghdr *cmsg;
2141 	int ret;
2142 
2143 	for_each_cmsghdr(cmsg, msg) {
2144 		if (!CMSG_OK(msg, cmsg))
2145 			return -EINVAL;
2146 		if (cmsg->cmsg_level != SOL_SOCKET)
2147 			continue;
2148 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2149 		if (ret)
2150 			return ret;
2151 	}
2152 	return 0;
2153 }
2154 EXPORT_SYMBOL(sock_cmsg_send);
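/*
 * Usage sketch (illustrative only): protocols seed a sockcm_cookie from the
 * socket defaults and then let sock_cmsg_send() apply any SOL_SOCKET
 * control messages (SO_MARK, SO_TIMESTAMPING) supplied with the call:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *	int err;
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 *	// sockc.mark / sockc.tsflags are then used when building the skb.
 */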
2155 
2156 static void sk_enter_memory_pressure(struct sock *sk)
2157 {
2158 	if (!sk->sk_prot->enter_memory_pressure)
2159 		return;
2160 
2161 	sk->sk_prot->enter_memory_pressure(sk);
2162 }
2163 
2164 static void sk_leave_memory_pressure(struct sock *sk)
2165 {
2166 	if (sk->sk_prot->leave_memory_pressure) {
2167 		sk->sk_prot->leave_memory_pressure(sk);
2168 	} else {
2169 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2170 
2171 		if (memory_pressure && READ_ONCE(*memory_pressure))
2172 			WRITE_ONCE(*memory_pressure, 0);
2173 	}
2174 }
2175 
2176 /* On 32bit arches, an skb frag is limited to 2^15 */
2177 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2178 
2179 /**
2180  * skb_page_frag_refill - check that a page_frag contains enough room
2181  * @sz: minimum size of the fragment we want to get
2182  * @pfrag: pointer to page_frag
2183  * @gfp: priority for memory allocation
2184  *
2185  * Note: While this allocator tries to use high order pages, there is
2186  * no guarantee that allocations succeed. Therefore, @sz MUST be
2187  * less than or equal to PAGE_SIZE.
2188  */
2189 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2190 {
2191 	if (pfrag->page) {
2192 		if (page_ref_count(pfrag->page) == 1) {
2193 			pfrag->offset = 0;
2194 			return true;
2195 		}
2196 		if (pfrag->offset + sz <= pfrag->size)
2197 			return true;
2198 		put_page(pfrag->page);
2199 	}
2200 
2201 	pfrag->offset = 0;
2202 	if (SKB_FRAG_PAGE_ORDER) {
2203 		/* Avoid direct reclaim but allow kswapd to wake */
2204 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2205 					  __GFP_COMP | __GFP_NOWARN |
2206 					  __GFP_NORETRY,
2207 					  SKB_FRAG_PAGE_ORDER);
2208 		if (likely(pfrag->page)) {
2209 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2210 			return true;
2211 		}
2212 	}
2213 	pfrag->page = alloc_page(gfp);
2214 	if (likely(pfrag->page)) {
2215 		pfrag->size = PAGE_SIZE;
2216 		return true;
2217 	}
2218 	return false;
2219 }
2220 EXPORT_SYMBOL(skb_page_frag_refill);
2221 
2222 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2223 {
2224 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2225 		return true;
2226 
2227 	sk_enter_memory_pressure(sk);
2228 	sk_stream_moderate_sndbuf(sk);
2229 	return false;
2230 }
2231 EXPORT_SYMBOL(sk_page_frag_refill);
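/*
 * Usage sketch (illustrative only): stream protocols copy user data into
 * the per-socket (or per-task) page fragment and then attach that page to
 * an skb. The copy-length handling below is an assumption for the example:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;	// under pressure, sndbuf was moderated
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
 *			   copy, &msg->msg_iter) != copy)
 *		return -EFAULT;
 *	// attach pfrag->page to the skb, then pfrag->offset += copy;
 */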
2232 
2233 static void __lock_sock(struct sock *sk)
2234 	__releases(&sk->sk_lock.slock)
2235 	__acquires(&sk->sk_lock.slock)
2236 {
2237 	DEFINE_WAIT(wait);
2238 
2239 	for (;;) {
2240 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2241 					TASK_UNINTERRUPTIBLE);
2242 		spin_unlock_bh(&sk->sk_lock.slock);
2243 		schedule();
2244 		spin_lock_bh(&sk->sk_lock.slock);
2245 		if (!sock_owned_by_user(sk))
2246 			break;
2247 	}
2248 	finish_wait(&sk->sk_lock.wq, &wait);
2249 }
2250 
2251 void __release_sock(struct sock *sk)
2252 	__releases(&sk->sk_lock.slock)
2253 	__acquires(&sk->sk_lock.slock)
2254 {
2255 	struct sk_buff *skb, *next;
2256 
2257 	while ((skb = sk->sk_backlog.head) != NULL) {
2258 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2259 
2260 		spin_unlock_bh(&sk->sk_lock.slock);
2261 
2262 		do {
2263 			next = skb->next;
2264 			prefetch(next);
2265 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2266 			skb->next = NULL;
2267 			sk_backlog_rcv(sk, skb);
2268 
2269 			cond_resched();
2270 
2271 			skb = next;
2272 		} while (skb != NULL);
2273 
2274 		spin_lock_bh(&sk->sk_lock.slock);
2275 	}
2276 
2277 	/*
2278 	 * Doing the zeroing here guarantees we cannot loop forever
2279 	 * while a wild producer attempts to flood us.
2280 	 */
2281 	sk->sk_backlog.len = 0;
2282 }
2283 
2284 void __sk_flush_backlog(struct sock *sk)
2285 {
2286 	spin_lock_bh(&sk->sk_lock.slock);
2287 	__release_sock(sk);
2288 	spin_unlock_bh(&sk->sk_lock.slock);
2289 }
2290 
2291 /**
2292  * sk_wait_data - wait for data to arrive at sk_receive_queue
2293  * @sk:    sock to wait on
2294  * @timeo: for how long
2295  * @skb:   last skb seen on sk_receive_queue
2296  *
2297  * Now socket state including sk->sk_err is changed only under lock,
2298  * hence we may omit checks after joining wait queue.
2299  * We check receive queue before schedule() only as optimization;
2300  * it is very likely that release_sock() added new data.
2301  */
2302 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2303 {
2304 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2305 	int rc;
2306 
2307 	add_wait_queue(sk_sleep(sk), &wait);
2308 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2309 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2310 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2311 	remove_wait_queue(sk_sleep(sk), &wait);
2312 	return rc;
2313 }
2314 EXPORT_SYMBOL(sk_wait_data);
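/*
 * Usage sketch (illustrative only): a typical recvmsg() loop, entered with
 * the socket lock held, waits for the receive queue to change;
 * sk_wait_data() releases and re-takes the socket lock around the sleep.
 * The error handling below is an assumption for the example:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */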
2315 
2316 /**
2317  *	__sk_mem_raise_allocated - increase memory_allocated
2318  *	@sk: socket
2319  *	@size: memory size to allocate
2320  *	@amt: pages to allocate
2321  *	@kind: allocation type
2322  *
2323  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2324  */
2325 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2326 {
2327 	struct proto *prot = sk->sk_prot;
2328 	long allocated = sk_memory_allocated_add(sk, amt);
2329 
2330 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2331 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2332 		goto suppress_allocation;
2333 
2334 	/* Under limit. */
2335 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2336 		sk_leave_memory_pressure(sk);
2337 		return 1;
2338 	}
2339 
2340 	/* Under pressure. */
2341 	if (allocated > sk_prot_mem_limits(sk, 1))
2342 		sk_enter_memory_pressure(sk);
2343 
2344 	/* Over hard limit. */
2345 	if (allocated > sk_prot_mem_limits(sk, 2))
2346 		goto suppress_allocation;
2347 
2348 	/* guarantee minimum buffer size under pressure */
2349 	if (kind == SK_MEM_RECV) {
2350 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2351 			return 1;
2352 
2353 	} else { /* SK_MEM_SEND */
2354 		if (sk->sk_type == SOCK_STREAM) {
2355 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2356 				return 1;
2357 		} else if (refcount_read(&sk->sk_wmem_alloc) <
2358 			   prot->sysctl_wmem[0])
2359 				return 1;
2360 	}
2361 
2362 	if (sk_has_memory_pressure(sk)) {
2363 		u64 alloc;
2364 
2365 		if (!sk_under_memory_pressure(sk))
2366 			return 1;
2367 		alloc = sk_sockets_allocated_read_positive(sk);
2368 		if (sk_prot_mem_limits(sk, 2) > alloc *
2369 		    sk_mem_pages(sk->sk_wmem_queued +
2370 				 atomic_read(&sk->sk_rmem_alloc) +
2371 				 sk->sk_forward_alloc))
2372 			return 1;
2373 	}
2374 
2375 suppress_allocation:
2376 
2377 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2378 		sk_stream_moderate_sndbuf(sk);
2379 
2380 		/* Fail only if socket is _under_ its sndbuf.
2381 		 * In this case we cannot block, so we have to fail.
2382 		 */
2383 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2384 			return 1;
2385 	}
2386 
2387 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2388 
2389 	sk_memory_allocated_sub(sk, amt);
2390 
2391 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2392 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2393 
2394 	return 0;
2395 }
2396 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2397 
2398 /**
2399  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2400  *	@sk: socket
2401  *	@size: memory size to allocate
2402  *	@kind: allocation type
2403  *
2404  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2405  *	rmem allocation. This function assumes that protocols which have
2406  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2407  */
2408 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2409 {
2410 	int ret, amt = sk_mem_pages(size);
2411 
2412 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2413 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2414 	if (!ret)
2415 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2416 	return ret;
2417 }
2418 EXPORT_SYMBOL(__sk_mem_schedule);
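/*
 * Worked example (illustrative only): memory is scheduled in whole
 * SK_MEM_QUANTUM (PAGE_SIZE) units. On a system with 4 KiB pages, queueing
 * a 3000 byte skb on a socket with sk_forward_alloc == 0 goes roughly like
 * this, via the sk_rmem_schedule()/sk_mem_charge() wrappers in
 * include/net/sock.h:
 *
 *	sk_mem_pages(3000)	== 1 page
 *	__sk_mem_schedule()	-> sk_forward_alloc += 4096 (one quantum)
 *	sk_mem_charge(sk, 3000)	-> sk_forward_alloc -= 3000, leaving 1096
 *
 * A later 1000 byte skb fits inside the remaining 1096 bytes of forward
 * allocation and does not touch the protocol-wide memory_allocated counter.
 */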
2419 
2420 /**
2421  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2422  *	@sk: socket
2423  *	@amount: number of quanta
2424  *
2425  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2426  */
2427 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2428 {
2429 	sk_memory_allocated_sub(sk, amount);
2430 
2431 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2432 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2433 
2434 	if (sk_under_memory_pressure(sk) &&
2435 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2436 		sk_leave_memory_pressure(sk);
2437 }
2438 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2439 
2440 /**
2441  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2442  *	@sk: socket
2443  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2444  */
2445 void __sk_mem_reclaim(struct sock *sk, int amount)
2446 {
2447 	amount >>= SK_MEM_QUANTUM_SHIFT;
2448 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2449 	__sk_mem_reduce_allocated(sk, amount);
2450 }
2451 EXPORT_SYMBOL(__sk_mem_reclaim);
2452 
2453 int sk_set_peek_off(struct sock *sk, int val)
2454 {
2455 	sk->sk_peek_off = val;
2456 	return 0;
2457 }
2458 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2459 
2460 /*
2461  * Set of default routines for initialising struct proto_ops when
2462  * the protocol does not support a particular function. In certain
2463  * cases where it makes no sense for a protocol to have a "do nothing"
2464  * function, some default processing is provided.
2465  */
2466 
2467 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2468 {
2469 	return -EOPNOTSUPP;
2470 }
2471 EXPORT_SYMBOL(sock_no_bind);
2472 
2473 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2474 		    int len, int flags)
2475 {
2476 	return -EOPNOTSUPP;
2477 }
2478 EXPORT_SYMBOL(sock_no_connect);
2479 
2480 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2481 {
2482 	return -EOPNOTSUPP;
2483 }
2484 EXPORT_SYMBOL(sock_no_socketpair);
2485 
2486 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2487 		   bool kern)
2488 {
2489 	return -EOPNOTSUPP;
2490 }
2491 EXPORT_SYMBOL(sock_no_accept);
2492 
2493 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2494 		    int *len, int peer)
2495 {
2496 	return -EOPNOTSUPP;
2497 }
2498 EXPORT_SYMBOL(sock_no_getname);
2499 
2500 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2501 {
2502 	return 0;
2503 }
2504 EXPORT_SYMBOL(sock_no_poll);
2505 
2506 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2507 {
2508 	return -EOPNOTSUPP;
2509 }
2510 EXPORT_SYMBOL(sock_no_ioctl);
2511 
2512 int sock_no_listen(struct socket *sock, int backlog)
2513 {
2514 	return -EOPNOTSUPP;
2515 }
2516 EXPORT_SYMBOL(sock_no_listen);
2517 
2518 int sock_no_shutdown(struct socket *sock, int how)
2519 {
2520 	return -EOPNOTSUPP;
2521 }
2522 EXPORT_SYMBOL(sock_no_shutdown);
2523 
2524 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2525 		    char __user *optval, unsigned int optlen)
2526 {
2527 	return -EOPNOTSUPP;
2528 }
2529 EXPORT_SYMBOL(sock_no_setsockopt);
2530 
2531 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2532 		    char __user *optval, int __user *optlen)
2533 {
2534 	return -EOPNOTSUPP;
2535 }
2536 EXPORT_SYMBOL(sock_no_getsockopt);
2537 
2538 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2539 {
2540 	return -EOPNOTSUPP;
2541 }
2542 EXPORT_SYMBOL(sock_no_sendmsg);
2543 
2544 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2545 {
2546 	return -EOPNOTSUPP;
2547 }
2548 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2549 
2550 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2551 		    int flags)
2552 {
2553 	return -EOPNOTSUPP;
2554 }
2555 EXPORT_SYMBOL(sock_no_recvmsg);
2556 
2557 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2558 {
2559 	/* Mirror missing mmap method error code */
2560 	return -ENODEV;
2561 }
2562 EXPORT_SYMBOL(sock_no_mmap);
2563 
2564 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2565 {
2566 	ssize_t res;
2567 	struct msghdr msg = {.msg_flags = flags};
2568 	struct kvec iov;
2569 	char *kaddr = kmap(page);
2570 	iov.iov_base = kaddr + offset;
2571 	iov.iov_len = size;
2572 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2573 	kunmap(page);
2574 	return res;
2575 }
2576 EXPORT_SYMBOL(sock_no_sendpage);
2577 
2578 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2579 				int offset, size_t size, int flags)
2580 {
2581 	ssize_t res;
2582 	struct msghdr msg = {.msg_flags = flags};
2583 	struct kvec iov;
2584 	char *kaddr = kmap(page);
2585 
2586 	iov.iov_base = kaddr + offset;
2587 	iov.iov_len = size;
2588 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2589 	kunmap(page);
2590 	return res;
2591 }
2592 EXPORT_SYMBOL(sock_no_sendpage_locked);
2593 
2594 /*
2595  *	Default Socket Callbacks
2596  */
2597 
2598 static void sock_def_wakeup(struct sock *sk)
2599 {
2600 	struct socket_wq *wq;
2601 
2602 	rcu_read_lock();
2603 	wq = rcu_dereference(sk->sk_wq);
2604 	if (skwq_has_sleeper(wq))
2605 		wake_up_interruptible_all(&wq->wait);
2606 	rcu_read_unlock();
2607 }
2608 
2609 static void sock_def_error_report(struct sock *sk)
2610 {
2611 	struct socket_wq *wq;
2612 
2613 	rcu_read_lock();
2614 	wq = rcu_dereference(sk->sk_wq);
2615 	if (skwq_has_sleeper(wq))
2616 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2617 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2618 	rcu_read_unlock();
2619 }
2620 
2621 static void sock_def_readable(struct sock *sk)
2622 {
2623 	struct socket_wq *wq;
2624 
2625 	rcu_read_lock();
2626 	wq = rcu_dereference(sk->sk_wq);
2627 	if (skwq_has_sleeper(wq))
2628 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2629 						POLLRDNORM | POLLRDBAND);
2630 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2631 	rcu_read_unlock();
2632 }
2633 
2634 static void sock_def_write_space(struct sock *sk)
2635 {
2636 	struct socket_wq *wq;
2637 
2638 	rcu_read_lock();
2639 
2640 	/* Do not wake up a writer until he can make "significant"
2641 	 * progress.  --DaveM
2642 	 */
2643 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2644 		wq = rcu_dereference(sk->sk_wq);
2645 		if (skwq_has_sleeper(wq))
2646 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2647 						POLLWRNORM | POLLWRBAND);
2648 
2649 		/* Should agree with poll, otherwise some programs break */
2650 		if (sock_writeable(sk))
2651 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2652 	}
2653 
2654 	rcu_read_unlock();
2655 }
2656 
2657 static void sock_def_destruct(struct sock *sk)
2658 {
2659 }
2660 
2661 void sk_send_sigurg(struct sock *sk)
2662 {
2663 	if (sk->sk_socket && sk->sk_socket->file)
2664 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2665 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2666 }
2667 EXPORT_SYMBOL(sk_send_sigurg);
2668 
2669 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2670 		    unsigned long expires)
2671 {
2672 	if (!mod_timer(timer, expires))
2673 		sock_hold(sk);
2674 }
2675 EXPORT_SYMBOL(sk_reset_timer);
2676 
2677 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2678 {
2679 	if (del_timer(timer))
2680 		__sock_put(sk);
2681 }
2682 EXPORT_SYMBOL(sk_stop_timer);
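/*
 * Usage sketch (illustrative only): these helpers keep a socket reference
 * for as long as a timer is pending, so a protocol timer can safely
 * dereference its sock. The half-second period is an arbitrary example:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 2);	// holds sk
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);	// drops the ref if still pending
 *
 * Handlers following this pattern (the TCP timers, for instance) drop the
 * reference they were armed with via sock_put() when they run.
 */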
2683 
2684 void sock_init_data(struct socket *sock, struct sock *sk)
2685 {
2686 	sk_init_common(sk);
2687 	sk->sk_send_head	=	NULL;
2688 
2689 	init_timer(&sk->sk_timer);
2690 
2691 	sk->sk_allocation	=	GFP_KERNEL;
2692 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2693 	sk->sk_sndbuf		=	sysctl_wmem_default;
2694 	sk->sk_state		=	TCP_CLOSE;
2695 	sk_set_socket(sk, sock);
2696 
2697 	sock_set_flag(sk, SOCK_ZAPPED);
2698 
2699 	if (sock) {
2700 		sk->sk_type	=	sock->type;
2701 		sk->sk_wq	=	sock->wq;
2702 		sock->sk	=	sk;
2703 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2704 	} else {
2705 		sk->sk_wq	=	NULL;
2706 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2707 	}
2708 
2709 	rwlock_init(&sk->sk_callback_lock);
2710 	if (sk->sk_kern_sock)
2711 		lockdep_set_class_and_name(
2712 			&sk->sk_callback_lock,
2713 			af_kern_callback_keys + sk->sk_family,
2714 			af_family_kern_clock_key_strings[sk->sk_family]);
2715 	else
2716 		lockdep_set_class_and_name(
2717 			&sk->sk_callback_lock,
2718 			af_callback_keys + sk->sk_family,
2719 			af_family_clock_key_strings[sk->sk_family]);
2720 
2721 	sk->sk_state_change	=	sock_def_wakeup;
2722 	sk->sk_data_ready	=	sock_def_readable;
2723 	sk->sk_write_space	=	sock_def_write_space;
2724 	sk->sk_error_report	=	sock_def_error_report;
2725 	sk->sk_destruct		=	sock_def_destruct;
2726 
2727 	sk->sk_frag.page	=	NULL;
2728 	sk->sk_frag.offset	=	0;
2729 	sk->sk_peek_off		=	-1;
2730 
2731 	sk->sk_peer_pid 	=	NULL;
2732 	sk->sk_peer_cred	=	NULL;
2733 	sk->sk_write_pending	=	0;
2734 	sk->sk_rcvlowat		=	1;
2735 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2736 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2737 
2738 	sk->sk_stamp = SK_DEFAULT_STAMP;
2739 #if BITS_PER_LONG==32
2740 	seqlock_init(&sk->sk_stamp_seq);
2741 #endif
2742 	atomic_set(&sk->sk_zckey, 0);
2743 
2744 #ifdef CONFIG_NET_RX_BUSY_POLL
2745 	sk->sk_napi_id		=	0;
2746 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2747 #endif
2748 
2749 	sk->sk_max_pacing_rate = ~0U;
2750 	sk->sk_pacing_rate = ~0U;
2751 	sk->sk_incoming_cpu = -1;
2752 	/*
2753 	 * Before updating sk_refcnt, we must commit prior changes to memory
2754 	 * (Documentation/RCU/rculist_nulls.txt for details)
2755 	 */
2756 	smp_wmb();
2757 	refcount_set(&sk->sk_refcnt, 1);
2758 	atomic_set(&sk->sk_drops, 0);
2759 }
2760 EXPORT_SYMBOL(sock_init_data);
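/*
 * Usage sketch (illustrative only): an address family's ->create() hook is
 * the usual caller, pairing sk_alloc() with sock_init_data() before doing
 * protocol specific setup. "example_proto", PF_EXAMPLE and the destructor
 * are placeholders, not real identifiers:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);	// generic defaults, links sk <-> sock
 *	sk->sk_destruct = example_sock_destruct;
 *	sk->sk_protocol = protocol;
 */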
2761 
2762 void lock_sock_nested(struct sock *sk, int subclass)
2763 {
2764 	might_sleep();
2765 	spin_lock_bh(&sk->sk_lock.slock);
2766 	if (sk->sk_lock.owned)
2767 		__lock_sock(sk);
2768 	sk->sk_lock.owned = 1;
2769 	spin_unlock(&sk->sk_lock.slock);
2770 	/*
2771 	 * The sk_lock has mutex_lock() semantics here:
2772 	 */
2773 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2774 	local_bh_enable();
2775 }
2776 EXPORT_SYMBOL(lock_sock_nested);
2777 
2778 void release_sock(struct sock *sk)
2779 {
2780 	spin_lock_bh(&sk->sk_lock.slock);
2781 	if (sk->sk_backlog.tail)
2782 		__release_sock(sk);
2783 
2784 	/* Warning : release_cb() might need to release sk ownership,
2785 	 * ie call sock_release_ownership(sk) before us.
2786 	 */
2787 	if (sk->sk_prot->release_cb)
2788 		sk->sk_prot->release_cb(sk);
2789 
2790 	sock_release_ownership(sk);
2791 	if (waitqueue_active(&sk->sk_lock.wq))
2792 		wake_up(&sk->sk_lock.wq);
2793 	spin_unlock_bh(&sk->sk_lock.slock);
2794 }
2795 EXPORT_SYMBOL(release_sock);
2796 
2797 /**
2798  * lock_sock_fast - fast version of lock_sock
2799  * @sk: socket
2800  *
2801  * This version should be used for very small sections, where the process won't block.
2802  * Return false if the fast path is taken:
2803  *
2804  *   sk_lock.slock locked, owned = 0, BH disabled
2805  *
2806  * Return true if the slow path is taken:
2807  *
2808  *   sk_lock.slock unlocked, owned = 1, BH enabled
2809  */
2810 bool lock_sock_fast(struct sock *sk)
2811 {
2812 	might_sleep();
2813 	spin_lock_bh(&sk->sk_lock.slock);
2814 
2815 	if (!sk->sk_lock.owned)
2816 		/*
2817 		 * Note : We must disable BH
2818 		 */
2819 		return false;
2820 
2821 	__lock_sock(sk);
2822 	sk->sk_lock.owned = 1;
2823 	spin_unlock(&sk->sk_lock.slock);
2824 	/*
2825 	 * The sk_lock has mutex_lock() semantics here:
2826 	 */
2827 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2828 	local_bh_enable();
2829 	return true;
2830 }
2831 EXPORT_SYMBOL(lock_sock_fast);
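/*
 * Usage sketch (illustrative only): the return value must be handed back
 * to unlock_sock_fast() so the matching unlock path (re-enabling BH vs.
 * release_sock()) is taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// short, non-blocking work on the socket, e.g. reading counters
 *	unlock_sock_fast(sk, slow);
 */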
2832 
2833 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2834 {
2835 	struct timeval tv;
2836 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2837 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2838 	tv = ktime_to_timeval(sk->sk_stamp);
2839 	if (tv.tv_sec == -1)
2840 		return -ENOENT;
2841 	if (tv.tv_sec == 0) {
2842 		sk->sk_stamp = ktime_get_real();
2843 		tv = ktime_to_timeval(sk->sk_stamp);
2844 	}
2845 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2846 }
2847 EXPORT_SYMBOL(sock_get_timestamp);
2848 
2849 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2850 {
2851 	struct timespec ts;
2852 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2853 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2854 	ts = ktime_to_timespec(sk->sk_stamp);
2855 	if (ts.tv_sec == -1)
2856 		return -ENOENT;
2857 	if (ts.tv_sec == 0) {
2858 		sk->sk_stamp = ktime_get_real();
2859 		ts = ktime_to_timespec(sk->sk_stamp);
2860 	}
2861 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2862 }
2863 EXPORT_SYMBOL(sock_get_timestampns);
2864 
2865 void sock_enable_timestamp(struct sock *sk, int flag)
2866 {
2867 	if (!sock_flag(sk, flag)) {
2868 		unsigned long previous_flags = sk->sk_flags;
2869 
2870 		sock_set_flag(sk, flag);
2871 		/*
2872 		 * we just set one of the two flags which require net
2873 		 * time stamping, but time stamping might have been on
2874 		 * already because of the other one
2875 		 */
2876 		if (sock_needs_netstamp(sk) &&
2877 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2878 			net_enable_timestamp();
2879 	}
2880 }
2881 
2882 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2883 		       int level, int type)
2884 {
2885 	struct sock_exterr_skb *serr;
2886 	struct sk_buff *skb;
2887 	int copied, err;
2888 
2889 	err = -EAGAIN;
2890 	skb = sock_dequeue_err_skb(sk);
2891 	if (skb == NULL)
2892 		goto out;
2893 
2894 	copied = skb->len;
2895 	if (copied > len) {
2896 		msg->msg_flags |= MSG_TRUNC;
2897 		copied = len;
2898 	}
2899 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2900 	if (err)
2901 		goto out_free_skb;
2902 
2903 	sock_recv_timestamp(msg, sk, skb);
2904 
2905 	serr = SKB_EXT_ERR(skb);
2906 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2907 
2908 	msg->msg_flags |= MSG_ERRQUEUE;
2909 	err = copied;
2910 
2911 out_free_skb:
2912 	kfree_skb(skb);
2913 out:
2914 	return err;
2915 }
2916 EXPORT_SYMBOL(sock_recv_errqueue);
2917 
2918 /*
2919  *	Get a socket option on a socket.
2920  *
2921  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2922  *	asynchronous errors should be reported by getsockopt. We assume
2923  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2924  */
2925 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2926 			   char __user *optval, int __user *optlen)
2927 {
2928 	struct sock *sk = sock->sk;
2929 
2930 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2931 }
2932 EXPORT_SYMBOL(sock_common_getsockopt);
2933 
2934 #ifdef CONFIG_COMPAT
2935 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2936 				  char __user *optval, int __user *optlen)
2937 {
2938 	struct sock *sk = sock->sk;
2939 
2940 	if (sk->sk_prot->compat_getsockopt != NULL)
2941 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2942 						      optval, optlen);
2943 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2944 }
2945 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2946 #endif
2947 
2948 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2949 			int flags)
2950 {
2951 	struct sock *sk = sock->sk;
2952 	int addr_len = 0;
2953 	int err;
2954 
2955 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2956 				   flags & ~MSG_DONTWAIT, &addr_len);
2957 	if (err >= 0)
2958 		msg->msg_namelen = addr_len;
2959 	return err;
2960 }
2961 EXPORT_SYMBOL(sock_common_recvmsg);
2962 
2963 /*
2964  *	Set socket options on an inet socket.
2965  */
2966 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2967 			   char __user *optval, unsigned int optlen)
2968 {
2969 	struct sock *sk = sock->sk;
2970 
2971 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2972 }
2973 EXPORT_SYMBOL(sock_common_setsockopt);
2974 
2975 #ifdef CONFIG_COMPAT
2976 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2977 				  char __user *optval, unsigned int optlen)
2978 {
2979 	struct sock *sk = sock->sk;
2980 
2981 	if (sk->sk_prot->compat_setsockopt != NULL)
2982 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2983 						      optval, optlen);
2984 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2985 }
2986 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2987 #endif
2988 
2989 void sk_common_release(struct sock *sk)
2990 {
2991 	if (sk->sk_prot->destroy)
2992 		sk->sk_prot->destroy(sk);
2993 
2994 	/*
2995 	 * Observation: when sk_common_release is called, processes have
2996 	 * no access to the socket, but the network stack still does.
2997 	 * Step one, detach it from networking:
2998 	 *
2999 	 * A. Remove from hash tables.
3000 	 */
3001 
3002 	sk->sk_prot->unhash(sk);
3003 
3004 	/*
3005 	 * At this point the socket cannot receive new packets, but it is possible
3006 	 * that some packets are still in flight because some CPU ran the receiver
3007 	 * and did its hash table lookup before we unhashed the socket. They will
3008 	 * reach the receive queue and be purged by the socket destructor.
3009 	 *
3010 	 * Also, we may still have packets pending on the receive queue and,
3011 	 * probably, our own packets waiting in device queues. sock_destroy will
3012 	 * drain the receive queue, but transmitted packets will delay socket
3013 	 * destruction until the last reference is released.
3014 	 */
3015 
3016 	sock_orphan(sk);
3017 
3018 	xfrm_sk_free_policy(sk);
3019 
3020 	sk_refcnt_debug_release(sk);
3021 
3022 	sock_put(sk);
3023 }
3024 EXPORT_SYMBOL(sk_common_release);
3025 
3026 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3027 {
3028 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3029 
3030 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3031 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3032 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3033 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3034 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3035 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3036 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3037 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3038 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3039 }
3040 
3041 #ifdef CONFIG_PROC_FS
3042 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3043 struct prot_inuse {
3044 	int val[PROTO_INUSE_NR];
3045 };
3046 
3047 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3048 
3049 #ifdef CONFIG_NET_NS
3050 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3051 {
3052 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
3053 }
3054 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3055 
3056 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3057 {
3058 	int cpu, idx = prot->inuse_idx;
3059 	int res = 0;
3060 
3061 	for_each_possible_cpu(cpu)
3062 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3063 
3064 	return res >= 0 ? res : 0;
3065 }
3066 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3067 
3068 static int __net_init sock_inuse_init_net(struct net *net)
3069 {
3070 	net->core.inuse = alloc_percpu(struct prot_inuse);
3071 	return net->core.inuse ? 0 : -ENOMEM;
3072 }
3073 
3074 static void __net_exit sock_inuse_exit_net(struct net *net)
3075 {
3076 	free_percpu(net->core.inuse);
3077 }
3078 
3079 static struct pernet_operations net_inuse_ops = {
3080 	.init = sock_inuse_init_net,
3081 	.exit = sock_inuse_exit_net,
3082 };
3083 
3084 static __init int net_inuse_init(void)
3085 {
3086 	if (register_pernet_subsys(&net_inuse_ops))
3087 		panic("Cannot initialize net inuse counters");
3088 
3089 	return 0;
3090 }
3091 
3092 core_initcall(net_inuse_init);
3093 #else
3094 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3095 
3096 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3097 {
3098 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3099 }
3100 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3101 
3102 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3103 {
3104 	int cpu, idx = prot->inuse_idx;
3105 	int res = 0;
3106 
3107 	for_each_possible_cpu(cpu)
3108 		res += per_cpu(prot_inuse, cpu).val[idx];
3109 
3110 	return res >= 0 ? res : 0;
3111 }
3112 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3113 #endif
3114 
3115 static void assign_proto_idx(struct proto *prot)
3116 {
3117 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3118 
3119 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3120 		pr_err("PROTO_INUSE_NR exhausted\n");
3121 		return;
3122 	}
3123 
3124 	set_bit(prot->inuse_idx, proto_inuse_idx);
3125 }
3126 
3127 static void release_proto_idx(struct proto *prot)
3128 {
3129 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3130 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3131 }
3132 #else
3133 static inline void assign_proto_idx(struct proto *prot)
3134 {
3135 }
3136 
3137 static inline void release_proto_idx(struct proto *prot)
3138 {
3139 }
3140 #endif
3141 
3142 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3143 {
3144 	if (!rsk_prot)
3145 		return;
3146 	kfree(rsk_prot->slab_name);
3147 	rsk_prot->slab_name = NULL;
3148 	kmem_cache_destroy(rsk_prot->slab);
3149 	rsk_prot->slab = NULL;
3150 }
3151 
3152 static int req_prot_init(const struct proto *prot)
3153 {
3154 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3155 
3156 	if (!rsk_prot)
3157 		return 0;
3158 
3159 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3160 					prot->name);
3161 	if (!rsk_prot->slab_name)
3162 		return -ENOMEM;
3163 
3164 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3165 					   rsk_prot->obj_size, 0,
3166 					   prot->slab_flags, NULL);
3167 
3168 	if (!rsk_prot->slab) {
3169 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3170 			prot->name);
3171 		return -ENOMEM;
3172 	}
3173 	return 0;
3174 }
3175 
3176 int proto_register(struct proto *prot, int alloc_slab)
3177 {
3178 	if (alloc_slab) {
3179 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3180 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3181 					NULL);
3182 
3183 		if (prot->slab == NULL) {
3184 			pr_crit("%s: Can't create sock SLAB cache!\n",
3185 				prot->name);
3186 			goto out;
3187 		}
3188 
3189 		if (req_prot_init(prot))
3190 			goto out_free_request_sock_slab;
3191 
3192 		if (prot->twsk_prot != NULL) {
3193 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3194 
3195 			if (prot->twsk_prot->twsk_slab_name == NULL)
3196 				goto out_free_request_sock_slab;
3197 
3198 			prot->twsk_prot->twsk_slab =
3199 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3200 						  prot->twsk_prot->twsk_obj_size,
3201 						  0,
3202 						  prot->slab_flags,
3203 						  NULL);
3204 			if (prot->twsk_prot->twsk_slab == NULL)
3205 				goto out_free_timewait_sock_slab_name;
3206 		}
3207 	}
3208 
3209 	mutex_lock(&proto_list_mutex);
3210 	list_add(&prot->node, &proto_list);
3211 	assign_proto_idx(prot);
3212 	mutex_unlock(&proto_list_mutex);
3213 	return 0;
3214 
3215 out_free_timewait_sock_slab_name:
3216 	kfree(prot->twsk_prot->twsk_slab_name);
3217 out_free_request_sock_slab:
3218 	req_prot_cleanup(prot->rsk_prot);
3219 
3220 	kmem_cache_destroy(prot->slab);
3221 	prot->slab = NULL;
3222 out:
3223 	return -ENOBUFS;
3224 }
3225 EXPORT_SYMBOL(proto_register);
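/*
 * Usage sketch (illustrative only): a protocol module registers its struct
 * proto once at init time and unregisters it on exit; the slab created here
 * when alloc_slab != 0 is what sk_alloc() carves sockets from. All
 * "example" names are placeholders:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */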
3226 
3227 void proto_unregister(struct proto *prot)
3228 {
3229 	mutex_lock(&proto_list_mutex);
3230 	release_proto_idx(prot);
3231 	list_del(&prot->node);
3232 	mutex_unlock(&proto_list_mutex);
3233 
3234 	kmem_cache_destroy(prot->slab);
3235 	prot->slab = NULL;
3236 
3237 	req_prot_cleanup(prot->rsk_prot);
3238 
3239 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3240 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3241 		kfree(prot->twsk_prot->twsk_slab_name);
3242 		prot->twsk_prot->twsk_slab = NULL;
3243 	}
3244 }
3245 EXPORT_SYMBOL(proto_unregister);
3246 
3247 #ifdef CONFIG_PROC_FS
3248 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3249 	__acquires(proto_list_mutex)
3250 {
3251 	mutex_lock(&proto_list_mutex);
3252 	return seq_list_start_head(&proto_list, *pos);
3253 }
3254 
3255 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3256 {
3257 	return seq_list_next(v, &proto_list, pos);
3258 }
3259 
3260 static void proto_seq_stop(struct seq_file *seq, void *v)
3261 	__releases(proto_list_mutex)
3262 {
3263 	mutex_unlock(&proto_list_mutex);
3264 }
3265 
3266 static char proto_method_implemented(const void *method)
3267 {
3268 	return method == NULL ? 'n' : 'y';
3269 }
3270 static long sock_prot_memory_allocated(struct proto *proto)
3271 {
3272 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3273 }
3274 
3275 static char *sock_prot_memory_pressure(struct proto *proto)
3276 {
3277 	return proto->memory_pressure != NULL ?
3278 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3279 }
3280 
3281 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3282 {
3283 
3284 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3285 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3286 		   proto->name,
3287 		   proto->obj_size,
3288 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3289 		   sock_prot_memory_allocated(proto),
3290 		   sock_prot_memory_pressure(proto),
3291 		   proto->max_header,
3292 		   proto->slab == NULL ? "no" : "yes",
3293 		   module_name(proto->owner),
3294 		   proto_method_implemented(proto->close),
3295 		   proto_method_implemented(proto->connect),
3296 		   proto_method_implemented(proto->disconnect),
3297 		   proto_method_implemented(proto->accept),
3298 		   proto_method_implemented(proto->ioctl),
3299 		   proto_method_implemented(proto->init),
3300 		   proto_method_implemented(proto->destroy),
3301 		   proto_method_implemented(proto->shutdown),
3302 		   proto_method_implemented(proto->setsockopt),
3303 		   proto_method_implemented(proto->getsockopt),
3304 		   proto_method_implemented(proto->sendmsg),
3305 		   proto_method_implemented(proto->recvmsg),
3306 		   proto_method_implemented(proto->sendpage),
3307 		   proto_method_implemented(proto->bind),
3308 		   proto_method_implemented(proto->backlog_rcv),
3309 		   proto_method_implemented(proto->hash),
3310 		   proto_method_implemented(proto->unhash),
3311 		   proto_method_implemented(proto->get_port),
3312 		   proto_method_implemented(proto->enter_memory_pressure));
3313 }
3314 
3315 static int proto_seq_show(struct seq_file *seq, void *v)
3316 {
3317 	if (v == &proto_list)
3318 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3319 			   "protocol",
3320 			   "size",
3321 			   "sockets",
3322 			   "memory",
3323 			   "press",
3324 			   "maxhdr",
3325 			   "slab",
3326 			   "module",
3327 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3328 	else
3329 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3330 	return 0;
3331 }
3332 
3333 static const struct seq_operations proto_seq_ops = {
3334 	.start  = proto_seq_start,
3335 	.next   = proto_seq_next,
3336 	.stop   = proto_seq_stop,
3337 	.show   = proto_seq_show,
3338 };
3339 
3340 static int proto_seq_open(struct inode *inode, struct file *file)
3341 {
3342 	return seq_open_net(inode, file, &proto_seq_ops,
3343 			    sizeof(struct seq_net_private));
3344 }
3345 
3346 static const struct file_operations proto_seq_fops = {
3347 	.owner		= THIS_MODULE,
3348 	.open		= proto_seq_open,
3349 	.read		= seq_read,
3350 	.llseek		= seq_lseek,
3351 	.release	= seq_release_net,
3352 };
3353 
3354 static __net_init int proto_init_net(struct net *net)
3355 {
3356 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3357 		return -ENOMEM;
3358 
3359 	return 0;
3360 }
3361 
3362 static __net_exit void proto_exit_net(struct net *net)
3363 {
3364 	remove_proc_entry("protocols", net->proc_net);
3365 }
3366 
3367 
3368 static __net_initdata struct pernet_operations proto_net_ops = {
3369 	.init = proto_init_net,
3370 	.exit = proto_exit_net,
3371 };
3372 
3373 static int __init proto_init(void)
3374 {
3375 	return register_pernet_subsys(&proto_net_ops);
3376 }
3377 
3378 subsys_initcall(proto_init);
3379 
3380 #endif /* PROC_FS */
3381 
3382 #ifdef CONFIG_NET_RX_BUSY_POLL
3383 bool sk_busy_loop_end(void *p, unsigned long start_time)
3384 {
3385 	struct sock *sk = p;
3386 
3387 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3388 	       sk_busy_loop_timeout(sk, start_time);
3389 }
3390 EXPORT_SYMBOL(sk_busy_loop_end);
3391 #endif /* CONFIG_NET_RX_BUSY_POLL */
3392