1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 
135 #include <linux/filter.h>
136 
137 #include <trace/events/sock.h>
138 
139 #ifdef CONFIG_INET
140 #include <net/tcp.h>
141 #endif
142 
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 /**
149  * sk_ns_capable - General socket capability test
150  * @sk: Socket to use a capability on or through
151  * @user_ns: The user namespace of the capability to use
152  * @cap: The capability to use
153  *
154  * Test to see if the opener of the socket had the capability @cap when
155  * the socket was created and the current process has the capability @cap
156  * in the user namespace @user_ns.
157  */
158 bool sk_ns_capable(const struct sock *sk,
159 		   struct user_namespace *user_ns, int cap)
160 {
161 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 		ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165 
166 /**
167  * sk_capable - Socket global capability test
168  * @sk: Socket to use a capability on or through
169  * @cap: The global capability to use
170  *
171  * Test to see if the opener of the socket had the capability @cap when
172  * the socket was created and the current process has the capability @cap
173  * in all user namespaces.
174  */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177 	return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180 
181 /**
182  * sk_net_capable - Network namespace socket capability test
183  * @sk: Socket to use a capability on or through
184  * @cap: The capability to use
185  *
186  * Test to see if the opener of the socket had the capability @cap when the
187  * socket was created and the current process has the capability @cap over
188  * the network namespace the socket is a member of.
189  */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
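/*
 * Illustrative caller (sketch, hypothetical protocol code): a privileged
 * configuration path would typically gate itself on one of the helpers
 * above, e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which requires both that the socket's opener had CAP_NET_ADMIN when the
 * socket was created and that the current task has it over the socket's
 * network namespace.
 */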
195 
196 
197 #ifdef CONFIG_MEMCG_KMEM
198 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
199 {
200 	struct proto *proto;
201 	int ret = 0;
202 
203 	mutex_lock(&proto_list_mutex);
204 	list_for_each_entry(proto, &proto_list, node) {
205 		if (proto->init_cgroup) {
206 			ret = proto->init_cgroup(memcg, ss);
207 			if (ret)
208 				goto out;
209 		}
210 	}
211 
212 	mutex_unlock(&proto_list_mutex);
213 	return ret;
214 out:
215 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
216 		if (proto->destroy_cgroup)
217 			proto->destroy_cgroup(memcg);
218 	mutex_unlock(&proto_list_mutex);
219 	return ret;
220 }
221 
222 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
223 {
224 	struct proto *proto;
225 
226 	mutex_lock(&proto_list_mutex);
227 	list_for_each_entry_reverse(proto, &proto_list, node)
228 		if (proto->destroy_cgroup)
229 			proto->destroy_cgroup(memcg);
230 	mutex_unlock(&proto_list_mutex);
231 }
232 #endif
233 
234 /*
235  * Each address family might have different locking rules, so we have
236  * one slock key per address family:
237  */
238 static struct lock_class_key af_family_keys[AF_MAX];
239 static struct lock_class_key af_family_slock_keys[AF_MAX];
240 
241 #if defined(CONFIG_MEMCG_KMEM)
242 struct static_key memcg_socket_limit_enabled;
243 EXPORT_SYMBOL(memcg_socket_limit_enabled);
244 #endif
245 
246 /*
247  * Make lock validator output more readable. (we pre-construct these
248  * strings build-time, so that runtime initialization of socket
249  * locks is fast):
250  */
251 static const char *const af_family_key_strings[AF_MAX+1] = {
252   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
253   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
254   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
255   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
256   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
257   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
258   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
259   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
260   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
261   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
262   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
263   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
264   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
265   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
266 };
267 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
268   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
269   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
270   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
271   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
272   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
273   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
274   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
275   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
276   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
277   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
278   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
279   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
280   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
281   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
282 };
283 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
284   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
285   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
286   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
287   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
288   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
289   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
290   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
291   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
292   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
293   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
294   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
295   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
296   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
297   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
298 };
299 
300 /*
301  * sk_callback_lock locking rules are per-address-family,
302  * so split the lock classes by using a per-AF key:
303  */
304 static struct lock_class_key af_callback_keys[AF_MAX];
305 
306 /* Take into consideration the size of the struct sk_buff overhead in the
307  * determination of these values, since that is non-constant across
308  * platforms.  This makes socket queueing behavior and performance
309  * not depend upon such differences.
310  */
311 #define _SK_MEM_PACKETS		256
312 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
313 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
314 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
315 
316 /* Run time adjustable parameters. */
317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318 EXPORT_SYMBOL(sysctl_wmem_max);
319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320 EXPORT_SYMBOL(sysctl_rmem_max);
321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
323 
324 /* Maximal space eaten by iovec or ancillary data plus some space */
325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326 EXPORT_SYMBOL(sysctl_optmem_max);
327 
328 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
329 EXPORT_SYMBOL_GPL(memalloc_socks);
330 
331 /**
332  * sk_set_memalloc - sets %SOCK_MEMALLOC
333  * @sk: socket to set it on
334  *
335  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
336  * It's the responsibility of the admin to adjust min_free_kbytes
337  * to meet the requirements
338  */
339 void sk_set_memalloc(struct sock *sk)
340 {
341 	sock_set_flag(sk, SOCK_MEMALLOC);
342 	sk->sk_allocation |= __GFP_MEMALLOC;
343 	static_key_slow_inc(&memalloc_socks);
344 }
345 EXPORT_SYMBOL_GPL(sk_set_memalloc);
346 
347 void sk_clear_memalloc(struct sock *sk)
348 {
349 	sock_reset_flag(sk, SOCK_MEMALLOC);
350 	sk->sk_allocation &= ~__GFP_MEMALLOC;
351 	static_key_slow_dec(&memalloc_socks);
352 
353 	/*
354 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
355 	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
356 	 * it has rmem allocations there is a risk that the user of the
357 	 * socket cannot make forward progress due to exceeding the rmem
358 	 * limits. By rights, sk_clear_memalloc() should only be called
359 	 * on sockets being torn down but warn and reset the accounting if
360 	 * that assumption breaks.
361 	 */
362 	if (WARN_ON(sk->sk_forward_alloc))
363 		sk_mem_reclaim(sk);
364 }
365 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
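/*
 * Illustrative use (sketch): a driver that carries swap I/O over a socket
 * (e.g. a network block device) marks its transport socket so receive-path
 * allocations may dip into the emergency reserves, and clears the flag only
 * when the socket is torn down:
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 */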
366 
367 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368 {
369 	int ret;
370 	unsigned long pflags = current->flags;
371 
372 	/* these should have been dropped before queueing */
373 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374 
375 	current->flags |= PF_MEMALLOC;
376 	ret = sk->sk_backlog_rcv(sk, skb);
377 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
378 
379 	return ret;
380 }
381 EXPORT_SYMBOL(__sk_backlog_rcv);
382 
383 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384 {
385 	struct timeval tv;
386 
387 	if (optlen < sizeof(tv))
388 		return -EINVAL;
389 	if (copy_from_user(&tv, optval, sizeof(tv)))
390 		return -EFAULT;
391 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392 		return -EDOM;
393 
394 	if (tv.tv_sec < 0) {
395 		static int warned __read_mostly;
396 
397 		*timeo_p = 0;
398 		if (warned < 10 && net_ratelimit()) {
399 			warned++;
400 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401 				__func__, current->comm, task_pid_nr(current));
402 		}
403 		return 0;
404 	}
405 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
406 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
407 		return 0;
408 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
410 	return 0;
411 }
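/*
 * Userspace view of the conversion above (illustrative sketch): the timeval
 * is validated, then rounded up to jiffies; {0, 0} means "block forever",
 * tv_usec outside [0, 1s) yields -EDOM, and a negative tv_sec is treated as
 * an immediate timeout:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("SO_RCVTIMEO");
 */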
412 
413 static void sock_warn_obsolete_bsdism(const char *name)
414 {
415 	static int warned;
416 	static char warncomm[TASK_COMM_LEN];
417 	if (strcmp(warncomm, current->comm) && warned < 5) {
418 		strcpy(warncomm,  current->comm);
419 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420 			warncomm, name);
421 		warned++;
422 	}
423 }
424 
425 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
426 {
427 	if (sk->sk_flags & flags) {
428 		sk->sk_flags &= ~flags;
429 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
430 			net_disable_timestamp();
431 	}
432 }
433 
434 
435 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
436 {
437 	int err;
438 	unsigned long flags;
439 	struct sk_buff_head *list = &sk->sk_receive_queue;
440 
441 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
442 		atomic_inc(&sk->sk_drops);
443 		trace_sock_rcvqueue_full(sk, skb);
444 		return -ENOMEM;
445 	}
446 
447 	err = sk_filter(sk, skb);
448 	if (err)
449 		return err;
450 
451 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
452 		atomic_inc(&sk->sk_drops);
453 		return -ENOBUFS;
454 	}
455 
456 	skb->dev = NULL;
457 	skb_set_owner_r(skb, sk);
458 
459 	/* we escape from the rcu protected region, make sure we don't leak
460 	 * a non-refcounted dst
461 	 */
462 	skb_dst_force(skb);
463 
464 	spin_lock_irqsave(&list->lock, flags);
465 	skb->dropcount = atomic_read(&sk->sk_drops);
466 	__skb_queue_tail(list, skb);
467 	spin_unlock_irqrestore(&list->lock, flags);
468 
469 	if (!sock_flag(sk, SOCK_DEAD))
470 		sk->sk_data_ready(sk);
471 	return 0;
472 }
473 EXPORT_SYMBOL(sock_queue_rcv_skb);
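/*
 * Typical caller pattern (illustrative sketch, hypothetical protocol): the
 * delivery path still owns the skb on failure, since sock_queue_rcv_skb()
 * only consumes it on success:
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0) {		// -ENOMEM: rcvbuf full, -ENOBUFS: no rmem
 *		kfree_skb(skb);
 *		return err;
 *	}
 */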
474 
475 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
476 {
477 	int rc = NET_RX_SUCCESS;
478 
479 	if (sk_filter(sk, skb))
480 		goto discard_and_relse;
481 
482 	skb->dev = NULL;
483 
484 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
485 		atomic_inc(&sk->sk_drops);
486 		goto discard_and_relse;
487 	}
488 	if (nested)
489 		bh_lock_sock_nested(sk);
490 	else
491 		bh_lock_sock(sk);
492 	if (!sock_owned_by_user(sk)) {
493 		/*
494 		 * trylock + unlock semantics:
495 		 */
496 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
497 
498 		rc = sk_backlog_rcv(sk, skb);
499 
500 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
501 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
502 		bh_unlock_sock(sk);
503 		atomic_inc(&sk->sk_drops);
504 		goto discard_and_relse;
505 	}
506 
507 	bh_unlock_sock(sk);
508 out:
509 	sock_put(sk);
510 	return rc;
511 discard_and_relse:
512 	kfree_skb(skb);
513 	goto out;
514 }
515 EXPORT_SYMBOL(sk_receive_skb);
516 
517 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
518 {
519 	struct dst_entry *dst = __sk_dst_get(sk);
520 
521 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
522 		sk_tx_queue_clear(sk);
523 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
524 		dst_release(dst);
525 		return NULL;
526 	}
527 
528 	return dst;
529 }
530 EXPORT_SYMBOL(__sk_dst_check);
531 
532 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
533 {
534 	struct dst_entry *dst = sk_dst_get(sk);
535 
536 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
537 		sk_dst_reset(sk);
538 		dst_release(dst);
539 		return NULL;
540 	}
541 
542 	return dst;
543 }
544 EXPORT_SYMBOL(sk_dst_check);
545 
546 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
547 				int optlen)
548 {
549 	int ret = -ENOPROTOOPT;
550 #ifdef CONFIG_NETDEVICES
551 	struct net *net = sock_net(sk);
552 	char devname[IFNAMSIZ];
553 	int index;
554 
555 	/* Sorry... */
556 	ret = -EPERM;
557 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
558 		goto out;
559 
560 	ret = -EINVAL;
561 	if (optlen < 0)
562 		goto out;
563 
564 	/* Bind this socket to a particular device like "eth0",
565 	 * as specified in the passed interface name. If the
566 	 * name is "" or the option length is zero the socket
567 	 * is not bound.
568 	 */
569 	if (optlen > IFNAMSIZ - 1)
570 		optlen = IFNAMSIZ - 1;
571 	memset(devname, 0, sizeof(devname));
572 
573 	ret = -EFAULT;
574 	if (copy_from_user(devname, optval, optlen))
575 		goto out;
576 
577 	index = 0;
578 	if (devname[0] != '\0') {
579 		struct net_device *dev;
580 
581 		rcu_read_lock();
582 		dev = dev_get_by_name_rcu(net, devname);
583 		if (dev)
584 			index = dev->ifindex;
585 		rcu_read_unlock();
586 		ret = -ENODEV;
587 		if (!dev)
588 			goto out;
589 	}
590 
591 	lock_sock(sk);
592 	sk->sk_bound_dev_if = index;
593 	sk_dst_reset(sk);
594 	release_sock(sk);
595 
596 	ret = 0;
597 
598 out:
599 #endif
600 
601 	return ret;
602 }
603 
604 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
605 				int __user *optlen, int len)
606 {
607 	int ret = -ENOPROTOOPT;
608 #ifdef CONFIG_NETDEVICES
609 	struct net *net = sock_net(sk);
610 	char devname[IFNAMSIZ];
611 
612 	if (sk->sk_bound_dev_if == 0) {
613 		len = 0;
614 		goto zero;
615 	}
616 
617 	ret = -EINVAL;
618 	if (len < IFNAMSIZ)
619 		goto out;
620 
621 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
622 	if (ret)
623 		goto out;
624 
625 	len = strlen(devname) + 1;
626 
627 	ret = -EFAULT;
628 	if (copy_to_user(optval, devname, len))
629 		goto out;
630 
631 zero:
632 	ret = -EFAULT;
633 	if (put_user(len, optlen))
634 		goto out;
635 
636 	ret = 0;
637 
638 out:
639 #endif
640 
641 	return ret;
642 }
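/*
 * Illustrative userspace usage of the pair above (sketch): setting the
 * binding needs CAP_NET_RAW and an empty name clears it; reading it back
 * needs a buffer of at least IFNAMSIZ bytes:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 *	char name[IFNAMSIZ];
 *	socklen_t len = sizeof(name);
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 */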
643 
644 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
645 {
646 	if (valbool)
647 		sock_set_flag(sk, bit);
648 	else
649 		sock_reset_flag(sk, bit);
650 }
651 
652 bool sk_mc_loop(struct sock *sk)
653 {
654 	if (dev_recursion_level())
655 		return false;
656 	if (!sk)
657 		return true;
658 	switch (sk->sk_family) {
659 	case AF_INET:
660 		return inet_sk(sk)->mc_loop;
661 #if IS_ENABLED(CONFIG_IPV6)
662 	case AF_INET6:
663 		return inet6_sk(sk)->mc_loop;
664 #endif
665 	}
666 	WARN_ON(1);
667 	return true;
668 }
669 EXPORT_SYMBOL(sk_mc_loop);
670 
671 /*
672  *	This is meant for all protocols to use and covers goings on
673  *	at the socket level. Everything here is generic.
674  */
675 
676 int sock_setsockopt(struct socket *sock, int level, int optname,
677 		    char __user *optval, unsigned int optlen)
678 {
679 	struct sock *sk = sock->sk;
680 	int val;
681 	int valbool;
682 	struct linger ling;
683 	int ret = 0;
684 
685 	/*
686 	 *	Options without arguments
687 	 */
688 
689 	if (optname == SO_BINDTODEVICE)
690 		return sock_setbindtodevice(sk, optval, optlen);
691 
692 	if (optlen < sizeof(int))
693 		return -EINVAL;
694 
695 	if (get_user(val, (int __user *)optval))
696 		return -EFAULT;
697 
698 	valbool = val ? 1 : 0;
699 
700 	lock_sock(sk);
701 
702 	switch (optname) {
703 	case SO_DEBUG:
704 		if (val && !capable(CAP_NET_ADMIN))
705 			ret = -EACCES;
706 		else
707 			sock_valbool_flag(sk, SOCK_DBG, valbool);
708 		break;
709 	case SO_REUSEADDR:
710 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
711 		break;
712 	case SO_REUSEPORT:
713 		sk->sk_reuseport = valbool;
714 		break;
715 	case SO_TYPE:
716 	case SO_PROTOCOL:
717 	case SO_DOMAIN:
718 	case SO_ERROR:
719 		ret = -ENOPROTOOPT;
720 		break;
721 	case SO_DONTROUTE:
722 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
723 		break;
724 	case SO_BROADCAST:
725 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
726 		break;
727 	case SO_SNDBUF:
728 		/* Don't error on this; BSD doesn't, and if you think
729 		 * about it this is right. Otherwise apps have to
730 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 		 * are treated in BSD as hints.
732 		 */
733 		val = min_t(u32, val, sysctl_wmem_max);
734 set_sndbuf:
735 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
736 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
737 		/* Wake up sending tasks if we upped the value. */
738 		sk->sk_write_space(sk);
739 		break;
740 
741 	case SO_SNDBUFFORCE:
742 		if (!capable(CAP_NET_ADMIN)) {
743 			ret = -EPERM;
744 			break;
745 		}
746 		goto set_sndbuf;
747 
748 	case SO_RCVBUF:
749 		/* Don't error on this; BSD doesn't, and if you think
750 		 * about it this is right. Otherwise apps have to
751 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
752 		 * are treated in BSD as hints.
753 		 */
754 		val = min_t(u32, val, sysctl_rmem_max);
755 set_rcvbuf:
756 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
757 		/*
758 		 * We double it on the way in to account for
759 		 * "struct sk_buff" etc. overhead.   Applications
760 		 * assume that the SO_RCVBUF setting they make will
761 		 * allow that much actual data to be received on that
762 		 * socket.
763 		 *
764 		 * Applications are unaware that "struct sk_buff" and
765 		 * other overheads allocate from the receive buffer
766 		 * during socket buffer allocation.
767 		 *
768 		 * And after considering the possible alternatives,
769 		 * returning the value we actually used in getsockopt
770 		 * is the most desirable behavior.
771 		 */
772 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
773 		break;
774 
775 	case SO_RCVBUFFORCE:
776 		if (!capable(CAP_NET_ADMIN)) {
777 			ret = -EPERM;
778 			break;
779 		}
780 		goto set_rcvbuf;
781 
782 	case SO_KEEPALIVE:
783 #ifdef CONFIG_INET
784 		if (sk->sk_protocol == IPPROTO_TCP &&
785 		    sk->sk_type == SOCK_STREAM)
786 			tcp_set_keepalive(sk, valbool);
787 #endif
788 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
789 		break;
790 
791 	case SO_OOBINLINE:
792 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
793 		break;
794 
795 	case SO_NO_CHECK:
796 		sk->sk_no_check_tx = valbool;
797 		break;
798 
799 	case SO_PRIORITY:
800 		if ((val >= 0 && val <= 6) ||
801 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
802 			sk->sk_priority = val;
803 		else
804 			ret = -EPERM;
805 		break;
806 
807 	case SO_LINGER:
808 		if (optlen < sizeof(ling)) {
809 			ret = -EINVAL;	/* 1003.1g */
810 			break;
811 		}
812 		if (copy_from_user(&ling, optval, sizeof(ling))) {
813 			ret = -EFAULT;
814 			break;
815 		}
816 		if (!ling.l_onoff)
817 			sock_reset_flag(sk, SOCK_LINGER);
818 		else {
819 #if (BITS_PER_LONG == 32)
820 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
821 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
822 			else
823 #endif
824 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
825 			sock_set_flag(sk, SOCK_LINGER);
826 		}
827 		break;
828 
829 	case SO_BSDCOMPAT:
830 		sock_warn_obsolete_bsdism("setsockopt");
831 		break;
832 
833 	case SO_PASSCRED:
834 		if (valbool)
835 			set_bit(SOCK_PASSCRED, &sock->flags);
836 		else
837 			clear_bit(SOCK_PASSCRED, &sock->flags);
838 		break;
839 
840 	case SO_TIMESTAMP:
841 	case SO_TIMESTAMPNS:
842 		if (valbool)  {
843 			if (optname == SO_TIMESTAMP)
844 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
845 			else
846 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
847 			sock_set_flag(sk, SOCK_RCVTSTAMP);
848 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
849 		} else {
850 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
851 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
852 		}
853 		break;
854 
855 	case SO_TIMESTAMPING:
856 		if (val & ~SOF_TIMESTAMPING_MASK) {
857 			ret = -EINVAL;
858 			break;
859 		}
860 		if (val & SOF_TIMESTAMPING_OPT_ID &&
861 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
862 			if (sk->sk_protocol == IPPROTO_TCP &&
863 			    sk->sk_type == SOCK_STREAM) {
864 				if (sk->sk_state != TCP_ESTABLISHED) {
865 					ret = -EINVAL;
866 					break;
867 				}
868 				sk->sk_tskey = tcp_sk(sk)->snd_una;
869 			} else {
870 				sk->sk_tskey = 0;
871 			}
872 		}
873 		sk->sk_tsflags = val;
874 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
875 			sock_enable_timestamp(sk,
876 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
877 		else
878 			sock_disable_timestamp(sk,
879 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
880 		break;
881 
882 	case SO_RCVLOWAT:
883 		if (val < 0)
884 			val = INT_MAX;
885 		sk->sk_rcvlowat = val ? : 1;
886 		break;
887 
888 	case SO_RCVTIMEO:
889 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
890 		break;
891 
892 	case SO_SNDTIMEO:
893 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
894 		break;
895 
896 	case SO_ATTACH_FILTER:
897 		ret = -EINVAL;
898 		if (optlen == sizeof(struct sock_fprog)) {
899 			struct sock_fprog fprog;
900 
901 			ret = -EFAULT;
902 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
903 				break;
904 
905 			ret = sk_attach_filter(&fprog, sk);
906 		}
907 		break;
908 
909 	case SO_DETACH_FILTER:
910 		ret = sk_detach_filter(sk);
911 		break;
912 
913 	case SO_LOCK_FILTER:
914 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
915 			ret = -EPERM;
916 		else
917 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
918 		break;
919 
920 	case SO_PASSSEC:
921 		if (valbool)
922 			set_bit(SOCK_PASSSEC, &sock->flags);
923 		else
924 			clear_bit(SOCK_PASSSEC, &sock->flags);
925 		break;
926 	case SO_MARK:
927 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
928 			ret = -EPERM;
929 		else
930 			sk->sk_mark = val;
931 		break;
932 
933 		/* We implement the SO_SNDLOWAT etc to
934 		   not be settable (1003.1g 5.3) */
935 	case SO_RXQ_OVFL:
936 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
937 		break;
938 
939 	case SO_WIFI_STATUS:
940 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
941 		break;
942 
943 	case SO_PEEK_OFF:
944 		if (sock->ops->set_peek_off)
945 			ret = sock->ops->set_peek_off(sk, val);
946 		else
947 			ret = -EOPNOTSUPP;
948 		break;
949 
950 	case SO_NOFCS:
951 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
952 		break;
953 
954 	case SO_SELECT_ERR_QUEUE:
955 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
956 		break;
957 
958 #ifdef CONFIG_NET_RX_BUSY_POLL
959 	case SO_BUSY_POLL:
960 		/* allow unprivileged users to decrease the value */
961 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
962 			ret = -EPERM;
963 		else {
964 			if (val < 0)
965 				ret = -EINVAL;
966 			else
967 				sk->sk_ll_usec = val;
968 		}
969 		break;
970 #endif
971 
972 	case SO_MAX_PACING_RATE:
973 		sk->sk_max_pacing_rate = val;
974 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
975 					 sk->sk_max_pacing_rate);
976 		break;
977 
978 	default:
979 		ret = -ENOPROTOOPT;
980 		break;
981 	}
982 	release_sock(sk);
983 	return ret;
984 }
985 EXPORT_SYMBOL(sock_setsockopt);
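/*
 * Illustrative consequence of the SO_SNDBUF/SO_RCVBUF handling above
 * (userspace sketch): the requested value is capped by the wmem/rmem
 * sysctls, doubled to cover struct sk_buff overhead, and the doubled value
 * is what getsockopt() later reports:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is typically 2 * val, clamped by sysctl_rmem_max/SOCK_MIN_RCVBUF
 */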
986 
987 
988 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
989 			  struct ucred *ucred)
990 {
991 	ucred->pid = pid_vnr(pid);
992 	ucred->uid = ucred->gid = -1;
993 	if (cred) {
994 		struct user_namespace *current_ns = current_user_ns();
995 
996 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
997 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
998 	}
999 }
1000 
1001 int sock_getsockopt(struct socket *sock, int level, int optname,
1002 		    char __user *optval, int __user *optlen)
1003 {
1004 	struct sock *sk = sock->sk;
1005 
1006 	union {
1007 		int val;
1008 		struct linger ling;
1009 		struct timeval tm;
1010 	} v;
1011 
1012 	int lv = sizeof(int);
1013 	int len;
1014 
1015 	if (get_user(len, optlen))
1016 		return -EFAULT;
1017 	if (len < 0)
1018 		return -EINVAL;
1019 
1020 	memset(&v, 0, sizeof(v));
1021 
1022 	switch (optname) {
1023 	case SO_DEBUG:
1024 		v.val = sock_flag(sk, SOCK_DBG);
1025 		break;
1026 
1027 	case SO_DONTROUTE:
1028 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1029 		break;
1030 
1031 	case SO_BROADCAST:
1032 		v.val = sock_flag(sk, SOCK_BROADCAST);
1033 		break;
1034 
1035 	case SO_SNDBUF:
1036 		v.val = sk->sk_sndbuf;
1037 		break;
1038 
1039 	case SO_RCVBUF:
1040 		v.val = sk->sk_rcvbuf;
1041 		break;
1042 
1043 	case SO_REUSEADDR:
1044 		v.val = sk->sk_reuse;
1045 		break;
1046 
1047 	case SO_REUSEPORT:
1048 		v.val = sk->sk_reuseport;
1049 		break;
1050 
1051 	case SO_KEEPALIVE:
1052 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1053 		break;
1054 
1055 	case SO_TYPE:
1056 		v.val = sk->sk_type;
1057 		break;
1058 
1059 	case SO_PROTOCOL:
1060 		v.val = sk->sk_protocol;
1061 		break;
1062 
1063 	case SO_DOMAIN:
1064 		v.val = sk->sk_family;
1065 		break;
1066 
1067 	case SO_ERROR:
1068 		v.val = -sock_error(sk);
1069 		if (v.val == 0)
1070 			v.val = xchg(&sk->sk_err_soft, 0);
1071 		break;
1072 
1073 	case SO_OOBINLINE:
1074 		v.val = sock_flag(sk, SOCK_URGINLINE);
1075 		break;
1076 
1077 	case SO_NO_CHECK:
1078 		v.val = sk->sk_no_check_tx;
1079 		break;
1080 
1081 	case SO_PRIORITY:
1082 		v.val = sk->sk_priority;
1083 		break;
1084 
1085 	case SO_LINGER:
1086 		lv		= sizeof(v.ling);
1087 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1088 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1089 		break;
1090 
1091 	case SO_BSDCOMPAT:
1092 		sock_warn_obsolete_bsdism("getsockopt");
1093 		break;
1094 
1095 	case SO_TIMESTAMP:
1096 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1097 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1098 		break;
1099 
1100 	case SO_TIMESTAMPNS:
1101 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1102 		break;
1103 
1104 	case SO_TIMESTAMPING:
1105 		v.val = sk->sk_tsflags;
1106 		break;
1107 
1108 	case SO_RCVTIMEO:
1109 		lv = sizeof(struct timeval);
1110 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1111 			v.tm.tv_sec = 0;
1112 			v.tm.tv_usec = 0;
1113 		} else {
1114 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1115 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1116 		}
1117 		break;
1118 
1119 	case SO_SNDTIMEO:
1120 		lv = sizeof(struct timeval);
1121 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1122 			v.tm.tv_sec = 0;
1123 			v.tm.tv_usec = 0;
1124 		} else {
1125 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1126 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1127 		}
1128 		break;
1129 
1130 	case SO_RCVLOWAT:
1131 		v.val = sk->sk_rcvlowat;
1132 		break;
1133 
1134 	case SO_SNDLOWAT:
1135 		v.val = 1;
1136 		break;
1137 
1138 	case SO_PASSCRED:
1139 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1140 		break;
1141 
1142 	case SO_PEERCRED:
1143 	{
1144 		struct ucred peercred;
1145 		if (len > sizeof(peercred))
1146 			len = sizeof(peercred);
1147 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1148 		if (copy_to_user(optval, &peercred, len))
1149 			return -EFAULT;
1150 		goto lenout;
1151 	}
1152 
1153 	case SO_PEERNAME:
1154 	{
1155 		char address[128];
1156 
1157 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1158 			return -ENOTCONN;
1159 		if (lv < len)
1160 			return -EINVAL;
1161 		if (copy_to_user(optval, address, len))
1162 			return -EFAULT;
1163 		goto lenout;
1164 	}
1165 
1166 	/* Dubious BSD thing... Probably nobody even uses it, but
1167 	 * the UNIX standard wants it for whatever reason... -DaveM
1168 	 */
1169 	case SO_ACCEPTCONN:
1170 		v.val = sk->sk_state == TCP_LISTEN;
1171 		break;
1172 
1173 	case SO_PASSSEC:
1174 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1175 		break;
1176 
1177 	case SO_PEERSEC:
1178 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1179 
1180 	case SO_MARK:
1181 		v.val = sk->sk_mark;
1182 		break;
1183 
1184 	case SO_RXQ_OVFL:
1185 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1186 		break;
1187 
1188 	case SO_WIFI_STATUS:
1189 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1190 		break;
1191 
1192 	case SO_PEEK_OFF:
1193 		if (!sock->ops->set_peek_off)
1194 			return -EOPNOTSUPP;
1195 
1196 		v.val = sk->sk_peek_off;
1197 		break;
1198 	case SO_NOFCS:
1199 		v.val = sock_flag(sk, SOCK_NOFCS);
1200 		break;
1201 
1202 	case SO_BINDTODEVICE:
1203 		return sock_getbindtodevice(sk, optval, optlen, len);
1204 
1205 	case SO_GET_FILTER:
1206 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1207 		if (len < 0)
1208 			return len;
1209 
1210 		goto lenout;
1211 
1212 	case SO_LOCK_FILTER:
1213 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1214 		break;
1215 
1216 	case SO_BPF_EXTENSIONS:
1217 		v.val = bpf_tell_extensions();
1218 		break;
1219 
1220 	case SO_SELECT_ERR_QUEUE:
1221 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1222 		break;
1223 
1224 #ifdef CONFIG_NET_RX_BUSY_POLL
1225 	case SO_BUSY_POLL:
1226 		v.val = sk->sk_ll_usec;
1227 		break;
1228 #endif
1229 
1230 	case SO_MAX_PACING_RATE:
1231 		v.val = sk->sk_max_pacing_rate;
1232 		break;
1233 
1234 	default:
1235 		return -ENOPROTOOPT;
1236 	}
1237 
1238 	if (len > lv)
1239 		len = lv;
1240 	if (copy_to_user(optval, &v, len))
1241 		return -EFAULT;
1242 lenout:
1243 	if (put_user(len, optlen))
1244 		return -EFAULT;
1245 	return 0;
1246 }
1247 
1248 /*
1249  * Initialize an sk_lock.
1250  *
1251  * (We also register the sk_lock with the lock validator.)
1252  */
1253 static inline void sock_lock_init(struct sock *sk)
1254 {
1255 	sock_lock_init_class_and_name(sk,
1256 			af_family_slock_key_strings[sk->sk_family],
1257 			af_family_slock_keys + sk->sk_family,
1258 			af_family_key_strings[sk->sk_family],
1259 			af_family_keys + sk->sk_family);
1260 }
1261 
1262 /*
1263  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1264  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1265  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1266  */
1267 static void sock_copy(struct sock *nsk, const struct sock *osk)
1268 {
1269 #ifdef CONFIG_SECURITY_NETWORK
1270 	void *sptr = nsk->sk_security;
1271 #endif
1272 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1273 
1274 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1275 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1276 
1277 #ifdef CONFIG_SECURITY_NETWORK
1278 	nsk->sk_security = sptr;
1279 	security_sk_clone(osk, nsk);
1280 #endif
1281 }
1282 
1283 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1284 {
1285 	unsigned long nulls1, nulls2;
1286 
1287 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1288 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1289 	if (nulls1 > nulls2)
1290 		swap(nulls1, nulls2);
1291 
1292 	if (nulls1 != 0)
1293 		memset((char *)sk, 0, nulls1);
1294 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1295 	       nulls2 - nulls1 - sizeof(void *));
1296 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1297 	       size - nulls2 - sizeof(void *));
1298 }
1299 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1300 
1301 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1302 		int family)
1303 {
1304 	struct sock *sk;
1305 	struct kmem_cache *slab;
1306 
1307 	slab = prot->slab;
1308 	if (slab != NULL) {
1309 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1310 		if (!sk)
1311 			return sk;
1312 		if (priority & __GFP_ZERO) {
1313 			if (prot->clear_sk)
1314 				prot->clear_sk(sk, prot->obj_size);
1315 			else
1316 				sk_prot_clear_nulls(sk, prot->obj_size);
1317 		}
1318 	} else
1319 		sk = kmalloc(prot->obj_size, priority);
1320 
1321 	if (sk != NULL) {
1322 		kmemcheck_annotate_bitfield(sk, flags);
1323 
1324 		if (security_sk_alloc(sk, family, priority))
1325 			goto out_free;
1326 
1327 		if (!try_module_get(prot->owner))
1328 			goto out_free_sec;
1329 		sk_tx_queue_clear(sk);
1330 	}
1331 
1332 	return sk;
1333 
1334 out_free_sec:
1335 	security_sk_free(sk);
1336 out_free:
1337 	if (slab != NULL)
1338 		kmem_cache_free(slab, sk);
1339 	else
1340 		kfree(sk);
1341 	return NULL;
1342 }
1343 
1344 static void sk_prot_free(struct proto *prot, struct sock *sk)
1345 {
1346 	struct kmem_cache *slab;
1347 	struct module *owner;
1348 
1349 	owner = prot->owner;
1350 	slab = prot->slab;
1351 
1352 	security_sk_free(sk);
1353 	if (slab != NULL)
1354 		kmem_cache_free(slab, sk);
1355 	else
1356 		kfree(sk);
1357 	module_put(owner);
1358 }
1359 
1360 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1361 void sock_update_netprioidx(struct sock *sk)
1362 {
1363 	if (in_interrupt())
1364 		return;
1365 
1366 	sk->sk_cgrp_prioidx = task_netprioidx(current);
1367 }
1368 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1369 #endif
1370 
1371 /**
1372  *	sk_alloc - All socket objects are allocated here
1373  *	@net: the applicable net namespace
1374  *	@family: protocol family
1375  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1376  *	@prot: struct proto associated with this new sock instance
1377  */
1378 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1379 		      struct proto *prot)
1380 {
1381 	struct sock *sk;
1382 
1383 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1384 	if (sk) {
1385 		sk->sk_family = family;
1386 		/*
1387 		 * See comment in struct sock definition to understand
1388 		 * why we need sk_prot_creator -acme
1389 		 */
1390 		sk->sk_prot = sk->sk_prot_creator = prot;
1391 		sock_lock_init(sk);
1392 		sock_net_set(sk, get_net(net));
1393 		atomic_set(&sk->sk_wmem_alloc, 1);
1394 
1395 		sock_update_classid(sk);
1396 		sock_update_netprioidx(sk);
1397 	}
1398 
1399 	return sk;
1400 }
1401 EXPORT_SYMBOL(sk_alloc);
1402 
1403 static void __sk_free(struct sock *sk)
1404 {
1405 	struct sk_filter *filter;
1406 
1407 	if (sk->sk_destruct)
1408 		sk->sk_destruct(sk);
1409 
1410 	filter = rcu_dereference_check(sk->sk_filter,
1411 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1412 	if (filter) {
1413 		sk_filter_uncharge(sk, filter);
1414 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1415 	}
1416 
1417 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1418 
1419 	if (atomic_read(&sk->sk_omem_alloc))
1420 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1421 			 __func__, atomic_read(&sk->sk_omem_alloc));
1422 
1423 	if (sk->sk_frag.page) {
1424 		put_page(sk->sk_frag.page);
1425 		sk->sk_frag.page = NULL;
1426 	}
1427 
1428 	if (sk->sk_peer_cred)
1429 		put_cred(sk->sk_peer_cred);
1430 	put_pid(sk->sk_peer_pid);
1431 	put_net(sock_net(sk));
1432 	sk_prot_free(sk->sk_prot_creator, sk);
1433 }
1434 
1435 void sk_free(struct sock *sk)
1436 {
1437 	/*
1438 	 * We subtract one from sk_wmem_alloc and can know if
1439 	 * some packets are still in some tx queue.
1440 	 * If not null, sock_wfree() will call __sk_free(sk) later
1441 	 */
1442 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1443 		__sk_free(sk);
1444 }
1445 EXPORT_SYMBOL(sk_free);
1446 
1447 /*
1448  * Last sock_put should drop reference to sk->sk_net. It has already
1449  * been dropped in sk_change_net. Taking reference to stopping namespace
1450  * is not an option.
1451  * Take reference to a socket to remove it from hash _alive_ and after that
1452  * destroy it in the context of init_net.
1453  */
1454 void sk_release_kernel(struct sock *sk)
1455 {
1456 	if (sk == NULL || sk->sk_socket == NULL)
1457 		return;
1458 
1459 	sock_hold(sk);
1460 	sock_release(sk->sk_socket);
1461 	release_net(sock_net(sk));
1462 	sock_net_set(sk, get_net(&init_net));
1463 	sock_put(sk);
1464 }
1465 EXPORT_SYMBOL(sk_release_kernel);
1466 
1467 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1468 {
1469 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1470 		sock_update_memcg(newsk);
1471 }
1472 
1473 /**
1474  *	sk_clone_lock - clone a socket, and lock its clone
1475  *	@sk: the socket to clone
1476  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1477  *
1478  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1479  */
1480 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1481 {
1482 	struct sock *newsk;
1483 	bool is_charged = true;
1484 
1485 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1486 	if (newsk != NULL) {
1487 		struct sk_filter *filter;
1488 
1489 		sock_copy(newsk, sk);
1490 
1491 		newsk->sk_prot_creator = sk->sk_prot;
1492 
1493 		/* SANITY */
1494 		get_net(sock_net(newsk));
1495 		sk_node_init(&newsk->sk_node);
1496 		sock_lock_init(newsk);
1497 		bh_lock_sock(newsk);
1498 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1499 		newsk->sk_backlog.len = 0;
1500 
1501 		atomic_set(&newsk->sk_rmem_alloc, 0);
1502 		/*
1503 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1504 		 */
1505 		atomic_set(&newsk->sk_wmem_alloc, 1);
1506 		atomic_set(&newsk->sk_omem_alloc, 0);
1507 		skb_queue_head_init(&newsk->sk_receive_queue);
1508 		skb_queue_head_init(&newsk->sk_write_queue);
1509 
1510 		spin_lock_init(&newsk->sk_dst_lock);
1511 		rwlock_init(&newsk->sk_callback_lock);
1512 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1513 				af_callback_keys + newsk->sk_family,
1514 				af_family_clock_key_strings[newsk->sk_family]);
1515 
1516 		newsk->sk_dst_cache	= NULL;
1517 		newsk->sk_wmem_queued	= 0;
1518 		newsk->sk_forward_alloc = 0;
1519 		newsk->sk_send_head	= NULL;
1520 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1521 
1522 		sock_reset_flag(newsk, SOCK_DONE);
1523 		skb_queue_head_init(&newsk->sk_error_queue);
1524 
1525 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1526 		if (filter != NULL)
1527 			/* though it's an empty new sock, the charging may fail
1528 			 * if sysctl_optmem_max was changed between creation of
1529 			 * original socket and cloning
1530 			 */
1531 			is_charged = sk_filter_charge(newsk, filter);
1532 
1533 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1534 			/* We need to make sure that we don't uncharge the new
1535 			 * socket if we couldn't charge it in the first place
1536 			 * as otherwise we uncharge the parent's filter.
1537 			 */
1538 			if (!is_charged)
1539 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1540 			/* It is still a raw copy of the parent, so invalidate
1541 			 * its destructor and do a plain sk_free() */
1542 			newsk->sk_destruct = NULL;
1543 			bh_unlock_sock(newsk);
1544 			sk_free(newsk);
1545 			newsk = NULL;
1546 			goto out;
1547 		}
1548 
1549 		newsk->sk_err	   = 0;
1550 		newsk->sk_priority = 0;
1551 		/*
1552 		 * Before updating sk_refcnt, we must commit prior changes to memory
1553 		 * (Documentation/RCU/rculist_nulls.txt for details)
1554 		 */
1555 		smp_wmb();
1556 		atomic_set(&newsk->sk_refcnt, 2);
1557 
1558 		/*
1559 		 * Increment the counter in the same struct proto as the master
1560 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1561 		 * is the same as sk->sk_prot->socks, as this field was copied
1562 		 * with memcpy).
1563 		 *
1564 		 * This _changes_ the previous behaviour, where
1565 		 * tcp_create_openreq_child always was incrementing the
1566 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1567 		 * to be taken into account in all callers. -acme
1568 		 */
1569 		sk_refcnt_debug_inc(newsk);
1570 		sk_set_socket(newsk, NULL);
1571 		newsk->sk_wq = NULL;
1572 
1573 		sk_update_clone(sk, newsk);
1574 
1575 		if (newsk->sk_prot->sockets_allocated)
1576 			sk_sockets_allocated_inc(newsk);
1577 
1578 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1579 			net_enable_timestamp();
1580 	}
1581 out:
1582 	return newsk;
1583 }
1584 EXPORT_SYMBOL_GPL(sk_clone_lock);
1585 
1586 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1587 {
1588 	__sk_dst_set(sk, dst);
1589 	sk->sk_route_caps = dst->dev->features;
1590 	if (sk->sk_route_caps & NETIF_F_GSO)
1591 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1592 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1593 	if (sk_can_gso(sk)) {
1594 		if (dst->header_len) {
1595 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1596 		} else {
1597 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1598 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1599 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1600 		}
1601 	}
1602 }
1603 EXPORT_SYMBOL_GPL(sk_setup_caps);
1604 
1605 /*
1606  *	Simple resource managers for sockets.
1607  */
1608 
1609 
1610 /*
1611  * Write buffer destructor automatically called from kfree_skb.
1612  */
1613 void sock_wfree(struct sk_buff *skb)
1614 {
1615 	struct sock *sk = skb->sk;
1616 	unsigned int len = skb->truesize;
1617 
1618 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1619 		/*
1620 		 * Keep a reference on sk_wmem_alloc, this will be released
1621 		 * after sk_write_space() call
1622 		 */
1623 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1624 		sk->sk_write_space(sk);
1625 		len = 1;
1626 	}
1627 	/*
1628 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1629 	 * could not do because of in-flight packets
1630 	 */
1631 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1632 		__sk_free(sk);
1633 }
1634 EXPORT_SYMBOL(sock_wfree);
1635 
1636 void skb_orphan_partial(struct sk_buff *skb)
1637 {
1638 	if (skb->destructor == sock_wfree
1639 #ifdef CONFIG_INET
1640 	    || skb->destructor == tcp_wfree
1641 #endif
1642 		) {
1643 		struct sock *sk = skb->sk;
1644 
1645 		if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1646 			atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1647 			skb->destructor = sock_efree;
1648 		}
1649 	} else {
1650 		skb_orphan(skb);
1651 	}
1652 }
1653 EXPORT_SYMBOL(skb_orphan_partial);
1654 
1655 /*
1656  * Read buffer destructor automatically called from kfree_skb.
1657  */
1658 void sock_rfree(struct sk_buff *skb)
1659 {
1660 	struct sock *sk = skb->sk;
1661 	unsigned int len = skb->truesize;
1662 
1663 	atomic_sub(len, &sk->sk_rmem_alloc);
1664 	sk_mem_uncharge(sk, len);
1665 }
1666 EXPORT_SYMBOL(sock_rfree);
1667 
1668 void sock_efree(struct sk_buff *skb)
1669 {
1670 	sock_put(skb->sk);
1671 }
1672 EXPORT_SYMBOL(sock_efree);
1673 
1674 #ifdef CONFIG_INET
1675 void sock_edemux(struct sk_buff *skb)
1676 {
1677 	struct sock *sk = skb->sk;
1678 
1679 	if (sk->sk_state == TCP_TIME_WAIT)
1680 		inet_twsk_put(inet_twsk(sk));
1681 	else
1682 		sock_put(sk);
1683 }
1684 EXPORT_SYMBOL(sock_edemux);
1685 #endif
1686 
1687 kuid_t sock_i_uid(struct sock *sk)
1688 {
1689 	kuid_t uid;
1690 
1691 	read_lock_bh(&sk->sk_callback_lock);
1692 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1693 	read_unlock_bh(&sk->sk_callback_lock);
1694 	return uid;
1695 }
1696 EXPORT_SYMBOL(sock_i_uid);
1697 
1698 unsigned long sock_i_ino(struct sock *sk)
1699 {
1700 	unsigned long ino;
1701 
1702 	read_lock_bh(&sk->sk_callback_lock);
1703 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1704 	read_unlock_bh(&sk->sk_callback_lock);
1705 	return ino;
1706 }
1707 EXPORT_SYMBOL(sock_i_ino);
1708 
1709 /*
1710  * Allocate a skb from the socket's send buffer.
1711  */
1712 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1713 			     gfp_t priority)
1714 {
1715 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1716 		struct sk_buff *skb = alloc_skb(size, priority);
1717 		if (skb) {
1718 			skb_set_owner_w(skb, sk);
1719 			return skb;
1720 		}
1721 	}
1722 	return NULL;
1723 }
1724 EXPORT_SYMBOL(sock_wmalloc);
1725 
1726 /*
1727  * Allocate a memory block from the socket's option memory buffer.
1728  */
1729 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1730 {
1731 	if ((unsigned int)size <= sysctl_optmem_max &&
1732 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1733 		void *mem;
1734 		/* First do the add, to avoid the race if kmalloc
1735 		 * might sleep.
1736 		 */
1737 		atomic_add(size, &sk->sk_omem_alloc);
1738 		mem = kmalloc(size, priority);
1739 		if (mem)
1740 			return mem;
1741 		atomic_sub(size, &sk->sk_omem_alloc);
1742 	}
1743 	return NULL;
1744 }
1745 EXPORT_SYMBOL(sock_kmalloc);
1746 
1747 /*
1748  * Free an option memory block.
1749  */
1750 void sock_kfree_s(struct sock *sk, void *mem, int size)
1751 {
1752 	if (WARN_ON_ONCE(!mem))
1753 		return;
1754 	kfree(mem);
1755 	atomic_sub(size, &sk->sk_omem_alloc);
1756 }
1757 EXPORT_SYMBOL(sock_kfree_s);
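/*
 * Illustrative pairing (sketch, hypothetical protocol code): memory charged
 * to sk_omem_alloc by sock_kmalloc() must be released with sock_kfree_s()
 * so the per-socket option-memory accounting stays balanced:
 *
 *	struct foo_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */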
1758 
1759 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1760    I think, these locks should be removed for datagram sockets.
1761  */
1762 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1763 {
1764 	DEFINE_WAIT(wait);
1765 
1766 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1767 	for (;;) {
1768 		if (!timeo)
1769 			break;
1770 		if (signal_pending(current))
1771 			break;
1772 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1773 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1774 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1775 			break;
1776 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1777 			break;
1778 		if (sk->sk_err)
1779 			break;
1780 		timeo = schedule_timeout(timeo);
1781 	}
1782 	finish_wait(sk_sleep(sk), &wait);
1783 	return timeo;
1784 }
1785 
1786 
1787 /*
1788  *	Generic send/receive buffer handlers
1789  */
1790 
1791 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1792 				     unsigned long data_len, int noblock,
1793 				     int *errcode, int max_page_order)
1794 {
1795 	struct sk_buff *skb;
1796 	long timeo;
1797 	int err;
1798 
1799 	timeo = sock_sndtimeo(sk, noblock);
1800 	for (;;) {
1801 		err = sock_error(sk);
1802 		if (err != 0)
1803 			goto failure;
1804 
1805 		err = -EPIPE;
1806 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1807 			goto failure;
1808 
1809 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1810 			break;
1811 
1812 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1813 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1814 		err = -EAGAIN;
1815 		if (!timeo)
1816 			goto failure;
1817 		if (signal_pending(current))
1818 			goto interrupted;
1819 		timeo = sock_wait_for_wmem(sk, timeo);
1820 	}
1821 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1822 				   errcode, sk->sk_allocation);
1823 	if (skb)
1824 		skb_set_owner_w(skb, sk);
1825 	return skb;
1826 
1827 interrupted:
1828 	err = sock_intr_errno(timeo);
1829 failure:
1830 	*errcode = err;
1831 	return NULL;
1832 }
1833 EXPORT_SYMBOL(sock_alloc_send_pskb);
1834 
1835 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1836 				    int noblock, int *errcode)
1837 {
1838 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1839 }
1840 EXPORT_SYMBOL(sock_alloc_send_skb);
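
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * a datagram sendmsg() implementation typically reserves link-layer
 * headroom and lets this helper block, subject to the send timeout,
 * until enough write memory is available:
 *
 *	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *
 * "dev", "len", "msg" and "err" are assumed to come from the caller's
 * context.
 */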
1841 
1842 /* On 32bit arches, an skb frag is limited to 2^15 bytes */
1843 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1844 
1845 /**
1846  * skb_page_frag_refill - check that a page_frag contains enough room
1847  * @sz: minimum size of the fragment we want to get
1848  * @pfrag: pointer to page_frag
1849  * @gfp: priority for memory allocation
1850  *
1851  * Note: While this allocator tries to use high order pages, there is
1852  * no guarantee that allocations succeed. Therefore, @sz MUST be
1853  * less than or equal to PAGE_SIZE.
1854  */
1855 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1856 {
1857 	if (pfrag->page) {
1858 		if (atomic_read(&pfrag->page->_count) == 1) {
1859 			pfrag->offset = 0;
1860 			return true;
1861 		}
1862 		if (pfrag->offset + sz <= pfrag->size)
1863 			return true;
1864 		put_page(pfrag->page);
1865 	}
1866 
1867 	pfrag->offset = 0;
1868 	if (SKB_FRAG_PAGE_ORDER) {
1869 		pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1870 					  __GFP_NOWARN | __GFP_NORETRY,
1871 					  SKB_FRAG_PAGE_ORDER);
1872 		if (likely(pfrag->page)) {
1873 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1874 			return true;
1875 		}
1876 	}
1877 	pfrag->page = alloc_page(gfp);
1878 	if (likely(pfrag->page)) {
1879 		pfrag->size = PAGE_SIZE;
1880 		return true;
1881 	}
1882 	return false;
1883 }
1884 EXPORT_SYMBOL(skb_page_frag_refill);
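
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * callers refill the page_frag and then carve a chunk out of it, taking
 * their own page reference and advancing the offset:
 *
 *	if (!skb_page_frag_refill(copy, pfrag, sk->sk_allocation))
 *		goto wait_for_memory;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	get_page(pfrag->page);
 *	pfrag->offset += copy;
 *
 * "copy" (<= PAGE_SIZE) and "data" are assumed to be supplied by the
 * caller; real users typically copy from an iovec rather than a flat
 * buffer.
 */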
1885 
1886 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1887 {
1888 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1889 		return true;
1890 
1891 	sk_enter_memory_pressure(sk);
1892 	sk_stream_moderate_sndbuf(sk);
1893 	return false;
1894 }
1895 EXPORT_SYMBOL(sk_page_frag_refill);
1896 
1897 static void __lock_sock(struct sock *sk)
1898 	__releases(&sk->sk_lock.slock)
1899 	__acquires(&sk->sk_lock.slock)
1900 {
1901 	DEFINE_WAIT(wait);
1902 
1903 	for (;;) {
1904 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1905 					TASK_UNINTERRUPTIBLE);
1906 		spin_unlock_bh(&sk->sk_lock.slock);
1907 		schedule();
1908 		spin_lock_bh(&sk->sk_lock.slock);
1909 		if (!sock_owned_by_user(sk))
1910 			break;
1911 	}
1912 	finish_wait(&sk->sk_lock.wq, &wait);
1913 }
1914 
1915 static void __release_sock(struct sock *sk)
1916 	__releases(&sk->sk_lock.slock)
1917 	__acquires(&sk->sk_lock.slock)
1918 {
1919 	struct sk_buff *skb = sk->sk_backlog.head;
1920 
1921 	do {
1922 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1923 		bh_unlock_sock(sk);
1924 
1925 		do {
1926 			struct sk_buff *next = skb->next;
1927 
1928 			prefetch(next);
1929 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1930 			skb->next = NULL;
1931 			sk_backlog_rcv(sk, skb);
1932 
1933 			/*
1934 			 * We are in process context here with softirqs
1935 			 * disabled, use cond_resched_softirq() to preempt.
1936 			 * This is safe to do because we've taken the backlog
1937 			 * queue private:
1938 			 */
1939 			cond_resched_softirq();
1940 
1941 			skb = next;
1942 		} while (skb != NULL);
1943 
1944 		bh_lock_sock(sk);
1945 	} while ((skb = sk->sk_backlog.head) != NULL);
1946 
1947 	/*
1948 	 * Doing the zeroing here guarantees we cannot loop forever
1949 	 * while a wild producer attempts to flood us.
1950 	 */
1951 	sk->sk_backlog.len = 0;
1952 }
1953 
1954 /**
1955  * sk_wait_data - wait for data to arrive at sk_receive_queue
1956  * @sk:    sock to wait on
1957  * @timeo: for how long
1958  *
1959  * Socket state, including sk->sk_err, is changed only under the socket
1960  * lock, hence we may omit checks after joining the wait queue.
1961  * We check the receive queue before schedule() only as an optimization;
1962  * it is very likely that release_sock() added new data.
1963  */
1964 int sk_wait_data(struct sock *sk, long *timeo)
1965 {
1966 	int rc;
1967 	DEFINE_WAIT(wait);
1968 
1969 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1970 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1971 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1972 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1973 	finish_wait(sk_sleep(sk), &wait);
1974 	return rc;
1975 }
1976 EXPORT_SYMBOL(sk_wait_data);
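
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * a blocking recvmsg() built on sk_wait_data() runs with the socket
 * lock held, so the queue check and the wait cannot race with the
 * receive path:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err || !timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 */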
1977 
1978 /**
1979  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1980  *	@sk: socket
1981  *	@size: memory size to allocate
1982  *	@kind: allocation type
1983  *
1984  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1985  *	rmem allocation. This function assumes that protocols which have
1986  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1987  */
1988 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1989 {
1990 	struct proto *prot = sk->sk_prot;
1991 	int amt = sk_mem_pages(size);
1992 	long allocated;
1993 	int parent_status = UNDER_LIMIT;
1994 
1995 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1996 
1997 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1998 
1999 	/* Under limit. */
2000 	if (parent_status == UNDER_LIMIT &&
2001 			allocated <= sk_prot_mem_limits(sk, 0)) {
2002 		sk_leave_memory_pressure(sk);
2003 		return 1;
2004 	}
2005 
2006 	/* Under pressure. (we or our parents) */
2007 	if ((parent_status > SOFT_LIMIT) ||
2008 			allocated > sk_prot_mem_limits(sk, 1))
2009 		sk_enter_memory_pressure(sk);
2010 
2011 	/* Over hard limit (we or our parents) */
2012 	if ((parent_status == OVER_LIMIT) ||
2013 			(allocated > sk_prot_mem_limits(sk, 2)))
2014 		goto suppress_allocation;
2015 
2016 	/* guarantee minimum buffer size under pressure */
2017 	if (kind == SK_MEM_RECV) {
2018 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2019 			return 1;
2020 
2021 	} else { /* SK_MEM_SEND */
2022 		if (sk->sk_type == SOCK_STREAM) {
2023 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2024 				return 1;
2025 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2026 			   prot->sysctl_wmem[0])
2027 				return 1;
2028 	}
2029 
2030 	if (sk_has_memory_pressure(sk)) {
2031 		int alloc;
2032 
2033 		if (!sk_under_memory_pressure(sk))
2034 			return 1;
2035 		alloc = sk_sockets_allocated_read_positive(sk);
2036 		if (sk_prot_mem_limits(sk, 2) > alloc *
2037 		    sk_mem_pages(sk->sk_wmem_queued +
2038 				 atomic_read(&sk->sk_rmem_alloc) +
2039 				 sk->sk_forward_alloc))
2040 			return 1;
2041 	}
2042 
2043 suppress_allocation:
2044 
2045 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2046 		sk_stream_moderate_sndbuf(sk);
2047 
2048 		/* Fail only if socket is _under_ its sndbuf.
2049 		 * In this case we cannot block, so we have to fail.
2050 		 */
2051 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2052 			return 1;
2053 	}
2054 
2055 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2056 
2057 	/* Alas. Undo changes. */
2058 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2059 
2060 	sk_memory_allocated_sub(sk, amt);
2061 
2062 	return 0;
2063 }
2064 EXPORT_SYMBOL(__sk_mem_schedule);
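
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * protocols normally charge memory through the sk_rmem_schedule() /
 * sk_wmem_schedule() wrappers in include/net/sock.h, which only fall
 * back to __sk_mem_schedule() when sk_forward_alloc cannot already
 * cover the request:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;
 */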
2065 
2066 /**
2067  *	__sk_mem_reclaim - reclaim memory_allocated
2068  *	@sk: socket
2069  */
2070 void __sk_mem_reclaim(struct sock *sk)
2071 {
2072 	sk_memory_allocated_sub(sk,
2073 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2074 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2075 
2076 	if (sk_under_memory_pressure(sk) &&
2077 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2078 		sk_leave_memory_pressure(sk);
2079 }
2080 EXPORT_SYMBOL(__sk_mem_reclaim);
2081 
2082 
2083 /*
2084  * Set of default routines for initialising struct proto_ops when
2085  * the protocol does not support a particular function. In certain
2086  * cases where it makes no sense for a protocol to have a "do nothing"
2087  * function, some default processing is provided.
2088  */
2089 
2090 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2091 {
2092 	return -EOPNOTSUPP;
2093 }
2094 EXPORT_SYMBOL(sock_no_bind);
2095 
2096 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2097 		    int len, int flags)
2098 {
2099 	return -EOPNOTSUPP;
2100 }
2101 EXPORT_SYMBOL(sock_no_connect);
2102 
2103 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2104 {
2105 	return -EOPNOTSUPP;
2106 }
2107 EXPORT_SYMBOL(sock_no_socketpair);
2108 
2109 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2110 {
2111 	return -EOPNOTSUPP;
2112 }
2113 EXPORT_SYMBOL(sock_no_accept);
2114 
2115 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2116 		    int *len, int peer)
2117 {
2118 	return -EOPNOTSUPP;
2119 }
2120 EXPORT_SYMBOL(sock_no_getname);
2121 
2122 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2123 {
2124 	return 0;
2125 }
2126 EXPORT_SYMBOL(sock_no_poll);
2127 
2128 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2129 {
2130 	return -EOPNOTSUPP;
2131 }
2132 EXPORT_SYMBOL(sock_no_ioctl);
2133 
2134 int sock_no_listen(struct socket *sock, int backlog)
2135 {
2136 	return -EOPNOTSUPP;
2137 }
2138 EXPORT_SYMBOL(sock_no_listen);
2139 
2140 int sock_no_shutdown(struct socket *sock, int how)
2141 {
2142 	return -EOPNOTSUPP;
2143 }
2144 EXPORT_SYMBOL(sock_no_shutdown);
2145 
2146 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2147 		    char __user *optval, unsigned int optlen)
2148 {
2149 	return -EOPNOTSUPP;
2150 }
2151 EXPORT_SYMBOL(sock_no_setsockopt);
2152 
2153 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2154 		    char __user *optval, int __user *optlen)
2155 {
2156 	return -EOPNOTSUPP;
2157 }
2158 EXPORT_SYMBOL(sock_no_getsockopt);
2159 
2160 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2161 		    size_t len)
2162 {
2163 	return -EOPNOTSUPP;
2164 }
2165 EXPORT_SYMBOL(sock_no_sendmsg);
2166 
2167 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2168 		    size_t len, int flags)
2169 {
2170 	return -EOPNOTSUPP;
2171 }
2172 EXPORT_SYMBOL(sock_no_recvmsg);
2173 
2174 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2175 {
2176 	/* Mirror missing mmap method error code */
2177 	return -ENODEV;
2178 }
2179 EXPORT_SYMBOL(sock_no_mmap);
2180 
2181 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2182 {
2183 	ssize_t res;
2184 	struct msghdr msg = {.msg_flags = flags};
2185 	struct kvec iov;
2186 	char *kaddr = kmap(page);
2187 	iov.iov_base = kaddr + offset;
2188 	iov.iov_len = size;
2189 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2190 	kunmap(page);
2191 	return res;
2192 }
2193 EXPORT_SYMBOL(sock_no_sendpage);
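
/*
 * Usage sketch (editorial illustration, not part of the original file;
 * "example" names are made up): a protocol family that does not support
 * a given operation plugs the matching sock_no_*() stub into its
 * proto_ops instead of leaving the pointer NULL:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 *
 * Callers then get -EOPNOTSUPP (or, for sendpage, the sendmsg-based
 * fallback above) rather than an oops on a NULL method.
 */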
2194 
2195 /*
2196  *	Default Socket Callbacks
2197  */
2198 
2199 static void sock_def_wakeup(struct sock *sk)
2200 {
2201 	struct socket_wq *wq;
2202 
2203 	rcu_read_lock();
2204 	wq = rcu_dereference(sk->sk_wq);
2205 	if (wq_has_sleeper(wq))
2206 		wake_up_interruptible_all(&wq->wait);
2207 	rcu_read_unlock();
2208 }
2209 
2210 static void sock_def_error_report(struct sock *sk)
2211 {
2212 	struct socket_wq *wq;
2213 
2214 	rcu_read_lock();
2215 	wq = rcu_dereference(sk->sk_wq);
2216 	if (wq_has_sleeper(wq))
2217 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2218 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2219 	rcu_read_unlock();
2220 }
2221 
2222 static void sock_def_readable(struct sock *sk)
2223 {
2224 	struct socket_wq *wq;
2225 
2226 	rcu_read_lock();
2227 	wq = rcu_dereference(sk->sk_wq);
2228 	if (wq_has_sleeper(wq))
2229 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2230 						POLLRDNORM | POLLRDBAND);
2231 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2232 	rcu_read_unlock();
2233 }
2234 
2235 static void sock_def_write_space(struct sock *sk)
2236 {
2237 	struct socket_wq *wq;
2238 
2239 	rcu_read_lock();
2240 
2241 	/* Do not wake up a writer until he can make "significant"
2242 	 * progress.  --DaveM
2243 	 */
2244 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2245 		wq = rcu_dereference(sk->sk_wq);
2246 		if (wq_has_sleeper(wq))
2247 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2248 						POLLWRNORM | POLLWRBAND);
2249 
2250 		/* Should agree with poll, otherwise some programs break */
2251 		if (sock_writeable(sk))
2252 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2253 	}
2254 
2255 	rcu_read_unlock();
2256 }
2257 
2258 static void sock_def_destruct(struct sock *sk)
2259 {
2260 	kfree(sk->sk_protinfo);
2261 }
2262 
2263 void sk_send_sigurg(struct sock *sk)
2264 {
2265 	if (sk->sk_socket && sk->sk_socket->file)
2266 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2267 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2268 }
2269 EXPORT_SYMBOL(sk_send_sigurg);
2270 
2271 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2272 		    unsigned long expires)
2273 {
2274 	if (!mod_timer(timer, expires))
2275 		sock_hold(sk);
2276 }
2277 EXPORT_SYMBOL(sk_reset_timer);
2278 
2279 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
2280 {
2281 	if (del_timer(timer))
2282 		__sock_put(sk);
2283 }
2284 EXPORT_SYMBOL(sk_stop_timer);
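
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * these helpers keep the socket refcount in step with a pending timer,
 * so the sock cannot be freed while its timer is armed:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 *
 * sk_stop_timer() drops the reference only if the timer was still
 * pending; a callback that has already started is expected to call
 * sock_put() itself when it finishes. "delay" is a caller-supplied
 * value in jiffies.
 */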
2285 
2286 void sock_init_data(struct socket *sock, struct sock *sk)
2287 {
2288 	skb_queue_head_init(&sk->sk_receive_queue);
2289 	skb_queue_head_init(&sk->sk_write_queue);
2290 	skb_queue_head_init(&sk->sk_error_queue);
2291 
2292 	sk->sk_send_head	=	NULL;
2293 
2294 	init_timer(&sk->sk_timer);
2295 
2296 	sk->sk_allocation	=	GFP_KERNEL;
2297 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2298 	sk->sk_sndbuf		=	sysctl_wmem_default;
2299 	sk->sk_state		=	TCP_CLOSE;
2300 	sk_set_socket(sk, sock);
2301 
2302 	sock_set_flag(sk, SOCK_ZAPPED);
2303 
2304 	if (sock) {
2305 		sk->sk_type	=	sock->type;
2306 		sk->sk_wq	=	sock->wq;
2307 		sock->sk	=	sk;
2308 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2309 	} else {
2310 		sk->sk_wq	=	NULL;
2311 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2312 	}
2313 
2314 	spin_lock_init(&sk->sk_dst_lock);
2315 	rwlock_init(&sk->sk_callback_lock);
2316 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2317 			af_callback_keys + sk->sk_family,
2318 			af_family_clock_key_strings[sk->sk_family]);
2319 
2320 	sk->sk_state_change	=	sock_def_wakeup;
2321 	sk->sk_data_ready	=	sock_def_readable;
2322 	sk->sk_write_space	=	sock_def_write_space;
2323 	sk->sk_error_report	=	sock_def_error_report;
2324 	sk->sk_destruct		=	sock_def_destruct;
2325 
2326 	sk->sk_frag.page	=	NULL;
2327 	sk->sk_frag.offset	=	0;
2328 	sk->sk_peek_off		=	-1;
2329 
2330 	sk->sk_peer_pid 	=	NULL;
2331 	sk->sk_peer_cred	=	NULL;
2332 	sk->sk_write_pending	=	0;
2333 	sk->sk_rcvlowat		=	1;
2334 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2335 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2336 
2337 	sk->sk_stamp = ktime_set(-1L, 0);
2338 
2339 #ifdef CONFIG_NET_RX_BUSY_POLL
2340 	sk->sk_napi_id		=	0;
2341 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2342 #endif
2343 
2344 	sk->sk_max_pacing_rate = ~0U;
2345 	sk->sk_pacing_rate = ~0U;
2346 	/*
2347 	 * Before updating sk_refcnt, we must commit prior changes to memory
2348 	 * (Documentation/RCU/rculist_nulls.txt for details)
2349 	 */
2350 	smp_wmb();
2351 	atomic_set(&sk->sk_refcnt, 1);
2352 	atomic_set(&sk->sk_drops, 0);
2353 }
2354 EXPORT_SYMBOL(sock_init_data);
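
/*
 * Usage sketch (editorial illustration, not part of the original file;
 * "example" names are made up): a protocol's create() hook typically
 * allocates the sock, applies the defaults above, then overrides the
 * callbacks and fields it cares about:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = example_destruct;
 *	sk->sk_protocol = protocol;
 *
 * where example_destruct replaces the sock_def_destruct default set
 * above.
 */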
2355 
2356 void lock_sock_nested(struct sock *sk, int subclass)
2357 {
2358 	might_sleep();
2359 	spin_lock_bh(&sk->sk_lock.slock);
2360 	if (sk->sk_lock.owned)
2361 		__lock_sock(sk);
2362 	sk->sk_lock.owned = 1;
2363 	spin_unlock(&sk->sk_lock.slock);
2364 	/*
2365 	 * The sk_lock has mutex_lock() semantics here:
2366 	 */
2367 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2368 	local_bh_enable();
2369 }
2370 EXPORT_SYMBOL(lock_sock_nested);
2371 
2372 void release_sock(struct sock *sk)
2373 {
2374 	/*
2375 	 * The sk_lock has mutex_unlock() semantics:
2376 	 */
2377 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2378 
2379 	spin_lock_bh(&sk->sk_lock.slock);
2380 	if (sk->sk_backlog.tail)
2381 		__release_sock(sk);
2382 
2383 	/* Warning: release_cb() might need to release sk ownership,
2384 	 * i.e. call sock_release_ownership(sk) before us.
2385 	 */
2386 	if (sk->sk_prot->release_cb)
2387 		sk->sk_prot->release_cb(sk);
2388 
2389 	sock_release_ownership(sk);
2390 	if (waitqueue_active(&sk->sk_lock.wq))
2391 		wake_up(&sk->sk_lock.wq);
2392 	spin_unlock_bh(&sk->sk_lock.slock);
2393 }
2394 EXPORT_SYMBOL(release_sock);
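
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * process-context code brackets its work with lock_sock()/release_sock();
 * packets arriving from softirq context while the lock is owned are put
 * on the backlog and replayed by __release_sock() above:
 *
 *	lock_sock(sk);
 *	err = example_do_something(sk);
 *	release_sock(sk);
 *
 * "example_do_something" is a made-up stand-in for whatever must run
 * under the socket lock.
 */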
2395 
2396 /**
2397  * lock_sock_fast - fast version of lock_sock
2398  * @sk: socket
2399  *
2400  * This version should be used for very small sections, where the process won't block.
2401  * Returns false if the fast path is taken:
2402  *   sk_lock.slock locked, owned = 0, BH disabled
2403  * Returns true if the slow path is taken:
2404  *   sk_lock.slock unlocked, owned = 1, BH enabled
2405  */
2406 bool lock_sock_fast(struct sock *sk)
2407 {
2408 	might_sleep();
2409 	spin_lock_bh(&sk->sk_lock.slock);
2410 
2411 	if (!sk->sk_lock.owned)
2412 		/*
2413 		 * Note: fast path - we return with BH disabled and sk_lock.slock held
2414 		 */
2415 		return false;
2416 
2417 	__lock_sock(sk);
2418 	sk->sk_lock.owned = 1;
2419 	spin_unlock(&sk->sk_lock.slock);
2420 	/*
2421 	 * The sk_lock has mutex_lock() semantics here:
2422 	 */
2423 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2424 	local_bh_enable();
2425 	return true;
2426 }
2427 EXPORT_SYMBOL(lock_sock_fast);
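
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * the caller must remember which path was taken and pass it back to
 * unlock_sock_fast(), which releases either the spinlock (fast path)
 * or the full socket lock (slow path):
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	example_short_critical_section(sk);
 *	unlock_sock_fast(sk, slow);
 *
 * "example_short_critical_section" is a made-up placeholder for a short
 * non-blocking operation.
 */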
2428 
2429 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2430 {
2431 	struct timeval tv;
2432 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2433 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2434 	tv = ktime_to_timeval(sk->sk_stamp);
2435 	if (tv.tv_sec == -1)
2436 		return -ENOENT;
2437 	if (tv.tv_sec == 0) {
2438 		sk->sk_stamp = ktime_get_real();
2439 		tv = ktime_to_timeval(sk->sk_stamp);
2440 	}
2441 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2442 }
2443 EXPORT_SYMBOL(sock_get_timestamp);
2444 
2445 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2446 {
2447 	struct timespec ts;
2448 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2449 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2450 	ts = ktime_to_timespec(sk->sk_stamp);
2451 	if (ts.tv_sec == -1)
2452 		return -ENOENT;
2453 	if (ts.tv_sec == 0) {
2454 		sk->sk_stamp = ktime_get_real();
2455 		ts = ktime_to_timespec(sk->sk_stamp);
2456 	}
2457 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2458 }
2459 EXPORT_SYMBOL(sock_get_timestampns);
2460 
2461 void sock_enable_timestamp(struct sock *sk, int flag)
2462 {
2463 	if (!sock_flag(sk, flag)) {
2464 		unsigned long previous_flags = sk->sk_flags;
2465 
2466 		sock_set_flag(sk, flag);
2467 		/*
2468 		 * we just set one of the two flags which require net
2469 		 * time stamping, but time stamping might have been on
2470 		 * already because of the other one
2471 		 */
2472 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2473 			net_enable_timestamp();
2474 	}
2475 }
2476 
2477 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2478 		       int level, int type)
2479 {
2480 	struct sock_exterr_skb *serr;
2481 	struct sk_buff *skb;
2482 	int copied, err;
2483 
2484 	err = -EAGAIN;
2485 	skb = sock_dequeue_err_skb(sk);
2486 	if (skb == NULL)
2487 		goto out;
2488 
2489 	copied = skb->len;
2490 	if (copied > len) {
2491 		msg->msg_flags |= MSG_TRUNC;
2492 		copied = len;
2493 	}
2494 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2495 	if (err)
2496 		goto out_free_skb;
2497 
2498 	sock_recv_timestamp(msg, sk, skb);
2499 
2500 	serr = SKB_EXT_ERR(skb);
2501 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2502 
2503 	msg->msg_flags |= MSG_ERRQUEUE;
2504 	err = copied;
2505 
2506 out_free_skb:
2507 	kfree_skb(skb);
2508 out:
2509 	return err;
2510 }
2511 EXPORT_SYMBOL(sock_recv_errqueue);
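
/*
 * Usage sketch (editorial illustration, not part of the original file):
 * a protocol's recvmsg() usually branches to this helper when userspace
 * asked for the error queue:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_EXAMPLE, EXAMPLE_RECVERR);
 *
 * The level/type pair names the control message the caller expects;
 * SOL_EXAMPLE/EXAMPLE_RECVERR are made-up stand-ins for values such as
 * SOL_IP/IP_RECVERR.
 */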
2512 
2513 /*
2514  *	Get a socket option on a socket.
2515  *
2516  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2517  *	asynchronous errors should be reported by getsockopt. We assume
2518  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2519  */
2520 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2521 			   char __user *optval, int __user *optlen)
2522 {
2523 	struct sock *sk = sock->sk;
2524 
2525 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2526 }
2527 EXPORT_SYMBOL(sock_common_getsockopt);
2528 
2529 #ifdef CONFIG_COMPAT
2530 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2531 				  char __user *optval, int __user *optlen)
2532 {
2533 	struct sock *sk = sock->sk;
2534 
2535 	if (sk->sk_prot->compat_getsockopt != NULL)
2536 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2537 						      optval, optlen);
2538 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2539 }
2540 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2541 #endif
2542 
2543 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2544 			struct msghdr *msg, size_t size, int flags)
2545 {
2546 	struct sock *sk = sock->sk;
2547 	int addr_len = 0;
2548 	int err;
2549 
2550 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2551 				   flags & ~MSG_DONTWAIT, &addr_len);
2552 	if (err >= 0)
2553 		msg->msg_namelen = addr_len;
2554 	return err;
2555 }
2556 EXPORT_SYMBOL(sock_common_recvmsg);
2557 
2558 /*
2559  *	Set socket options on an inet socket.
2560  */
2561 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2562 			   char __user *optval, unsigned int optlen)
2563 {
2564 	struct sock *sk = sock->sk;
2565 
2566 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2567 }
2568 EXPORT_SYMBOL(sock_common_setsockopt);
2569 
2570 #ifdef CONFIG_COMPAT
2571 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2572 				  char __user *optval, unsigned int optlen)
2573 {
2574 	struct sock *sk = sock->sk;
2575 
2576 	if (sk->sk_prot->compat_setsockopt != NULL)
2577 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2578 						      optval, optlen);
2579 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2580 }
2581 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2582 #endif
2583 
2584 void sk_common_release(struct sock *sk)
2585 {
2586 	if (sk->sk_prot->destroy)
2587 		sk->sk_prot->destroy(sk);
2588 
2589 	/*
2590 	 * Observation: when sk_common_release is called, processes have
2591 	 * no access to the socket, but the network stack still does.
2592 	 * Step one, detach it from networking:
2593 	 *
2594 	 * A. Remove from hash tables.
2595 	 */
2596 
2597 	sk->sk_prot->unhash(sk);
2598 
2599 	/*
2600 	 * At this point the socket cannot receive new packets, but it is possible
2601 	 * that some packets are in flight because some CPU runs the receiver and
2602 	 * did the hash table lookup before we unhashed the socket. They will reach
2603 	 * the receive queue and be purged by the socket destructor.
2604 	 *
2605 	 * Also we still have packets pending on the receive queue and probably
2606 	 * our own packets waiting in device queues. sock_destroy will drain the
2607 	 * receive queue, but transmitted packets will delay socket destruction
2608 	 * until the last reference is released.
2609 	 */
2610 
2611 	sock_orphan(sk);
2612 
2613 	xfrm_sk_free_policy(sk);
2614 
2615 	sk_refcnt_debug_release(sk);
2616 
2617 	sock_put(sk);
2618 }
2619 EXPORT_SYMBOL(sk_common_release);
2620 
2621 #ifdef CONFIG_PROC_FS
2622 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2623 struct prot_inuse {
2624 	int val[PROTO_INUSE_NR];
2625 };
2626 
2627 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2628 
2629 #ifdef CONFIG_NET_NS
2630 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2631 {
2632 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2633 }
2634 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2635 
2636 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2637 {
2638 	int cpu, idx = prot->inuse_idx;
2639 	int res = 0;
2640 
2641 	for_each_possible_cpu(cpu)
2642 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2643 
2644 	return res >= 0 ? res : 0;
2645 }
2646 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2647 
2648 static int __net_init sock_inuse_init_net(struct net *net)
2649 {
2650 	net->core.inuse = alloc_percpu(struct prot_inuse);
2651 	return net->core.inuse ? 0 : -ENOMEM;
2652 }
2653 
2654 static void __net_exit sock_inuse_exit_net(struct net *net)
2655 {
2656 	free_percpu(net->core.inuse);
2657 }
2658 
2659 static struct pernet_operations net_inuse_ops = {
2660 	.init = sock_inuse_init_net,
2661 	.exit = sock_inuse_exit_net,
2662 };
2663 
2664 static __init int net_inuse_init(void)
2665 {
2666 	if (register_pernet_subsys(&net_inuse_ops))
2667 		panic("Cannot initialize net inuse counters");
2668 
2669 	return 0;
2670 }
2671 
2672 core_initcall(net_inuse_init);
2673 #else
2674 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2675 
2676 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2677 {
2678 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2679 }
2680 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2681 
2682 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2683 {
2684 	int cpu, idx = prot->inuse_idx;
2685 	int res = 0;
2686 
2687 	for_each_possible_cpu(cpu)
2688 		res += per_cpu(prot_inuse, cpu).val[idx];
2689 
2690 	return res >= 0 ? res : 0;
2691 }
2692 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2693 #endif
2694 
2695 static void assign_proto_idx(struct proto *prot)
2696 {
2697 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2698 
2699 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2700 		pr_err("PROTO_INUSE_NR exhausted\n");
2701 		return;
2702 	}
2703 
2704 	set_bit(prot->inuse_idx, proto_inuse_idx);
2705 }
2706 
2707 static void release_proto_idx(struct proto *prot)
2708 {
2709 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2710 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2711 }
2712 #else
2713 static inline void assign_proto_idx(struct proto *prot)
2714 {
2715 }
2716 
2717 static inline void release_proto_idx(struct proto *prot)
2718 {
2719 }
2720 #endif
2721 
2722 int proto_register(struct proto *prot, int alloc_slab)
2723 {
2724 	if (alloc_slab) {
2725 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2726 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2727 					NULL);
2728 
2729 		if (prot->slab == NULL) {
2730 			pr_crit("%s: Can't create sock SLAB cache!\n",
2731 				prot->name);
2732 			goto out;
2733 		}
2734 
2735 		if (prot->rsk_prot != NULL) {
2736 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2737 			if (prot->rsk_prot->slab_name == NULL)
2738 				goto out_free_sock_slab;
2739 
2740 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2741 								 prot->rsk_prot->obj_size, 0,
2742 								 SLAB_HWCACHE_ALIGN, NULL);
2743 
2744 			if (prot->rsk_prot->slab == NULL) {
2745 				pr_crit("%s: Can't create request sock SLAB cache!\n",
2746 					prot->name);
2747 				goto out_free_request_sock_slab_name;
2748 			}
2749 		}
2750 
2751 		if (prot->twsk_prot != NULL) {
2752 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2753 
2754 			if (prot->twsk_prot->twsk_slab_name == NULL)
2755 				goto out_free_request_sock_slab;
2756 
2757 			prot->twsk_prot->twsk_slab =
2758 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2759 						  prot->twsk_prot->twsk_obj_size,
2760 						  0,
2761 						  SLAB_HWCACHE_ALIGN |
2762 							prot->slab_flags,
2763 						  NULL);
2764 			if (prot->twsk_prot->twsk_slab == NULL)
2765 				goto out_free_timewait_sock_slab_name;
2766 		}
2767 	}
2768 
2769 	mutex_lock(&proto_list_mutex);
2770 	list_add(&prot->node, &proto_list);
2771 	assign_proto_idx(prot);
2772 	mutex_unlock(&proto_list_mutex);
2773 	return 0;
2774 
2775 out_free_timewait_sock_slab_name:
2776 	kfree(prot->twsk_prot->twsk_slab_name);
2777 out_free_request_sock_slab:
2778 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2779 		kmem_cache_destroy(prot->rsk_prot->slab);
2780 		prot->rsk_prot->slab = NULL;
2781 	}
2782 out_free_request_sock_slab_name:
2783 	if (prot->rsk_prot)
2784 		kfree(prot->rsk_prot->slab_name);
2785 out_free_sock_slab:
2786 	kmem_cache_destroy(prot->slab);
2787 	prot->slab = NULL;
2788 out:
2789 	return -ENOBUFS;
2790 }
2791 EXPORT_SYMBOL(proto_register);
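
/*
 * Usage sketch (editorial illustration, not part of the original file;
 * "example" names are made up): a protocol module registers its
 * struct proto once at init time and unregisters it on exit:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 *
 * Passing alloc_slab == 1 asks proto_register() to create the
 * per-protocol kmem cache that sk_alloc() will use for this proto.
 */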
2792 
2793 void proto_unregister(struct proto *prot)
2794 {
2795 	mutex_lock(&proto_list_mutex);
2796 	release_proto_idx(prot);
2797 	list_del(&prot->node);
2798 	mutex_unlock(&proto_list_mutex);
2799 
2800 	if (prot->slab != NULL) {
2801 		kmem_cache_destroy(prot->slab);
2802 		prot->slab = NULL;
2803 	}
2804 
2805 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2806 		kmem_cache_destroy(prot->rsk_prot->slab);
2807 		kfree(prot->rsk_prot->slab_name);
2808 		prot->rsk_prot->slab = NULL;
2809 	}
2810 
2811 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2812 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2813 		kfree(prot->twsk_prot->twsk_slab_name);
2814 		prot->twsk_prot->twsk_slab = NULL;
2815 	}
2816 }
2817 EXPORT_SYMBOL(proto_unregister);
2818 
2819 #ifdef CONFIG_PROC_FS
2820 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2821 	__acquires(proto_list_mutex)
2822 {
2823 	mutex_lock(&proto_list_mutex);
2824 	return seq_list_start_head(&proto_list, *pos);
2825 }
2826 
2827 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2828 {
2829 	return seq_list_next(v, &proto_list, pos);
2830 }
2831 
2832 static void proto_seq_stop(struct seq_file *seq, void *v)
2833 	__releases(proto_list_mutex)
2834 {
2835 	mutex_unlock(&proto_list_mutex);
2836 }
2837 
2838 static char proto_method_implemented(const void *method)
2839 {
2840 	return method == NULL ? 'n' : 'y';
2841 }
2842 static long sock_prot_memory_allocated(struct proto *proto)
2843 {
2844 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2845 }
2846 
2847 static char *sock_prot_memory_pressure(struct proto *proto)
2848 {
2849 	return proto->memory_pressure != NULL ?
2850 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2851 }
2852 
2853 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2854 {
2855 
2856 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2857 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2858 		   proto->name,
2859 		   proto->obj_size,
2860 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2861 		   sock_prot_memory_allocated(proto),
2862 		   sock_prot_memory_pressure(proto),
2863 		   proto->max_header,
2864 		   proto->slab == NULL ? "no" : "yes",
2865 		   module_name(proto->owner),
2866 		   proto_method_implemented(proto->close),
2867 		   proto_method_implemented(proto->connect),
2868 		   proto_method_implemented(proto->disconnect),
2869 		   proto_method_implemented(proto->accept),
2870 		   proto_method_implemented(proto->ioctl),
2871 		   proto_method_implemented(proto->init),
2872 		   proto_method_implemented(proto->destroy),
2873 		   proto_method_implemented(proto->shutdown),
2874 		   proto_method_implemented(proto->setsockopt),
2875 		   proto_method_implemented(proto->getsockopt),
2876 		   proto_method_implemented(proto->sendmsg),
2877 		   proto_method_implemented(proto->recvmsg),
2878 		   proto_method_implemented(proto->sendpage),
2879 		   proto_method_implemented(proto->bind),
2880 		   proto_method_implemented(proto->backlog_rcv),
2881 		   proto_method_implemented(proto->hash),
2882 		   proto_method_implemented(proto->unhash),
2883 		   proto_method_implemented(proto->get_port),
2884 		   proto_method_implemented(proto->enter_memory_pressure));
2885 }
2886 
2887 static int proto_seq_show(struct seq_file *seq, void *v)
2888 {
2889 	if (v == &proto_list)
2890 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2891 			   "protocol",
2892 			   "size",
2893 			   "sockets",
2894 			   "memory",
2895 			   "press",
2896 			   "maxhdr",
2897 			   "slab",
2898 			   "module",
2899 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2900 	else
2901 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2902 	return 0;
2903 }
2904 
2905 static const struct seq_operations proto_seq_ops = {
2906 	.start  = proto_seq_start,
2907 	.next   = proto_seq_next,
2908 	.stop   = proto_seq_stop,
2909 	.show   = proto_seq_show,
2910 };
2911 
2912 static int proto_seq_open(struct inode *inode, struct file *file)
2913 {
2914 	return seq_open_net(inode, file, &proto_seq_ops,
2915 			    sizeof(struct seq_net_private));
2916 }
2917 
2918 static const struct file_operations proto_seq_fops = {
2919 	.owner		= THIS_MODULE,
2920 	.open		= proto_seq_open,
2921 	.read		= seq_read,
2922 	.llseek		= seq_lseek,
2923 	.release	= seq_release_net,
2924 };
2925 
2926 static __net_init int proto_init_net(struct net *net)
2927 {
2928 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2929 		return -ENOMEM;
2930 
2931 	return 0;
2932 }
2933 
2934 static __net_exit void proto_exit_net(struct net *net)
2935 {
2936 	remove_proc_entry("protocols", net->proc_net);
2937 }
2938 
2939 
2940 static __net_initdata struct pernet_operations proto_net_ops = {
2941 	.init = proto_init_net,
2942 	.exit = proto_exit_net,
2943 };
2944 
2945 static int __init proto_init(void)
2946 {
2947 	return register_pernet_subsys(&proto_net_ops);
2948 }
2949 
2950 subsys_initcall(proto_init);
2951 
2952 #endif /* PROC_FS */
2953