1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120
121 #include <asm/uaccess.h>
122
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138
139 #include <trace/events/sock.h>
140
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
143
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146
147 /**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153  * Test to see if the opener of the socket had the capability @cap in the
154  * user namespace @user_ns when the socket was created, and whether the
155  * current process has it as well.
156 */
157 bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159 {
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164
165 /**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170  * Test to see if the opener of the socket had the capability @cap in all
171  * user namespaces when the socket was created, and whether the current
172  * process has it as well.
173 */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179
180 /**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185  * Test to see if the opener of the socket had the capability @cap over the
186  * network namespace the socket is a member of when the socket was created,
187  * and whether the current process has it as well.
188 */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194
195 /*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family:
198 */
199 static struct lock_class_key af_family_keys[AF_MAX];
200 static struct lock_class_key af_family_slock_keys[AF_MAX];
201
202 /*
203 * Make lock validator output more readable. (we pre-construct these
204  * strings at build time, so that runtime initialization of socket
205 * locks is fast):
206 */
207 static const char *const af_family_key_strings[AF_MAX+1] = {
208 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
209 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
210 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
211 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
212 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
213 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
214 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
215 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
216 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
217 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
218 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
219 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
220 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
221 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
222 "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
223 };
224 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
225 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
226 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
227 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
228 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
229 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
230 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
231 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
232 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
233 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
234 "slock-27" , "slock-28" , "slock-AF_CAN" ,
235 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
236 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
237 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
238 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
239 "slock-AF_QIPCRTR", "slock-AF_MAX"
240 };
241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
243 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
244 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
245 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
246 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
247 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
248 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
249 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
250 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
251 "clock-27" , "clock-28" , "clock-AF_CAN" ,
252 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
253 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
254 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
255 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
256 "clock-AF_QIPCRTR", "clock-AF_MAX"
257 };
258
259 /*
260 * sk_callback_lock locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264
265 /* Take into consideration the size of the struct sk_buff overhead in the
266 * determination of these values, since that is non-constant across
267 * platforms. This makes socket queueing behavior and performance
268 * not depend upon such differences.
269 */
270 #define _SK_MEM_PACKETS 256
271 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
272 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
273 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
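/*
 * Worked example of the defaults above (the exact figure depends on the
 * architecture and config, since SKB_TRUESIZE() folds in sizeof(struct
 * sk_buff) and the shared info): on a typical 64-bit build SKB_TRUESIZE(256)
 * comes to 832 bytes, so SK_WMEM_MAX = SK_RMEM_MAX = 832 * 256 = 212992
 * bytes (~208 KiB), which is what the sysctl defaults below start out as.
 */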
274
275 /* Run time adjustable parameters. */
276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
277 EXPORT_SYMBOL(sysctl_wmem_max);
278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
279 EXPORT_SYMBOL(sysctl_rmem_max);
280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
282
283 /* Maximal space eaten by iovec or ancillary data plus some space */
284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
285 EXPORT_SYMBOL(sysctl_optmem_max);
286
287 int sysctl_tstamp_allow_data __read_mostly = 1;
288
289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290 EXPORT_SYMBOL_GPL(memalloc_socks);
291
292 /**
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
295 *
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements.
299 */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 sock_set_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation |= __GFP_MEMALLOC;
304 static_key_slow_inc(&memalloc_socks);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 sock_reset_flag(sk, SOCK_MEMALLOC);
311 sk->sk_allocation &= ~__GFP_MEMALLOC;
312 static_key_slow_dec(&memalloc_socks);
313
314 /*
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 */
321 sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
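/*
 * Hedged usage sketch (not taken from this file): a subsystem whose I/O
 * memory reclaim depends on, e.g. swap over a network filesystem, marks its
 * transport socket so transmissions may dip into the emergency reserves:
 *
 *	sk_set_memalloc(xprt_sk);
 *	... perform reclaim-critical I/O ...
 *	sk_clear_memalloc(xprt_sk);
 *
 * "xprt_sk" is a hypothetical transport socket used only for illustration.
 */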
324
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 int ret;
328 unsigned long pflags = current->flags;
329
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332
333 current->flags |= PF_MEMALLOC;
334 ret = sk->sk_backlog_rcv(sk, skb);
335 tsk_restore_flags(current, pflags, PF_MEMALLOC);
336
337 return ret;
338 }
339 EXPORT_SYMBOL(__sk_backlog_rcv);
340
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
342 {
343 struct timeval tv;
344
345 if (optlen < sizeof(tv))
346 return -EINVAL;
347 if (copy_from_user(&tv, optval, sizeof(tv)))
348 return -EFAULT;
349 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 return -EDOM;
351
352 if (tv.tv_sec < 0) {
353 static int warned __read_mostly;
354
355 *timeo_p = 0;
356 if (warned < 10 && net_ratelimit()) {
357 warned++;
358 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 __func__, current->comm, task_pid_nr(current));
360 }
361 return 0;
362 }
363 *timeo_p = MAX_SCHEDULE_TIMEOUT;
364 if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 return 0;
366 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 return 0;
369 }
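/*
 * Worked example of the conversion above, assuming HZ == 1000: a request of
 * { .tv_sec = 2, .tv_usec = 500000 } becomes
 *	*timeo_p = 2 * HZ + (500000 + 999) / 1000 = 2500 jiffies,
 * while { 0, 0 } is left as MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
 */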
370
371 static void sock_warn_obsolete_bsdism(const char *name)
372 {
373 static int warned;
374 static char warncomm[TASK_COMM_LEN];
375 if (strcmp(warncomm, current->comm) && warned < 5) {
376 strcpy(warncomm, current->comm);
377 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 warncomm, name);
379 warned++;
380 }
381 }
382
383 static bool sock_needs_netstamp(const struct sock *sk)
384 {
385 switch (sk->sk_family) {
386 case AF_UNSPEC:
387 case AF_UNIX:
388 return false;
389 default:
390 return true;
391 }
392 }
393
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
395 {
396 if (sk->sk_flags & flags) {
397 sk->sk_flags &= ~flags;
398 if (sock_needs_netstamp(sk) &&
399 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 net_disable_timestamp();
401 }
402 }
403
404
405 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
406 {
407 unsigned long flags;
408 struct sk_buff_head *list = &sk->sk_receive_queue;
409
410 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
411 atomic_inc(&sk->sk_drops);
412 trace_sock_rcvqueue_full(sk, skb);
413 return -ENOMEM;
414 }
415
416 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
417 atomic_inc(&sk->sk_drops);
418 return -ENOBUFS;
419 }
420
421 skb->dev = NULL;
422 skb_set_owner_r(skb, sk);
423
424 	/* we escape from the RCU-protected region, make sure we don't leak
425 	 * a non-refcounted dst
426 */
427 skb_dst_force(skb);
428
429 spin_lock_irqsave(&list->lock, flags);
430 sock_skb_set_dropcount(sk, skb);
431 __skb_queue_tail(list, skb);
432 spin_unlock_irqrestore(&list->lock, flags);
433
434 if (!sock_flag(sk, SOCK_DEAD))
435 sk->sk_data_ready(sk);
436 return 0;
437 }
438 EXPORT_SYMBOL(__sock_queue_rcv_skb);
439
440 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
441 {
442 int err;
443
444 err = sk_filter(sk, skb);
445 if (err)
446 return err;
447
448 return __sock_queue_rcv_skb(sk, skb);
449 }
450 EXPORT_SYMBOL(sock_queue_rcv_skb);
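/*
 * Hedged sketch of the usual caller pattern in a protocol's receive path
 * (simplified): on error the skb is not consumed, so the caller frees it.
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0) {
 *		kfree_skb(skb);		-- not consumed on error
 *		return err;
 *	}
 *	return 0;
 */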
451
452 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
453 const int nested, unsigned int trim_cap, bool refcounted)
454 {
455 int rc = NET_RX_SUCCESS;
456
457 if (sk_filter_trim_cap(sk, skb, trim_cap))
458 goto discard_and_relse;
459
460 skb->dev = NULL;
461
462 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
463 atomic_inc(&sk->sk_drops);
464 goto discard_and_relse;
465 }
466 if (nested)
467 bh_lock_sock_nested(sk);
468 else
469 bh_lock_sock(sk);
470 if (!sock_owned_by_user(sk)) {
471 /*
472 * trylock + unlock semantics:
473 */
474 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
475
476 rc = sk_backlog_rcv(sk, skb);
477
478 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
479 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
480 bh_unlock_sock(sk);
481 atomic_inc(&sk->sk_drops);
482 goto discard_and_relse;
483 }
484
485 bh_unlock_sock(sk);
486 out:
487 if (refcounted)
488 sock_put(sk);
489 return rc;
490 discard_and_relse:
491 kfree_skb(skb);
492 goto out;
493 }
494 EXPORT_SYMBOL(__sk_receive_skb);
495
496 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
497 {
498 struct dst_entry *dst = __sk_dst_get(sk);
499
500 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501 sk_tx_queue_clear(sk);
502 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
503 dst_release(dst);
504 return NULL;
505 }
506
507 return dst;
508 }
509 EXPORT_SYMBOL(__sk_dst_check);
510
511 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
512 {
513 struct dst_entry *dst = sk_dst_get(sk);
514
515 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
516 sk_dst_reset(sk);
517 dst_release(dst);
518 return NULL;
519 }
520
521 return dst;
522 }
523 EXPORT_SYMBOL(sk_dst_check);
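/*
 * Hedged sketch of how an output path revalidates a cached route; the
 * cookie is protocol specific (0 here purely for illustration):
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst) {
 *		-- the cached entry was obsolete: do a fresh route lookup
 *		-- and install the result with sk_dst_set(sk, dst)
 *	}
 */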
524
525 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
526 int optlen)
527 {
528 int ret = -ENOPROTOOPT;
529 #ifdef CONFIG_NETDEVICES
530 struct net *net = sock_net(sk);
531 char devname[IFNAMSIZ];
532 int index;
533
534 /* Sorry... */
535 ret = -EPERM;
536 if (!ns_capable(net->user_ns, CAP_NET_RAW))
537 goto out;
538
539 ret = -EINVAL;
540 if (optlen < 0)
541 goto out;
542
543 /* Bind this socket to a particular device like "eth0",
544 * as specified in the passed interface name. If the
545 * name is "" or the option length is zero the socket
546 * is not bound.
547 */
548 if (optlen > IFNAMSIZ - 1)
549 optlen = IFNAMSIZ - 1;
550 memset(devname, 0, sizeof(devname));
551
552 ret = -EFAULT;
553 if (copy_from_user(devname, optval, optlen))
554 goto out;
555
556 index = 0;
557 if (devname[0] != '\0') {
558 struct net_device *dev;
559
560 rcu_read_lock();
561 dev = dev_get_by_name_rcu(net, devname);
562 if (dev)
563 index = dev->ifindex;
564 rcu_read_unlock();
565 ret = -ENODEV;
566 if (!dev)
567 goto out;
568 }
569
570 lock_sock(sk);
571 sk->sk_bound_dev_if = index;
572 sk_dst_reset(sk);
573 release_sock(sk);
574
575 ret = 0;
576
577 out:
578 #endif
579
580 return ret;
581 }
582
583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
584 int __user *optlen, int len)
585 {
586 int ret = -ENOPROTOOPT;
587 #ifdef CONFIG_NETDEVICES
588 struct net *net = sock_net(sk);
589 char devname[IFNAMSIZ];
590
591 if (sk->sk_bound_dev_if == 0) {
592 len = 0;
593 goto zero;
594 }
595
596 ret = -EINVAL;
597 if (len < IFNAMSIZ)
598 goto out;
599
600 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
601 if (ret)
602 goto out;
603
604 len = strlen(devname) + 1;
605
606 ret = -EFAULT;
607 if (copy_to_user(optval, devname, len))
608 goto out;
609
610 zero:
611 ret = -EFAULT;
612 if (put_user(len, optlen))
613 goto out;
614
615 ret = 0;
616
617 out:
618 #endif
619
620 return ret;
621 }
622
623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
624 {
625 if (valbool)
626 sock_set_flag(sk, bit);
627 else
628 sock_reset_flag(sk, bit);
629 }
630
631 bool sk_mc_loop(struct sock *sk)
632 {
633 if (dev_recursion_level())
634 return false;
635 if (!sk)
636 return true;
637 switch (sk->sk_family) {
638 case AF_INET:
639 return inet_sk(sk)->mc_loop;
640 #if IS_ENABLED(CONFIG_IPV6)
641 case AF_INET6:
642 return inet6_sk(sk)->mc_loop;
643 #endif
644 }
645 WARN_ON(1);
646 return true;
647 }
648 EXPORT_SYMBOL(sk_mc_loop);
649
650 /*
651 * This is meant for all protocols to use and covers goings on
652 * at the socket level. Everything here is generic.
653 */
654
655 int sock_setsockopt(struct socket *sock, int level, int optname,
656 char __user *optval, unsigned int optlen)
657 {
658 struct sock *sk = sock->sk;
659 int val;
660 int valbool;
661 struct linger ling;
662 int ret = 0;
663
664 /*
665 * Options without arguments
666 */
667
668 if (optname == SO_BINDTODEVICE)
669 return sock_setbindtodevice(sk, optval, optlen);
670
671 if (optlen < sizeof(int))
672 return -EINVAL;
673
674 if (get_user(val, (int __user *)optval))
675 return -EFAULT;
676
677 valbool = val ? 1 : 0;
678
679 lock_sock(sk);
680
681 switch (optname) {
682 case SO_DEBUG:
683 if (val && !capable(CAP_NET_ADMIN))
684 ret = -EACCES;
685 else
686 sock_valbool_flag(sk, SOCK_DBG, valbool);
687 break;
688 case SO_REUSEADDR:
689 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
690 break;
691 case SO_REUSEPORT:
692 sk->sk_reuseport = valbool;
693 break;
694 case SO_TYPE:
695 case SO_PROTOCOL:
696 case SO_DOMAIN:
697 case SO_ERROR:
698 ret = -ENOPROTOOPT;
699 break;
700 case SO_DONTROUTE:
701 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
702 break;
703 case SO_BROADCAST:
704 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 break;
706 case SO_SNDBUF:
707 		/* Don't error on this; BSD doesn't, and if you think
708 		 * about it, this is right. Otherwise apps have to
709 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 		 * are treated in BSD as hints.
711 */
712 val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 /* Wake up sending tasks if we upped the value. */
717 sk->sk_write_space(sk);
718 break;
719
720 case SO_SNDBUFFORCE:
721 if (!capable(CAP_NET_ADMIN)) {
722 ret = -EPERM;
723 break;
724 }
725 goto set_sndbuf;
726
727 case SO_RCVBUF:
728 		/* Don't error on this; BSD doesn't, and if you think
729 		 * about it, this is right. Otherwise apps have to
730 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 		 * are treated in BSD as hints.
732 */
733 val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
736 /*
737 * We double it on the way in to account for
738 * "struct sk_buff" etc. overhead. Applications
739 * assume that the SO_RCVBUF setting they make will
740 * allow that much actual data to be received on that
741 * socket.
742 *
743 * Applications are unaware that "struct sk_buff" and
744 * other overheads allocate from the receive buffer
745 * during socket buffer allocation.
746 *
747 * And after considering the possible alternatives,
748 * returning the value we actually used in getsockopt
749 * is the most desirable behavior.
750 */
751 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 break;
753
754 case SO_RCVBUFFORCE:
755 if (!capable(CAP_NET_ADMIN)) {
756 ret = -EPERM;
757 break;
758 }
759 goto set_rcvbuf;
760
761 case SO_KEEPALIVE:
762 #ifdef CONFIG_INET
763 if (sk->sk_protocol == IPPROTO_TCP &&
764 sk->sk_type == SOCK_STREAM)
765 tcp_set_keepalive(sk, valbool);
766 #endif
767 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
768 break;
769
770 case SO_OOBINLINE:
771 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
772 break;
773
774 case SO_NO_CHECK:
775 sk->sk_no_check_tx = valbool;
776 break;
777
778 case SO_PRIORITY:
779 if ((val >= 0 && val <= 6) ||
780 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
781 sk->sk_priority = val;
782 else
783 ret = -EPERM;
784 break;
785
786 case SO_LINGER:
787 if (optlen < sizeof(ling)) {
788 ret = -EINVAL; /* 1003.1g */
789 break;
790 }
791 if (copy_from_user(&ling, optval, sizeof(ling))) {
792 ret = -EFAULT;
793 break;
794 }
795 if (!ling.l_onoff)
796 sock_reset_flag(sk, SOCK_LINGER);
797 else {
798 #if (BITS_PER_LONG == 32)
799 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
800 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
801 else
802 #endif
803 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
804 sock_set_flag(sk, SOCK_LINGER);
805 }
806 break;
807
808 case SO_BSDCOMPAT:
809 sock_warn_obsolete_bsdism("setsockopt");
810 break;
811
812 case SO_PASSCRED:
813 if (valbool)
814 set_bit(SOCK_PASSCRED, &sock->flags);
815 else
816 clear_bit(SOCK_PASSCRED, &sock->flags);
817 break;
818
819 case SO_TIMESTAMP:
820 case SO_TIMESTAMPNS:
821 if (valbool) {
822 if (optname == SO_TIMESTAMP)
823 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
824 else
825 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
826 sock_set_flag(sk, SOCK_RCVTSTAMP);
827 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 } else {
829 sock_reset_flag(sk, SOCK_RCVTSTAMP);
830 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
831 }
832 break;
833
834 case SO_TIMESTAMPING:
835 if (val & ~SOF_TIMESTAMPING_MASK) {
836 ret = -EINVAL;
837 break;
838 }
839
840 if (val & SOF_TIMESTAMPING_OPT_ID &&
841 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
842 if (sk->sk_protocol == IPPROTO_TCP &&
843 sk->sk_type == SOCK_STREAM) {
844 if ((1 << sk->sk_state) &
845 (TCPF_CLOSE | TCPF_LISTEN)) {
846 ret = -EINVAL;
847 break;
848 }
849 sk->sk_tskey = tcp_sk(sk)->snd_una;
850 } else {
851 sk->sk_tskey = 0;
852 }
853 }
854 sk->sk_tsflags = val;
855 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
856 sock_enable_timestamp(sk,
857 SOCK_TIMESTAMPING_RX_SOFTWARE);
858 else
859 sock_disable_timestamp(sk,
860 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
861 break;
862
863 case SO_RCVLOWAT:
864 if (val < 0)
865 val = INT_MAX;
866 sk->sk_rcvlowat = val ? : 1;
867 break;
868
869 case SO_RCVTIMEO:
870 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
871 break;
872
873 case SO_SNDTIMEO:
874 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
875 break;
876
877 case SO_ATTACH_FILTER:
878 ret = -EINVAL;
879 if (optlen == sizeof(struct sock_fprog)) {
880 struct sock_fprog fprog;
881
882 ret = -EFAULT;
883 if (copy_from_user(&fprog, optval, sizeof(fprog)))
884 break;
885
886 ret = sk_attach_filter(&fprog, sk);
887 }
888 break;
889
890 case SO_ATTACH_BPF:
891 ret = -EINVAL;
892 if (optlen == sizeof(u32)) {
893 u32 ufd;
894
895 ret = -EFAULT;
896 if (copy_from_user(&ufd, optval, sizeof(ufd)))
897 break;
898
899 ret = sk_attach_bpf(ufd, sk);
900 }
901 break;
902
903 case SO_ATTACH_REUSEPORT_CBPF:
904 ret = -EINVAL;
905 if (optlen == sizeof(struct sock_fprog)) {
906 struct sock_fprog fprog;
907
908 ret = -EFAULT;
909 if (copy_from_user(&fprog, optval, sizeof(fprog)))
910 break;
911
912 ret = sk_reuseport_attach_filter(&fprog, sk);
913 }
914 break;
915
916 case SO_ATTACH_REUSEPORT_EBPF:
917 ret = -EINVAL;
918 if (optlen == sizeof(u32)) {
919 u32 ufd;
920
921 ret = -EFAULT;
922 if (copy_from_user(&ufd, optval, sizeof(ufd)))
923 break;
924
925 ret = sk_reuseport_attach_bpf(ufd, sk);
926 }
927 break;
928
929 case SO_DETACH_FILTER:
930 ret = sk_detach_filter(sk);
931 break;
932
933 case SO_LOCK_FILTER:
934 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
935 ret = -EPERM;
936 else
937 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
938 break;
939
940 case SO_PASSSEC:
941 if (valbool)
942 set_bit(SOCK_PASSSEC, &sock->flags);
943 else
944 clear_bit(SOCK_PASSSEC, &sock->flags);
945 break;
946 case SO_MARK:
947 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
948 ret = -EPERM;
949 else
950 sk->sk_mark = val;
951 break;
952
953 case SO_RXQ_OVFL:
954 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
955 break;
956
957 case SO_WIFI_STATUS:
958 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
959 break;
960
961 case SO_PEEK_OFF:
962 if (sock->ops->set_peek_off)
963 ret = sock->ops->set_peek_off(sk, val);
964 else
965 ret = -EOPNOTSUPP;
966 break;
967
968 case SO_NOFCS:
969 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
970 break;
971
972 case SO_SELECT_ERR_QUEUE:
973 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
974 break;
975
976 #ifdef CONFIG_NET_RX_BUSY_POLL
977 case SO_BUSY_POLL:
978 /* allow unprivileged users to decrease the value */
979 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
980 ret = -EPERM;
981 else {
982 if (val < 0)
983 ret = -EINVAL;
984 else
985 sk->sk_ll_usec = val;
986 }
987 break;
988 #endif
989
990 case SO_MAX_PACING_RATE:
991 sk->sk_max_pacing_rate = val;
992 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
993 sk->sk_max_pacing_rate);
994 break;
995
996 case SO_INCOMING_CPU:
997 sk->sk_incoming_cpu = val;
998 break;
999
1000 case SO_CNX_ADVICE:
1001 if (val == 1)
1002 dst_negative_advice(sk);
1003 break;
1004 default:
1005 ret = -ENOPROTOOPT;
1006 break;
1007 }
1008 release_sock(sk);
1009 return ret;
1010 }
1011 EXPORT_SYMBOL(sock_setsockopt);
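/*
 * Hedged userspace-side illustration of the SO_RCVBUF/SO_SNDBUF handling
 * above: the request is clamped to the rmem/wmem sysctl maximum and then
 * doubled to cover struct sk_buff overhead, and getsockopt() reports the
 * doubled value:
 *
 *	int req = 65536, got; socklen_t len = sizeof(got);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	-- "got" is typically 131072 (2 * req), subject to rmem_max; "fd" is
 *	-- a hypothetical open socket.
 */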
1012
1013
1014 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1015 struct ucred *ucred)
1016 {
1017 ucred->pid = pid_vnr(pid);
1018 ucred->uid = ucred->gid = -1;
1019 if (cred) {
1020 struct user_namespace *current_ns = current_user_ns();
1021
1022 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1023 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1024 }
1025 }
1026
1027 int sock_getsockopt(struct socket *sock, int level, int optname,
1028 char __user *optval, int __user *optlen)
1029 {
1030 struct sock *sk = sock->sk;
1031
1032 union {
1033 int val;
1034 u64 val64;
1035 struct linger ling;
1036 struct timeval tm;
1037 } v;
1038
1039 int lv = sizeof(int);
1040 int len;
1041
1042 if (get_user(len, optlen))
1043 return -EFAULT;
1044 if (len < 0)
1045 return -EINVAL;
1046
1047 memset(&v, 0, sizeof(v));
1048
1049 switch (optname) {
1050 case SO_DEBUG:
1051 v.val = sock_flag(sk, SOCK_DBG);
1052 break;
1053
1054 case SO_DONTROUTE:
1055 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1056 break;
1057
1058 case SO_BROADCAST:
1059 v.val = sock_flag(sk, SOCK_BROADCAST);
1060 break;
1061
1062 case SO_SNDBUF:
1063 v.val = sk->sk_sndbuf;
1064 break;
1065
1066 case SO_RCVBUF:
1067 v.val = sk->sk_rcvbuf;
1068 break;
1069
1070 case SO_REUSEADDR:
1071 v.val = sk->sk_reuse;
1072 break;
1073
1074 case SO_REUSEPORT:
1075 v.val = sk->sk_reuseport;
1076 break;
1077
1078 case SO_KEEPALIVE:
1079 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1080 break;
1081
1082 case SO_TYPE:
1083 v.val = sk->sk_type;
1084 break;
1085
1086 case SO_PROTOCOL:
1087 v.val = sk->sk_protocol;
1088 break;
1089
1090 case SO_DOMAIN:
1091 v.val = sk->sk_family;
1092 break;
1093
1094 case SO_ERROR:
1095 v.val = -sock_error(sk);
1096 if (v.val == 0)
1097 v.val = xchg(&sk->sk_err_soft, 0);
1098 break;
1099
1100 case SO_OOBINLINE:
1101 v.val = sock_flag(sk, SOCK_URGINLINE);
1102 break;
1103
1104 case SO_NO_CHECK:
1105 v.val = sk->sk_no_check_tx;
1106 break;
1107
1108 case SO_PRIORITY:
1109 v.val = sk->sk_priority;
1110 break;
1111
1112 case SO_LINGER:
1113 lv = sizeof(v.ling);
1114 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1115 v.ling.l_linger = sk->sk_lingertime / HZ;
1116 break;
1117
1118 case SO_BSDCOMPAT:
1119 sock_warn_obsolete_bsdism("getsockopt");
1120 break;
1121
1122 case SO_TIMESTAMP:
1123 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1124 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1125 break;
1126
1127 case SO_TIMESTAMPNS:
1128 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1129 break;
1130
1131 case SO_TIMESTAMPING:
1132 v.val = sk->sk_tsflags;
1133 break;
1134
1135 case SO_RCVTIMEO:
1136 lv = sizeof(struct timeval);
1137 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1138 v.tm.tv_sec = 0;
1139 v.tm.tv_usec = 0;
1140 } else {
1141 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1142 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1143 }
1144 break;
1145
1146 case SO_SNDTIMEO:
1147 lv = sizeof(struct timeval);
1148 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1149 v.tm.tv_sec = 0;
1150 v.tm.tv_usec = 0;
1151 } else {
1152 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1153 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1154 }
1155 break;
1156
1157 case SO_RCVLOWAT:
1158 v.val = sk->sk_rcvlowat;
1159 break;
1160
1161 case SO_SNDLOWAT:
1162 v.val = 1;
1163 break;
1164
1165 case SO_PASSCRED:
1166 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1167 break;
1168
1169 case SO_PEERCRED:
1170 {
1171 struct ucred peercred;
1172 if (len > sizeof(peercred))
1173 len = sizeof(peercred);
1174 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1175 if (copy_to_user(optval, &peercred, len))
1176 return -EFAULT;
1177 goto lenout;
1178 }
1179
1180 case SO_PEERNAME:
1181 {
1182 char address[128];
1183
1184 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1185 return -ENOTCONN;
1186 if (lv < len)
1187 return -EINVAL;
1188 if (copy_to_user(optval, address, len))
1189 return -EFAULT;
1190 goto lenout;
1191 }
1192
1193 /* Dubious BSD thing... Probably nobody even uses it, but
1194 * the UNIX standard wants it for whatever reason... -DaveM
1195 */
1196 case SO_ACCEPTCONN:
1197 v.val = sk->sk_state == TCP_LISTEN;
1198 break;
1199
1200 case SO_PASSSEC:
1201 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1202 break;
1203
1204 case SO_PEERSEC:
1205 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1206
1207 case SO_MARK:
1208 v.val = sk->sk_mark;
1209 break;
1210
1211 case SO_RXQ_OVFL:
1212 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1213 break;
1214
1215 case SO_WIFI_STATUS:
1216 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1217 break;
1218
1219 case SO_PEEK_OFF:
1220 if (!sock->ops->set_peek_off)
1221 return -EOPNOTSUPP;
1222
1223 v.val = sk->sk_peek_off;
1224 break;
1225 case SO_NOFCS:
1226 v.val = sock_flag(sk, SOCK_NOFCS);
1227 break;
1228
1229 case SO_BINDTODEVICE:
1230 return sock_getbindtodevice(sk, optval, optlen, len);
1231
1232 case SO_GET_FILTER:
1233 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1234 if (len < 0)
1235 return len;
1236
1237 goto lenout;
1238
1239 case SO_LOCK_FILTER:
1240 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1241 break;
1242
1243 case SO_BPF_EXTENSIONS:
1244 v.val = bpf_tell_extensions();
1245 break;
1246
1247 case SO_SELECT_ERR_QUEUE:
1248 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1249 break;
1250
1251 #ifdef CONFIG_NET_RX_BUSY_POLL
1252 case SO_BUSY_POLL:
1253 v.val = sk->sk_ll_usec;
1254 break;
1255 #endif
1256
1257 case SO_MAX_PACING_RATE:
1258 v.val = sk->sk_max_pacing_rate;
1259 break;
1260
1261 case SO_INCOMING_CPU:
1262 v.val = sk->sk_incoming_cpu;
1263 break;
1264
1265
1266 case SO_COOKIE:
1267 lv = sizeof(u64);
1268 if (len < lv)
1269 return -EINVAL;
1270 v.val64 = sock_gen_cookie(sk);
1271 break;
1272 default:
1273 /* We implement the SO_SNDLOWAT etc to not be settable
1274 * (1003.1g 7).
1275 */
1276 return -ENOPROTOOPT;
1277 }
1278
1279 if (len > lv)
1280 len = lv;
1281 if (copy_to_user(optval, &v, len))
1282 return -EFAULT;
1283 lenout:
1284 if (put_user(len, optlen))
1285 return -EFAULT;
1286 return 0;
1287 }
1288
1289 /*
1290 * Initialize an sk_lock.
1291 *
1292 * (We also register the sk_lock with the lock validator.)
1293 */
1294 static inline void sock_lock_init(struct sock *sk)
1295 {
1296 sock_lock_init_class_and_name(sk,
1297 af_family_slock_key_strings[sk->sk_family],
1298 af_family_slock_keys + sk->sk_family,
1299 af_family_key_strings[sk->sk_family],
1300 af_family_keys + sk->sk_family);
1301 }
1302
1303 /*
1304 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1305  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1306 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1307 */
1308 static void sock_copy(struct sock *nsk, const struct sock *osk)
1309 {
1310 #ifdef CONFIG_SECURITY_NETWORK
1311 void *sptr = nsk->sk_security;
1312 #endif
1313 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1314
1315 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1316 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1317
1318 #ifdef CONFIG_SECURITY_NETWORK
1319 nsk->sk_security = sptr;
1320 security_sk_clone(osk, nsk);
1321 #endif
1322 }
1323
1324 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1325 int family)
1326 {
1327 struct sock *sk;
1328 struct kmem_cache *slab;
1329
1330 slab = prot->slab;
1331 if (slab != NULL) {
1332 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1333 if (!sk)
1334 return sk;
1335 if (priority & __GFP_ZERO)
1336 sk_prot_clear_nulls(sk, prot->obj_size);
1337 } else
1338 sk = kmalloc(prot->obj_size, priority);
1339
1340 if (sk != NULL) {
1341 kmemcheck_annotate_bitfield(sk, flags);
1342
1343 if (security_sk_alloc(sk, family, priority))
1344 goto out_free;
1345
1346 if (!try_module_get(prot->owner))
1347 goto out_free_sec;
1348 sk_tx_queue_clear(sk);
1349 }
1350
1351 return sk;
1352
1353 out_free_sec:
1354 security_sk_free(sk);
1355 out_free:
1356 if (slab != NULL)
1357 kmem_cache_free(slab, sk);
1358 else
1359 kfree(sk);
1360 return NULL;
1361 }
1362
1363 static void sk_prot_free(struct proto *prot, struct sock *sk)
1364 {
1365 struct kmem_cache *slab;
1366 struct module *owner;
1367
1368 owner = prot->owner;
1369 slab = prot->slab;
1370
1371 cgroup_sk_free(&sk->sk_cgrp_data);
1372 mem_cgroup_sk_free(sk);
1373 security_sk_free(sk);
1374 if (slab != NULL)
1375 kmem_cache_free(slab, sk);
1376 else
1377 kfree(sk);
1378 module_put(owner);
1379 }
1380
1381 /**
1382 * sk_alloc - All socket objects are allocated here
1383 * @net: the applicable net namespace
1384 * @family: protocol family
1385 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1386 * @prot: struct proto associated with this new sock instance
1387 * @kern: is this to be a kernel socket?
1388 */
1389 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1390 struct proto *prot, int kern)
1391 {
1392 struct sock *sk;
1393
1394 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1395 if (sk) {
1396 sk->sk_family = family;
1397 /*
1398 * See comment in struct sock definition to understand
1399 * why we need sk_prot_creator -acme
1400 */
1401 sk->sk_prot = sk->sk_prot_creator = prot;
1402 sock_lock_init(sk);
1403 sk->sk_net_refcnt = kern ? 0 : 1;
1404 if (likely(sk->sk_net_refcnt))
1405 get_net(net);
1406 sock_net_set(sk, net);
1407 atomic_set(&sk->sk_wmem_alloc, 1);
1408
1409 mem_cgroup_sk_alloc(sk);
1410 cgroup_sk_alloc(&sk->sk_cgrp_data);
1411 sock_update_classid(&sk->sk_cgrp_data);
1412 sock_update_netprioidx(&sk->sk_cgrp_data);
1413 }
1414
1415 return sk;
1416 }
1417 EXPORT_SYMBOL(sk_alloc);
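/*
 * Hedged sketch of how an address family's ->create() typically pairs
 * sk_alloc() with sock_init_data(); PF_EXAMPLE, example_proto and
 * example_sock_destruct are placeholders, not real symbols:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = example_sock_destruct;
 */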
1418
1419 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1420 * grace period. This is the case for UDP sockets and TCP listeners.
1421 */
1422 static void __sk_destruct(struct rcu_head *head)
1423 {
1424 struct sock *sk = container_of(head, struct sock, sk_rcu);
1425 struct sk_filter *filter;
1426
1427 if (sk->sk_destruct)
1428 sk->sk_destruct(sk);
1429
1430 filter = rcu_dereference_check(sk->sk_filter,
1431 atomic_read(&sk->sk_wmem_alloc) == 0);
1432 if (filter) {
1433 sk_filter_uncharge(sk, filter);
1434 RCU_INIT_POINTER(sk->sk_filter, NULL);
1435 }
1436 if (rcu_access_pointer(sk->sk_reuseport_cb))
1437 reuseport_detach_sock(sk);
1438
1439 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1440
1441 if (atomic_read(&sk->sk_omem_alloc))
1442 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1443 __func__, atomic_read(&sk->sk_omem_alloc));
1444
1445 if (sk->sk_frag.page) {
1446 put_page(sk->sk_frag.page);
1447 sk->sk_frag.page = NULL;
1448 }
1449
1450 if (sk->sk_peer_cred)
1451 put_cred(sk->sk_peer_cred);
1452 put_pid(sk->sk_peer_pid);
1453 if (likely(sk->sk_net_refcnt))
1454 put_net(sock_net(sk));
1455 sk_prot_free(sk->sk_prot_creator, sk);
1456 }
1457
1458 void sk_destruct(struct sock *sk)
1459 {
1460 if (sock_flag(sk, SOCK_RCU_FREE))
1461 call_rcu(&sk->sk_rcu, __sk_destruct);
1462 else
1463 __sk_destruct(&sk->sk_rcu);
1464 }
1465
1466 static void __sk_free(struct sock *sk)
1467 {
1468 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1469 sock_diag_broadcast_destroy(sk);
1470 else
1471 sk_destruct(sk);
1472 }
1473
1474 void sk_free(struct sock *sk)
1475 {
1476 /*
1477 	 * We subtract one from sk_wmem_alloc so we can tell whether
1478 	 * some packets are still in some tx queue.
1479 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1480 */
1481 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1482 __sk_free(sk);
1483 }
1484 EXPORT_SYMBOL(sk_free);
1485
1486 /**
1487 * sk_clone_lock - clone a socket, and lock its clone
1488 * @sk: the socket to clone
1489 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490 *
1491 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492 */
1493 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494 {
1495 struct sock *newsk;
1496 bool is_charged = true;
1497
1498 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1499 if (newsk != NULL) {
1500 struct sk_filter *filter;
1501
1502 sock_copy(newsk, sk);
1503
1504 newsk->sk_prot_creator = sk->sk_prot;
1505
1506 /* SANITY */
1507 if (likely(newsk->sk_net_refcnt))
1508 get_net(sock_net(newsk));
1509 sk_node_init(&newsk->sk_node);
1510 sock_lock_init(newsk);
1511 bh_lock_sock(newsk);
1512 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1513 newsk->sk_backlog.len = 0;
1514
1515 atomic_set(&newsk->sk_rmem_alloc, 0);
1516 /*
1517 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1518 */
1519 atomic_set(&newsk->sk_wmem_alloc, 1);
1520 atomic_set(&newsk->sk_omem_alloc, 0);
1521 skb_queue_head_init(&newsk->sk_receive_queue);
1522 skb_queue_head_init(&newsk->sk_write_queue);
1523
1524 rwlock_init(&newsk->sk_callback_lock);
1525 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1526 af_callback_keys + newsk->sk_family,
1527 af_family_clock_key_strings[newsk->sk_family]);
1528
1529 newsk->sk_dst_cache = NULL;
1530 newsk->sk_wmem_queued = 0;
1531 newsk->sk_forward_alloc = 0;
1532 atomic_set(&newsk->sk_drops, 0);
1533 newsk->sk_send_head = NULL;
1534 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1535
1536 sock_reset_flag(newsk, SOCK_DONE);
1537 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1538 skb_queue_head_init(&newsk->sk_error_queue);
1539
1540 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1541 if (filter != NULL)
1542 /* though it's an empty new sock, the charging may fail
1543 * if sysctl_optmem_max was changed between creation of
1544 * original socket and cloning
1545 */
1546 is_charged = sk_filter_charge(newsk, filter);
1547
1548 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1549 /* We need to make sure that we don't uncharge the new
1550 * socket if we couldn't charge it in the first place
1551 * as otherwise we uncharge the parent's filter.
1552 */
1553 if (!is_charged)
1554 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1555 			/* It is still a raw copy of the parent, so invalidate
1556 			 * the destructor and do a plain sk_free() */
1557 newsk->sk_destruct = NULL;
1558 bh_unlock_sock(newsk);
1559 sk_free(newsk);
1560 newsk = NULL;
1561 goto out;
1562 }
1563 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1564
1565 newsk->sk_err = 0;
1566 newsk->sk_err_soft = 0;
1567 newsk->sk_priority = 0;
1568 newsk->sk_incoming_cpu = raw_smp_processor_id();
1569 atomic64_set(&newsk->sk_cookie, 0);
1570
1571 mem_cgroup_sk_alloc(newsk);
1572 /*
1573 * Before updating sk_refcnt, we must commit prior changes to memory
1574 * (Documentation/RCU/rculist_nulls.txt for details)
1575 */
1576 smp_wmb();
1577 atomic_set(&newsk->sk_refcnt, 2);
1578
1579 /*
1580 * Increment the counter in the same struct proto as the master
1581 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1582 * is the same as sk->sk_prot->socks, as this field was copied
1583 * with memcpy).
1584 *
1585 * This _changes_ the previous behaviour, where
1586 		 * tcp_create_openreq_child was always incrementing the
1587 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1588 		 * to be taken into account in all callers. -acme
1589 */
1590 sk_refcnt_debug_inc(newsk);
1591 sk_set_socket(newsk, NULL);
1592 newsk->sk_wq = NULL;
1593
1594 if (newsk->sk_prot->sockets_allocated)
1595 sk_sockets_allocated_inc(newsk);
1596
1597 if (sock_needs_netstamp(sk) &&
1598 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1599 net_enable_timestamp();
1600 }
1601 out:
1602 return newsk;
1603 }
1604 EXPORT_SYMBOL_GPL(sk_clone_lock);
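/*
 * Hedged caller sketch: as the kernel-doc above notes, the clone is returned
 * with bh_lock_sock() held, so the caller must unlock it even on its own
 * error paths:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific setup of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */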
1605
1606 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1607 {
1608 u32 max_segs = 1;
1609
1610 sk_dst_set(sk, dst);
1611 sk->sk_route_caps = dst->dev->features;
1612 if (sk->sk_route_caps & NETIF_F_GSO)
1613 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1614 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1615 if (sk_can_gso(sk)) {
1616 if (dst->header_len) {
1617 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1618 } else {
1619 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1620 sk->sk_gso_max_size = dst->dev->gso_max_size;
1621 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1622 }
1623 }
1624 sk->sk_gso_max_segs = max_segs;
1625 }
1626 EXPORT_SYMBOL_GPL(sk_setup_caps);
1627
1628 /*
1629 * Simple resource managers for sockets.
1630 */
1631
1632
1633 /*
1634 * Write buffer destructor automatically called from kfree_skb.
1635 */
1636 void sock_wfree(struct sk_buff *skb)
1637 {
1638 struct sock *sk = skb->sk;
1639 unsigned int len = skb->truesize;
1640
1641 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1642 /*
1643 * Keep a reference on sk_wmem_alloc, this will be released
1644 * after sk_write_space() call
1645 */
1646 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1647 sk->sk_write_space(sk);
1648 len = 1;
1649 }
1650 /*
1651 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1652 * could not do because of in-flight packets
1653 */
1654 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1655 __sk_free(sk);
1656 }
1657 EXPORT_SYMBOL(sock_wfree);
1658
1659 /* This variant of sock_wfree() is used by TCP,
1660 * since it sets SOCK_USE_WRITE_QUEUE.
1661 */
1662 void __sock_wfree(struct sk_buff *skb)
1663 {
1664 struct sock *sk = skb->sk;
1665
1666 if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1667 __sk_free(sk);
1668 }
1669
1670 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1671 {
1672 skb_orphan(skb);
1673 skb->sk = sk;
1674 #ifdef CONFIG_INET
1675 if (unlikely(!sk_fullsock(sk))) {
1676 skb->destructor = sock_edemux;
1677 sock_hold(sk);
1678 return;
1679 }
1680 #endif
1681 skb->destructor = sock_wfree;
1682 skb_set_hash_from_sk(skb, sk);
1683 /*
1684 	 * We used to take a refcount on sk, but the following operation
1685 	 * is enough to guarantee sk_free() won't free this sock until
1686 	 * all in-flight packets are completed
1687 */
1688 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1689 }
1690 EXPORT_SYMBOL(skb_set_owner_w);
1691
1692 /* This helper is used by netem, as it can hold packets in its
1693 * delay queue. We want to allow the owner socket to send more
1694 * packets, as if they were already TX completed by a typical driver.
1695 * But we also want to keep skb->sk set because some packet schedulers
1696 * rely on it (sch_fq for example).
1697 */
1698 void skb_orphan_partial(struct sk_buff *skb)
1699 {
1700 if (skb_is_tcp_pure_ack(skb))
1701 return;
1702
1703 if (skb->destructor == sock_wfree
1704 #ifdef CONFIG_INET
1705 || skb->destructor == tcp_wfree
1706 #endif
1707 ) {
1708 struct sock *sk = skb->sk;
1709
1710 if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1711 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1712 skb->destructor = sock_efree;
1713 }
1714 } else {
1715 skb_orphan(skb);
1716 }
1717 }
1718 EXPORT_SYMBOL(skb_orphan_partial);
1719
1720 /*
1721 * Read buffer destructor automatically called from kfree_skb.
1722 */
1723 void sock_rfree(struct sk_buff *skb)
1724 {
1725 struct sock *sk = skb->sk;
1726 unsigned int len = skb->truesize;
1727
1728 atomic_sub(len, &sk->sk_rmem_alloc);
1729 sk_mem_uncharge(sk, len);
1730 }
1731 EXPORT_SYMBOL(sock_rfree);
1732
1733 /*
1734 * Buffer destructor for skbs that are not used directly in read or write
1735 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1736 */
1737 void sock_efree(struct sk_buff *skb)
1738 {
1739 sock_put(skb->sk);
1740 }
1741 EXPORT_SYMBOL(sock_efree);
1742
1743 kuid_t sock_i_uid(struct sock *sk)
1744 {
1745 kuid_t uid;
1746
1747 read_lock_bh(&sk->sk_callback_lock);
1748 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1749 read_unlock_bh(&sk->sk_callback_lock);
1750 return uid;
1751 }
1752 EXPORT_SYMBOL(sock_i_uid);
1753
1754 unsigned long sock_i_ino(struct sock *sk)
1755 {
1756 unsigned long ino;
1757
1758 read_lock_bh(&sk->sk_callback_lock);
1759 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1760 read_unlock_bh(&sk->sk_callback_lock);
1761 return ino;
1762 }
1763 EXPORT_SYMBOL(sock_i_ino);
1764
1765 /*
1766 * Allocate a skb from the socket's send buffer.
1767 */
1768 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1769 gfp_t priority)
1770 {
1771 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1772 struct sk_buff *skb = alloc_skb(size, priority);
1773 if (skb) {
1774 skb_set_owner_w(skb, sk);
1775 return skb;
1776 }
1777 }
1778 return NULL;
1779 }
1780 EXPORT_SYMBOL(sock_wmalloc);
1781
1782 /*
1783 * Allocate a memory block from the socket's option memory buffer.
1784 */
1785 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1786 {
1787 if ((unsigned int)size <= sysctl_optmem_max &&
1788 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1789 void *mem;
1790 /* First do the add, to avoid the race if kmalloc
1791 * might sleep.
1792 */
1793 atomic_add(size, &sk->sk_omem_alloc);
1794 mem = kmalloc(size, priority);
1795 if (mem)
1796 return mem;
1797 atomic_sub(size, &sk->sk_omem_alloc);
1798 }
1799 return NULL;
1800 }
1801 EXPORT_SYMBOL(sock_kmalloc);
1802
1803 /* Free an option memory block. Note, we actually want the inline
1804 * here as this allows gcc to detect the nullify and fold away the
1805 * condition entirely.
1806 */
1807 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1808 const bool nullify)
1809 {
1810 if (WARN_ON_ONCE(!mem))
1811 return;
1812 if (nullify)
1813 kzfree(mem);
1814 else
1815 kfree(mem);
1816 atomic_sub(size, &sk->sk_omem_alloc);
1817 }
1818
1819 void sock_kfree_s(struct sock *sk, void *mem, int size)
1820 {
1821 __sock_kfree_s(sk, mem, size, false);
1822 }
1823 EXPORT_SYMBOL(sock_kfree_s);
1824
1825 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1826 {
1827 __sock_kfree_s(sk, mem, size, true);
1828 }
1829 EXPORT_SYMBOL(sock_kzfree_s);
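/*
 * Hedged usage sketch: option memory is charged to sk_omem_alloc, so the
 * same size must be passed back on free; sock_kzfree_s() is the variant for
 * buffers that held sensitive data such as keys:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	... use buf ...
 *	sock_kfree_s(sk, buf, len);
 */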
1830
1831 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1832    I think these locks should be removed for datagram sockets.
1833 */
1834 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1835 {
1836 DEFINE_WAIT(wait);
1837
1838 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1839 for (;;) {
1840 if (!timeo)
1841 break;
1842 if (signal_pending(current))
1843 break;
1844 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1845 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1846 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1847 break;
1848 if (sk->sk_shutdown & SEND_SHUTDOWN)
1849 break;
1850 if (sk->sk_err)
1851 break;
1852 timeo = schedule_timeout(timeo);
1853 }
1854 finish_wait(sk_sleep(sk), &wait);
1855 return timeo;
1856 }
1857
1858
1859 /*
1860 * Generic send/receive buffer handlers
1861 */
1862
1863 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1864 unsigned long data_len, int noblock,
1865 int *errcode, int max_page_order)
1866 {
1867 struct sk_buff *skb;
1868 long timeo;
1869 int err;
1870
1871 timeo = sock_sndtimeo(sk, noblock);
1872 for (;;) {
1873 err = sock_error(sk);
1874 if (err != 0)
1875 goto failure;
1876
1877 err = -EPIPE;
1878 if (sk->sk_shutdown & SEND_SHUTDOWN)
1879 goto failure;
1880
1881 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1882 break;
1883
1884 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1885 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1886 err = -EAGAIN;
1887 if (!timeo)
1888 goto failure;
1889 if (signal_pending(current))
1890 goto interrupted;
1891 timeo = sock_wait_for_wmem(sk, timeo);
1892 }
1893 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1894 errcode, sk->sk_allocation);
1895 if (skb)
1896 skb_set_owner_w(skb, sk);
1897 return skb;
1898
1899 interrupted:
1900 err = sock_intr_errno(timeo);
1901 failure:
1902 *errcode = err;
1903 return NULL;
1904 }
1905 EXPORT_SYMBOL(sock_alloc_send_pskb);
1906
1907 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1908 int noblock, int *errcode)
1909 {
1910 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1911 }
1912 EXPORT_SYMBOL(sock_alloc_send_skb);
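/*
 * Hedged sketch of a datagram sendmsg path using the helper above; the
 * noblock argument usually comes from MSG_DONTWAIT:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;	-- err already holds -EAGAIN, -EPIPE, ...
 *	skb_reserve(skb, hlen);
 */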
1913
1914 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1915 struct sockcm_cookie *sockc)
1916 {
1917 u32 tsflags;
1918
1919 switch (cmsg->cmsg_type) {
1920 case SO_MARK:
1921 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1922 return -EPERM;
1923 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1924 return -EINVAL;
1925 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1926 break;
1927 case SO_TIMESTAMPING:
1928 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1929 return -EINVAL;
1930
1931 tsflags = *(u32 *)CMSG_DATA(cmsg);
1932 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1933 return -EINVAL;
1934
1935 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1936 sockc->tsflags |= tsflags;
1937 break;
1938 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1939 case SCM_RIGHTS:
1940 case SCM_CREDENTIALS:
1941 break;
1942 default:
1943 return -EINVAL;
1944 }
1945 return 0;
1946 }
1947 EXPORT_SYMBOL(__sock_cmsg_send);
1948
1949 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1950 struct sockcm_cookie *sockc)
1951 {
1952 struct cmsghdr *cmsg;
1953 int ret;
1954
1955 for_each_cmsghdr(cmsg, msg) {
1956 if (!CMSG_OK(msg, cmsg))
1957 return -EINVAL;
1958 if (cmsg->cmsg_level != SOL_SOCKET)
1959 continue;
1960 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1961 if (ret)
1962 return ret;
1963 }
1964 return 0;
1965 }
1966 EXPORT_SYMBOL(sock_cmsg_send);
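
/* Illustrative sketch (assumption, not part of this file): a protocol
 * sendmsg handler typically seeds a sockcm_cookie from the socket's own
 * defaults and then lets sock_cmsg_send() apply any SOL_SOCKET control
 * messages supplied by the caller:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *	int err;
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */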

/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
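
/* Illustrative sketch (assumption): a stream protocol's sendmsg path can
 * use the per-socket page_frag to coalesce small user writes. The loop
 * below is only a simplified outline of how callers such as TCP use this
 * facility; the error labels are placeholders:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
 *			   copy, &msg->msg_iter) != copy)
 *		goto do_error;
 *	pfrag->offset += copy;
 */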

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Zeroing the backlog length here guarantees that we cannot loop
	 * forever while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 * @skb: last skb seen on sk_receive_queue
 *
 * Socket state, including sk->sk_err, is now changed only under the socket
 * lock, hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
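
/* Illustrative sketch (assumption): a simple recvmsg handler can pair
 * sk_wait_data() with its receive queue while holding the socket lock.
 * Error handling is trimmed; a real handler must also check sk->sk_err
 * and bail out with sock_intr_errno(timeo) when a signal is pending:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			goto out_err;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	...
 *	release_sock(sk);
 */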

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee a minimum buffer size under pressure. */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if the socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
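
/* Illustrative sketch (assumption): protocols normally reach
 * __sk_mem_schedule() through the sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers, which only fall back to it when sk_forward_alloc cannot
 * already cover the charge, e.g. on a receive path:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 *		atomic_inc(&sk->sk_drops);
 *		kfree_skb(skb);
 *		return -ENOBUFS;
 *	}
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */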

/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk_memory_allocated_sub(sk, amount);
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	if (val < 0)
		return -EINVAL;

	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
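
/* Illustrative sketch (assumption): a family that does not implement every
 * operation can wire the sock_no_*() stubs above into its proto_ops. The
 * PF_FOO family and the foo_*() handlers are hypothetical names used only
 * for this example:
 *
 *	static const struct proto_ops foo_proto_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.release	= foo_release,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= foo_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= foo_sendmsg,
 *		.recvmsg	= foo_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */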

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
							POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
		sk->sk_uid = SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq = NULL;
		sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
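
/* Illustrative sketch (assumption): a protocol's create/init hook calls
 * sock_init_data() first and may then override individual default
 * callbacks. foo_data_ready() and foo_destruct() are hypothetical:
 *
 *	sock_init_data(sock, sk);
 *
 *	sk->sk_data_ready = foo_data_ready;
 *	sk->sk_destruct   = foo_destruct;
 *	sk->sk_allocation = GFP_ATOMIC;
 */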

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning: release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
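
/* Illustrative sketch (assumption): the usual pattern in process context
 * is to bracket any code that mutates socket state with lock_sock() and
 * release_sock(); packets queued to the backlog while the lock was owned
 * are then processed by __release_sock() on the way out:
 *
 *	lock_sock(sk);
 *	... modify sk state, walk sk->sk_receive_queue, etc. ...
 *	release_sock(sk);
 */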

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return false if the fast path is taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Return true if the slow path is taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note: BH stays disabled on this fast path.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
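
/* Illustrative sketch (assumption): callers pair lock_sock_fast() with
 * unlock_sock_fast(), passing back the returned "slow" flag so the
 * matching unlock variant is used:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking critical section ...
 *
 *	unlock_sock_fast(sk, slow);
 */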

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
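
/* Illustrative sketch (assumption): a protocol recvmsg handler can hand
 * MSG_ERRQUEUE requests straight to the helper above; the level/type pair
 * below follows the packet-socket usage and is shown only as an example:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_PACKET, PACKET_TX_TIMESTAMP);
 */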

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network stack still does.
	 *
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
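
/* Illustrative sketch (assumption): a module providing a protocol registers
 * its struct proto at init time and unregisters it on exit. foo_prot,
 * struct foo_sock and the module hooks are hypothetical names:
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 */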

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start = proto_seq_start,
	.next  = proto_seq_next,
	.stop  = proto_seq_stop,
	.show  = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = proto_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */