1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119
120 #include <asm/uaccess.h>
121
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133
134 #include <linux/filter.h>
135
136 #include <trace/events/sock.h>
137
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141
/* Held around every walk of proto_list in the mem_cgroup_sockets_* helpers. */
142 static DEFINE_MUTEX(proto_list_mutex);
/* List head; struct proto entries are linked in through their 'node' member. */
143 static LIST_HEAD(proto_list);
144
145 #ifdef CONFIG_MEMCG_KMEM
mem_cgroup_sockets_init(struct mem_cgroup * memcg,struct cgroup_subsys * ss)146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147 {
148 struct proto *proto;
149 int ret = 0;
150
151 mutex_lock(&proto_list_mutex);
152 list_for_each_entry(proto, &proto_list, node) {
153 if (proto->init_cgroup) {
154 ret = proto->init_cgroup(memcg, ss);
155 if (ret)
156 goto out;
157 }
158 }
159
160 mutex_unlock(&proto_list_mutex);
161 return ret;
162 out:
163 list_for_each_entry_continue_reverse(proto, &proto_list, node)
164 if (proto->destroy_cgroup)
165 proto->destroy_cgroup(memcg);
166 mutex_unlock(&proto_list_mutex);
167 return ret;
168 }
169
mem_cgroup_sockets_destroy(struct mem_cgroup * memcg)170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171 {
172 struct proto *proto;
173
174 mutex_lock(&proto_list_mutex);
175 list_for_each_entry_reverse(proto, &proto_list, node)
176 if (proto->destroy_cgroup)
177 proto->destroy_cgroup(memcg);
178 mutex_unlock(&proto_list_mutex);
179 }
180 #endif
181
182 /*
183 * Each address family might have different locking rules, so we have
184 * one slock key per address family:
185 */
/* Both arrays hold AF_MAX entries, i.e. one lock class key per family. */
186 static struct lock_class_key af_family_keys[AF_MAX];
187 static struct lock_class_key af_family_slock_keys[AF_MAX];
188
189 #if defined(CONFIG_MEMCG_KMEM)
/* Static key, exported with EXPORT_SYMBOL for use outside this file. */
190 struct static_key memcg_socket_limit_enabled;
191 EXPORT_SYMBOL(memcg_socket_limit_enabled);
192 #endif
193
194 /*
195 * Make lock validator output more readable. (we pre-construct these
196 * strings build-time, so that runtime initialization of socket
197 * locks is fast):
198 */
/*
 * "sk_lock-<family>" names, one per address family plus a trailing
 * AF_MAX entry.  The position in the table is the numeric family value;
 * families 27 and 28 have no symbolic name here.
 *
 * The AF_IUCV entry carries the same "AF_" prefix as every other named
 * entry and as the matching slock-/clock- tables below.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
/*
 * "slock-<family>" names, one per address family plus a trailing AF_MAX
 * entry.  The number in each trailing comment is the entry's index, which
 * is the numeric family value.
 */
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC",	/*  0 */
	"slock-AF_UNIX",	/*  1 */
	"slock-AF_INET",	/*  2 */
	"slock-AF_AX25",	/*  3 */
	"slock-AF_IPX",		/*  4 */
	"slock-AF_APPLETALK",	/*  5 */
	"slock-AF_NETROM",	/*  6 */
	"slock-AF_BRIDGE",	/*  7 */
	"slock-AF_ATMPVC",	/*  8 */
	"slock-AF_X25",		/*  9 */
	"slock-AF_INET6",	/* 10 */
	"slock-AF_ROSE",	/* 11 */
	"slock-AF_DECnet",	/* 12 */
	"slock-AF_NETBEUI",	/* 13 */
	"slock-AF_SECURITY",	/* 14 */
	"slock-AF_KEY",		/* 15 */
	"slock-AF_NETLINK",	/* 16 */
	"slock-AF_PACKET",	/* 17 */
	"slock-AF_ASH",		/* 18 */
	"slock-AF_ECONET",	/* 19 */
	"slock-AF_ATMSVC",	/* 20 */
	"slock-AF_RDS",		/* 21 */
	"slock-AF_SNA",		/* 22 */
	"slock-AF_IRDA",	/* 23 */
	"slock-AF_PPPOX",	/* 24 */
	"slock-AF_WANPIPE",	/* 25 */
	"slock-AF_LLC",		/* 26 */
	"slock-27",		/* 27 */
	"slock-28",		/* 28 */
	"slock-AF_CAN",		/* 29 */
	"slock-AF_TIPC",	/* 30 */
	"slock-AF_BLUETOOTH",	/* 31 */
	"slock-AF_IUCV",	/* 32 */
	"slock-AF_RXRPC",	/* 33 */
	"slock-AF_ISDN",	/* 34 */
	"slock-AF_PHONET",	/* 35 */
	"slock-AF_IEEE802154",	/* 36 */
	"slock-AF_CAIF",	/* 37 */
	"slock-AF_ALG",		/* 38 */
	"slock-AF_NFC",		/* 39 */
	"slock-AF_VSOCK",	/* 40 */
	"slock-AF_MAX"		/* 41 */
};
/*
 * "clock-<family>" names, one per address family plus a trailing AF_MAX
 * entry.  The number in each trailing comment is the entry's index, which
 * is the numeric family value.
 */
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC",	/*  0 */
	"clock-AF_UNIX",	/*  1 */
	"clock-AF_INET",	/*  2 */
	"clock-AF_AX25",	/*  3 */
	"clock-AF_IPX",		/*  4 */
	"clock-AF_APPLETALK",	/*  5 */
	"clock-AF_NETROM",	/*  6 */
	"clock-AF_BRIDGE",	/*  7 */
	"clock-AF_ATMPVC",	/*  8 */
	"clock-AF_X25",		/*  9 */
	"clock-AF_INET6",	/* 10 */
	"clock-AF_ROSE",	/* 11 */
	"clock-AF_DECnet",	/* 12 */
	"clock-AF_NETBEUI",	/* 13 */
	"clock-AF_SECURITY",	/* 14 */
	"clock-AF_KEY",		/* 15 */
	"clock-AF_NETLINK",	/* 16 */
	"clock-AF_PACKET",	/* 17 */
	"clock-AF_ASH",		/* 18 */
	"clock-AF_ECONET",	/* 19 */
	"clock-AF_ATMSVC",	/* 20 */
	"clock-AF_RDS",		/* 21 */
	"clock-AF_SNA",		/* 22 */
	"clock-AF_IRDA",	/* 23 */
	"clock-AF_PPPOX",	/* 24 */
	"clock-AF_WANPIPE",	/* 25 */
	"clock-AF_LLC",		/* 26 */
	"clock-27",		/* 27 */
	"clock-28",		/* 28 */
	"clock-AF_CAN",		/* 29 */
	"clock-AF_TIPC",	/* 30 */
	"clock-AF_BLUETOOTH",	/* 31 */
	"clock-AF_IUCV",	/* 32 */
	"clock-AF_RXRPC",	/* 33 */
	"clock-AF_ISDN",	/* 34 */
	"clock-AF_PHONET",	/* 35 */
	"clock-AF_IEEE802154",	/* 36 */
	"clock-AF_CAIF",	/* 37 */
	"clock-AF_ALG",		/* 38 */
	"clock-AF_NFC",		/* 39 */
	"clock-AF_VSOCK",	/* 40 */
	"clock-AF_MAX"		/* 41 */
};
247
248 /*
249 * sk_callback_lock locking rules are per-address-family,
250 * so split the lock classes by using a per-AF key:
251 */
/* AF_MAX entries: one sk_callback_lock class key per address family. */
252 static struct lock_class_key af_callback_keys[AF_MAX];
253
254 /* Take into consideration the size of the struct sk_buff overhead in the
255 * determination of these values, since that is non-constant across
256 * platforms. This makes socket queueing behavior and performance
257 * not depend upon such differences.
258 */
/* Packet count the buffer limits below are sized for. */
259 #define _SK_MEM_PACKETS 256
/* Per-packet cost: SKB_TRUESIZE() (defined elsewhere) applied to 256. */
260 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
/* Write and read limits: per-packet cost times packet count. */
261 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
262 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
263
264 /* Run time adjustable parameters. */
/* Cap applied to SO_SNDBUF requests in sock_setsockopt(). */
265 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
266 EXPORT_SYMBOL(sysctl_wmem_max);
/* Cap applied to SO_RCVBUF requests in sock_setsockopt(). */
267 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
268 EXPORT_SYMBOL(sysctl_rmem_max);
/* Defaults start out equal to the maxima; these two are not exported here. */
269 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
270 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
271
272 /* Maximal space eaten by iovec or ancillary data plus some space */
/* Initial value: sizeof(unsigned long) * (2 * UIO_MAXIOV + 512). */
273 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
274 EXPORT_SYMBOL(sysctl_optmem_max);
275
/*
 * Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
276 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
277 EXPORT_SYMBOL_GPL(memalloc_socks);
278
279 /**
280 * sk_set_memalloc - sets %SOCK_MEMALLOC
281 * @sk: socket to set it on
282 *
283 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
284 * It's the responsibility of the admin to adjust min_free_kbytes
285 * to meet the requirements
286 */
sk_set_memalloc(struct sock * sk)287 void sk_set_memalloc(struct sock *sk)
288 {
289 sock_set_flag(sk, SOCK_MEMALLOC);
290 sk->sk_allocation |= __GFP_MEMALLOC;
291 static_key_slow_inc(&memalloc_socks);
292 }
293 EXPORT_SYMBOL_GPL(sk_set_memalloc);
294
sk_clear_memalloc(struct sock * sk)295 void sk_clear_memalloc(struct sock *sk)
296 {
297 sock_reset_flag(sk, SOCK_MEMALLOC);
298 sk->sk_allocation &= ~__GFP_MEMALLOC;
299 static_key_slow_dec(&memalloc_socks);
300
301 /*
302 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
303 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
304 * it has rmem allocations there is a risk that the user of the
305 * socket cannot make forward progress due to exceeding the rmem
306 * limits. By rights, sk_clear_memalloc() should only be called
307 * on sockets being torn down but warn and reset the accounting if
308 * that assumption breaks.
309 */
310 if (WARN_ON(sk->sk_forward_alloc))
311 sk_mem_reclaim(sk);
312 }
313 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
314
__sk_backlog_rcv(struct sock * sk,struct sk_buff * skb)315 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
316 {
317 int ret;
318 unsigned long pflags = current->flags;
319
320 /* these should have been dropped before queueing */
321 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
322
323 current->flags |= PF_MEMALLOC;
324 ret = sk->sk_backlog_rcv(sk, skb);
325 tsk_restore_flags(current, pflags, PF_MEMALLOC);
326
327 return ret;
328 }
329 EXPORT_SYMBOL(__sk_backlog_rcv);
330
/*
 * Read a struct timeval from user space and store it in *@timeo_p,
 * converted to jiffies (rounded up).
 *
 *  - negative tv_sec:            stores 0, logs at most ten ratelimited notices
 *  - {0, 0}:                     stores MAX_SCHEDULE_TIMEOUT
 *  - tv_sec too large to convert: stores MAX_SCHEDULE_TIMEOUT
 *
 * Returns 0 on success, -EINVAL if @optlen is smaller than a timeval,
 * -EFAULT if the copy from user space fails, -EDOM if tv_usec is outside
 * [0, USEC_PER_SEC).
 */
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}

	if ((tv.tv_sec == 0 && tv.tv_usec == 0) ||
	    tv.tv_sec >= (MAX_SCHEDULE_TIMEOUT/HZ - 1)) {
		/* zero timeval, or too large to express in jiffies */
		*timeo_p = MAX_SCHEDULE_TIMEOUT;
		return 0;
	}

	*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
360
sock_warn_obsolete_bsdism(const char * name)361 static void sock_warn_obsolete_bsdism(const char *name)
362 {
363 static int warned;
364 static char warncomm[TASK_COMM_LEN];
365 if (strcmp(warncomm, current->comm) && warned < 5) {
366 strcpy(warncomm, current->comm);
367 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
368 warncomm, name);
369 warned++;
370 }
371 }
372
373 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
374
sock_disable_timestamp(struct sock * sk,unsigned long flags)375 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
376 {
377 if (sk->sk_flags & flags) {
378 sk->sk_flags &= ~flags;
379 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
380 net_disable_timestamp();
381 }
382 }
383
384
sock_queue_rcv_skb(struct sock * sk,struct sk_buff * skb)385 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
386 {
387 int err;
388 int skb_len;
389 unsigned long flags;
390 struct sk_buff_head *list = &sk->sk_receive_queue;
391
392 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
393 atomic_inc(&sk->sk_drops);
394 trace_sock_rcvqueue_full(sk, skb);
395 return -ENOMEM;
396 }
397
398 err = sk_filter(sk, skb);
399 if (err)
400 return err;
401
402 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
403 atomic_inc(&sk->sk_drops);
404 return -ENOBUFS;
405 }
406
407 skb->dev = NULL;
408 skb_set_owner_r(skb, sk);
409
410 /* Cache the SKB length before we tack it onto the receive
411 * queue. Once it is added it no longer belongs to us and
412 * may be freed by other threads of control pulling packets
413 * from the queue.
414 */
415 skb_len = skb->len;
416
417 /* we escape from rcu protected region, make sure we dont leak
418 * a norefcounted dst
419 */
420 skb_dst_force(skb);
421
422 spin_lock_irqsave(&list->lock, flags);
423 skb->dropcount = atomic_read(&sk->sk_drops);
424 __skb_queue_tail(list, skb);
425 spin_unlock_irqrestore(&list->lock, flags);
426
427 if (!sock_flag(sk, SOCK_DEAD))
428 sk->sk_data_ready(sk, skb_len);
429 return 0;
430 }
431 EXPORT_SYMBOL(sock_queue_rcv_skb);
432
sk_receive_skb(struct sock * sk,struct sk_buff * skb,const int nested)433 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
434 {
435 int rc = NET_RX_SUCCESS;
436
437 if (sk_filter(sk, skb))
438 goto discard_and_relse;
439
440 skb->dev = NULL;
441
442 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
443 atomic_inc(&sk->sk_drops);
444 goto discard_and_relse;
445 }
446 if (nested)
447 bh_lock_sock_nested(sk);
448 else
449 bh_lock_sock(sk);
450 if (!sock_owned_by_user(sk)) {
451 /*
452 * trylock + unlock semantics:
453 */
454 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
455
456 rc = sk_backlog_rcv(sk, skb);
457
458 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
459 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
460 bh_unlock_sock(sk);
461 atomic_inc(&sk->sk_drops);
462 goto discard_and_relse;
463 }
464
465 bh_unlock_sock(sk);
466 out:
467 sock_put(sk);
468 return rc;
469 discard_and_relse:
470 kfree_skb(skb);
471 goto out;
472 }
473 EXPORT_SYMBOL(sk_receive_skb);
474
/* Exported out-of-line wrapper around sk_tx_queue_clear(). */
void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);
480
/*
 * Return the dst obtained from __sk_dst_get(), or NULL if there is none
 * or it is no longer usable.
 *
 * A dst marked obsolete is passed to its ops->check() with @cookie; when
 * that returns NULL, the tx queue mapping is cleared, sk_dst_cache is set
 * to NULL, the dst is released and NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *cached = __sk_dst_get(sk);

	if (!cached || !cached->obsolete)
		return cached;

	if (cached->ops->check(cached, cookie) != NULL)
		return cached;

	sk_tx_queue_clear(sk);
	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
	dst_release(cached);
	return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);
495
/*
 * Return the dst obtained from sk_dst_get(), or NULL if there is none or
 * it is no longer usable.
 *
 * A dst marked obsolete is passed to its ops->check() with @cookie; when
 * that returns NULL, sk_dst_reset() is called, the dst is released and
 * NULL is returned.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *cached = sk_dst_get(sk);

	if (!cached || !cached->obsolete)
		return cached;

	if (cached->ops->check(cached, cookie) != NULL)
		return cached;

	sk_dst_reset(sk);
	dst_release(cached);
	return NULL;
}
EXPORT_SYMBOL(sk_dst_check);
509
/*
 * SO_BINDTODEVICE setter: bind @sk to the interface named in @optval.
 *
 * At most IFNAMSIZ - 1 bytes of the name are read.  An empty name (or a
 * zero @optlen) stores interface index 0.  Requires CAP_NET_RAW in the
 * user namespace of the socket's network namespace.
 *
 * Returns 0 on success, -EPERM without the capability, -EINVAL for a
 * negative @optlen, -EFAULT if the name cannot be copied in, -ENODEV if
 * no such device exists, and -ENOPROTOOPT when CONFIG_NETDEVICES is off.
 */
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int ifindex = 0;

	/* Sorry... */
	if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
		ret = -EPERM;
		goto out;
	}

	if (optlen < 0) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * The buffer is zeroed up front and the copy is capped one byte
	 * short of it, so devname is always NUL-terminated.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	if (copy_from_user(devname, optval, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	if (devname[0] != '\0') {
		struct net_device *dev;

		/* only the ifindex is taken out of the RCU section */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			ifindex = dev->ifindex;
		rcu_read_unlock();

		if (!dev) {
			ret = -ENODEV;
			goto out;
		}
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = ifindex;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
567
/*
 * SO_BINDTODEVICE getter: copy the name of the interface @sk is bound to
 * into @optval and write the number of bytes (including the NUL) to
 * @optlen.  An unbound socket reports a length of 0 and writes no name.
 *
 * @len is the buffer size supplied by the caller; for a bound socket it
 * must be at least IFNAMSIZ.
 *
 * Returns 0 on success, -EINVAL if the buffer is too small, -EFAULT if a
 * user-space write fails, the netdev_get_name() error if the lookup
 * fails, and -ENOPROTOOPT when CONFIG_NETDEVICES is off.
 */
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if != 0) {
		if (len < IFNAMSIZ) {
			ret = -EINVAL;
			goto out;
		}

		ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
		if (ret)
			goto out;

		len = strlen(devname) + 1;

		if (copy_to_user(optval, devname, len)) {
			ret = -EFAULT;
			goto out;
		}
	} else {
		/* not bound: report an empty result */
		len = 0;
	}

	if (put_user(len, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	ret = 0;

out:
#endif

	return ret;
}
607
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
615
616 /*
617 * This is meant for all protocols to use and covers goings on
618 * at the socket level. Everything here is generic.
619 */
620
/*
 * Generic socket-level setsockopt() handler.
 *
 * @sock:    socket being configured
 * @level:   not examined by this function
 * @optname: SO_* option to set
 * @optval:  user pointer to the option value
 * @optlen:  size of the value at @optval
 *
 * SO_BINDTODEVICE is handed to sock_setbindtodevice() before anything
 * else.  Every other option needs at least sizeof(int) bytes at @optval;
 * the leading int is read into 'val' ('valbool' is its truth value) and
 * the option is applied with the socket locked.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a short or
 * invalid value, -EFAULT for an unreadable @optval, -EPERM / -EACCES for
 * a missing capability, -EOPNOTSUPP for SO_PEEK_OFF without a
 * set_peek_off handler, -ENOPROTOOPT for unknown or read-only options.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	SO_BINDTODEVICE carries a name rather than an int, so it is
	 *	dispatched before the common argument handling below.
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		/* anyone may clear SO_DEBUG, setting it is privileged */
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		/* read-only options */
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, so that the doubling
		 * below cannot overflow and hand max_t() a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		/* No negative values: val is about to be multiplied by 2
		 * and a large negative value would wrap to a positive one.
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		/* Ensure val * 2 fits into an int, so that the doubling
		 * below cannot overflow and hand max_t() a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		/* No negative values: val is about to be multiplied by 2
		 * and a large negative value would wrap to a positive one.
		 */
		if (val < 0)
			val = 0;
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		/* priorities 0..6 are open to everyone */
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			/* SOCK_RCVTSTAMPNS tells the two options apart */
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		/* negative means "as much as possible", zero becomes 1 */
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		/* once locked, the filter lock cannot be removed */
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
909
910
cred_to_ucred(struct pid * pid,const struct cred * cred,struct ucred * ucred)911 void cred_to_ucred(struct pid *pid, const struct cred *cred,
912 struct ucred *ucred)
913 {
914 ucred->pid = pid_vnr(pid);
915 ucred->uid = ucred->gid = -1;
916 if (cred) {
917 struct user_namespace *current_ns = current_user_ns();
918
919 ucred->uid = from_kuid_munged(current_ns, cred->euid);
920 ucred->gid = from_kgid_munged(current_ns, cred->egid);
921 }
922 }
923 EXPORT_SYMBOL_GPL(cred_to_ucred);
924
sock_getsockopt(struct socket * sock,int level,int optname,char __user * optval,int __user * optlen)925 int sock_getsockopt(struct socket *sock, int level, int optname,
926 char __user *optval, int __user *optlen)
927 {
928 struct sock *sk = sock->sk;
929
930 union {
931 int val;
932 struct linger ling;
933 struct timeval tm;
934 } v;
935
936 int lv = sizeof(int);
937 int len;
938
939 if (get_user(len, optlen))
940 return -EFAULT;
941 if (len < 0)
942 return -EINVAL;
943
944 memset(&v, 0, sizeof(v));
945
946 switch (optname) {
947 case SO_DEBUG:
948 v.val = sock_flag(sk, SOCK_DBG);
949 break;
950
951 case SO_DONTROUTE:
952 v.val = sock_flag(sk, SOCK_LOCALROUTE);
953 break;
954
955 case SO_BROADCAST:
956 v.val = sock_flag(sk, SOCK_BROADCAST);
957 break;
958
959 case SO_SNDBUF:
960 v.val = sk->sk_sndbuf;
961 break;
962
963 case SO_RCVBUF:
964 v.val = sk->sk_rcvbuf;
965 break;
966
967 case SO_REUSEADDR:
968 v.val = sk->sk_reuse;
969 break;
970
971 case SO_REUSEPORT:
972 v.val = sk->sk_reuseport;
973 break;
974
975 case SO_KEEPALIVE:
976 v.val = sock_flag(sk, SOCK_KEEPOPEN);
977 break;
978
979 case SO_TYPE:
980 v.val = sk->sk_type;
981 break;
982
983 case SO_PROTOCOL:
984 v.val = sk->sk_protocol;
985 break;
986
987 case SO_DOMAIN:
988 v.val = sk->sk_family;
989 break;
990
991 case SO_ERROR:
992 v.val = -sock_error(sk);
993 if (v.val == 0)
994 v.val = xchg(&sk->sk_err_soft, 0);
995 break;
996
997 case SO_OOBINLINE:
998 v.val = sock_flag(sk, SOCK_URGINLINE);
999 break;
1000
1001 case SO_NO_CHECK:
1002 v.val = sk->sk_no_check;
1003 break;
1004
1005 case SO_PRIORITY:
1006 v.val = sk->sk_priority;
1007 break;
1008
1009 case SO_LINGER:
1010 lv = sizeof(v.ling);
1011 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1012 v.ling.l_linger = sk->sk_lingertime / HZ;
1013 break;
1014
1015 case SO_BSDCOMPAT:
1016 sock_warn_obsolete_bsdism("getsockopt");
1017 break;
1018
1019 case SO_TIMESTAMP:
1020 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1021 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1022 break;
1023
1024 case SO_TIMESTAMPNS:
1025 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1026 break;
1027
1028 case SO_TIMESTAMPING:
1029 v.val = 0;
1030 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1031 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1032 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1033 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1034 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1035 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1036 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1037 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1038 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1039 v.val |= SOF_TIMESTAMPING_SOFTWARE;
1040 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1041 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1042 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1043 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1044 break;
1045
1046 case SO_RCVTIMEO:
1047 lv = sizeof(struct timeval);
1048 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1049 v.tm.tv_sec = 0;
1050 v.tm.tv_usec = 0;
1051 } else {
1052 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1053 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1054 }
1055 break;
1056
1057 case SO_SNDTIMEO:
1058 lv = sizeof(struct timeval);
1059 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1060 v.tm.tv_sec = 0;
1061 v.tm.tv_usec = 0;
1062 } else {
1063 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1064 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1065 }
1066 break;
1067
1068 case SO_RCVLOWAT:
1069 v.val = sk->sk_rcvlowat;
1070 break;
1071
1072 case SO_SNDLOWAT:
1073 v.val = 1;
1074 break;
1075
1076 case SO_PASSCRED:
1077 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1078 break;
1079
1080 case SO_PEERCRED:
1081 {
1082 struct ucred peercred;
1083 if (len > sizeof(peercred))
1084 len = sizeof(peercred);
1085 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1086 if (copy_to_user(optval, &peercred, len))
1087 return -EFAULT;
1088 goto lenout;
1089 }
1090
1091 case SO_PEERNAME:
1092 {
1093 char address[128];
1094
1095 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1096 return -ENOTCONN;
1097 if (lv < len)
1098 return -EINVAL;
1099 if (copy_to_user(optval, address, len))
1100 return -EFAULT;
1101 goto lenout;
1102 }
1103
1104 /* Dubious BSD thing... Probably nobody even uses it, but
1105 * the UNIX standard wants it for whatever reason... -DaveM
1106 */
1107 case SO_ACCEPTCONN:
1108 v.val = sk->sk_state == TCP_LISTEN;
1109 break;
1110
1111 case SO_PASSSEC:
1112 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1113 break;
1114
1115 case SO_PEERSEC:
1116 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1117
1118 case SO_MARK:
1119 v.val = sk->sk_mark;
1120 break;
1121
1122 case SO_RXQ_OVFL:
1123 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1124 break;
1125
1126 case SO_WIFI_STATUS:
1127 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1128 break;
1129
1130 case SO_PEEK_OFF:
1131 if (!sock->ops->set_peek_off)
1132 return -EOPNOTSUPP;
1133
1134 v.val = sk->sk_peek_off;
1135 break;
1136 case SO_NOFCS:
1137 v.val = sock_flag(sk, SOCK_NOFCS);
1138 break;
1139
1140 case SO_BINDTODEVICE:
1141 return sock_getbindtodevice(sk, optval, optlen, len);
1142
1143 case SO_GET_FILTER:
1144 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1145 if (len < 0)
1146 return len;
1147
1148 goto lenout;
1149
1150 case SO_LOCK_FILTER:
1151 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1152 break;
1153
1154 case SO_SELECT_ERR_QUEUE:
1155 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1156 break;
1157
1158 default:
1159 return -ENOPROTOOPT;
1160 }
1161
1162 if (len > lv)
1163 len = lv;
1164 if (copy_to_user(optval, &v, len))
1165 return -EFAULT;
1166 lenout:
1167 if (put_user(len, optlen))
1168 return -EFAULT;
1169 return 0;
1170 }
1171
1172 /*
1173 * Initialize an sk_lock.
1174 *
1175 * (We also register the sk_lock with the lock validator.)
1176 */
sock_lock_init(struct sock * sk)1177 static inline void sock_lock_init(struct sock *sk)
1178 {
1179 sock_lock_init_class_and_name(sk,
1180 af_family_slock_key_strings[sk->sk_family],
1181 af_family_slock_keys + sk->sk_family,
1182 af_family_key_strings[sk->sk_family],
1183 af_family_keys + sk->sk_family);
1184 }
1185
1186 /*
1187 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1188 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1189 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1190 */
sock_copy(struct sock * nsk,const struct sock * osk)1191 static void sock_copy(struct sock *nsk, const struct sock *osk)
1192 {
1193 #ifdef CONFIG_SECURITY_NETWORK
1194 void *sptr = nsk->sk_security;
1195 #endif
1196 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1197
1198 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1199 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1200
1201 #ifdef CONFIG_SECURITY_NETWORK
1202 nsk->sk_security = sptr;
1203 security_sk_clone(osk, nsk);
1204 #endif
1205 }
1206
sk_prot_clear_portaddr_nulls(struct sock * sk,int size)1207 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1208 {
1209 unsigned long nulls1, nulls2;
1210
1211 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1212 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1213 if (nulls1 > nulls2)
1214 swap(nulls1, nulls2);
1215
1216 if (nulls1 != 0)
1217 memset((char *)sk, 0, nulls1);
1218 memset((char *)sk + nulls1 + sizeof(void *), 0,
1219 nulls2 - nulls1 - sizeof(void *));
1220 memset((char *)sk + nulls2 + sizeof(void *), 0,
1221 size - nulls2 - sizeof(void *));
1222 }
1223 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1224
/*
 * Allocate the memory for a sock of protocol @prot.
 *
 * The object comes from prot->slab when the protocol has one, otherwise
 * from kmalloc().  On the slab path __GFP_ZERO is stripped from the
 * allocator flags and, if it was requested, the zeroing is done through
 * prot->clear_sk() or sk_prot_clear_nulls() instead.
 *
 * On success security_sk_alloc() has been called for the sock and a
 * reference on prot->owner is held.  Returns NULL on any failure.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct kmem_cache *slab = prot->slab;
	struct sock *sk;

	if (slab == NULL) {
		sk = kmalloc(prot->obj_size, priority);
		if (sk == NULL)
			return NULL;
	} else {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (sk == NULL)
			return NULL;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	}

	kmemcheck_annotate_bitfield(sk, flags);

	if (security_sk_alloc(sk, family, priority))
		goto out_free;

	if (!try_module_get(prot->owner))
		goto out_free_sec;
	sk_tx_queue_clear(sk);

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab == NULL)
		kfree(sk);
	else
		kmem_cache_free(slab, sk);
	return NULL;
}
1267
sk_prot_free(struct proto * prot,struct sock * sk)1268 static void sk_prot_free(struct proto *prot, struct sock *sk)
1269 {
1270 struct kmem_cache *slab;
1271 struct module *owner;
1272
1273 owner = prot->owner;
1274 slab = prot->slab;
1275
1276 security_sk_free(sk);
1277 if (slab != NULL)
1278 kmem_cache_free(slab, sk);
1279 else
1280 kfree(sk);
1281 module_put(owner);
1282 }
1283
1284 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
sock_update_classid(struct sock * sk)1285 void sock_update_classid(struct sock *sk)
1286 {
1287 u32 classid;
1288
1289 classid = task_cls_classid(current);
1290 if (classid != sk->sk_classid)
1291 sk->sk_classid = classid;
1292 }
1293 EXPORT_SYMBOL(sock_update_classid);
1294 #endif
1295
1296 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
sock_update_netprioidx(struct sock * sk)1297 void sock_update_netprioidx(struct sock *sk)
1298 {
1299 if (in_interrupt())
1300 return;
1301
1302 sk->sk_cgrp_prioidx = task_netprioidx(current);
1303 }
1304 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1305 #endif
1306
1307 /**
1308 * sk_alloc - All socket objects are allocated here
1309 * @net: the applicable net namespace
1310 * @family: protocol family
1311 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1312 * @prot: struct proto associated with this new sock instance
1313 */
sk_alloc(struct net * net,int family,gfp_t priority,struct proto * prot)1314 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1315 struct proto *prot)
1316 {
1317 struct sock *sk;
1318
1319 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1320 if (sk) {
1321 sk->sk_family = family;
1322 /*
1323 * See comment in struct sock definition to understand
1324 * why we need sk_prot_creator -acme
1325 */
1326 sk->sk_prot = sk->sk_prot_creator = prot;
1327 sock_lock_init(sk);
1328 sock_net_set(sk, get_net(net));
1329 atomic_set(&sk->sk_wmem_alloc, 1);
1330
1331 sock_update_classid(sk);
1332 sock_update_netprioidx(sk);
1333 }
1334
1335 return sk;
1336 }
1337 EXPORT_SYMBOL(sk_alloc);
1338
__sk_free(struct sock * sk)1339 static void __sk_free(struct sock *sk)
1340 {
1341 struct sk_filter *filter;
1342
1343 if (sk->sk_destruct)
1344 sk->sk_destruct(sk);
1345
1346 filter = rcu_dereference_check(sk->sk_filter,
1347 atomic_read(&sk->sk_wmem_alloc) == 0);
1348 if (filter) {
1349 sk_filter_uncharge(sk, filter);
1350 RCU_INIT_POINTER(sk->sk_filter, NULL);
1351 }
1352
1353 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1354
1355 if (atomic_read(&sk->sk_omem_alloc))
1356 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1357 __func__, atomic_read(&sk->sk_omem_alloc));
1358
1359 if (sk->sk_peer_cred)
1360 put_cred(sk->sk_peer_cred);
1361 put_pid(sk->sk_peer_pid);
1362 put_net(sock_net(sk));
1363 sk_prot_free(sk->sk_prot_creator, sk);
1364 }
1365
sk_free(struct sock * sk)1366 void sk_free(struct sock *sk)
1367 {
1368 /*
1369 * We subtract one from sk_wmem_alloc and can know if
1370 * some packets are still in some tx queue.
1371 * If not null, sock_wfree() will call __sk_free(sk) later
1372 */
1373 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1374 __sk_free(sk);
1375 }
1376 EXPORT_SYMBOL(sk_free);
1377
1378 /*
1379 * Last sock_put should drop reference to sk->sk_net. It has already
1380 * been dropped in sk_change_net. Taking reference to stopping namespace
1381 * is not an option.
1382 * Take reference to a socket to remove it from hash _alive_ and after that
1383 * destroy it in the context of init_net.
1384 */
sk_release_kernel(struct sock * sk)1385 void sk_release_kernel(struct sock *sk)
1386 {
1387 if (sk == NULL || sk->sk_socket == NULL)
1388 return;
1389
1390 sock_hold(sk);
1391 sock_release(sk->sk_socket);
1392 release_net(sock_net(sk));
1393 sock_net_set(sk, get_net(&init_net));
1394 sock_put(sk);
1395 }
1396 EXPORT_SYMBOL(sk_release_kernel);
1397
sk_update_clone(const struct sock * sk,struct sock * newsk)1398 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1399 {
1400 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1401 sock_update_memcg(newsk);
1402 }
1403
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *	Returns the clone with its bh lock held, or NULL when the allocation
 *	or xfrm_sk_clone_policy() failed (the clone is then already unlocked
 *	and freed).
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		/* Raw copy of the parent; the fields below are then reset. */
		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		/* The clone starts with an empty backlog. */
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		/* Locks were memcpy'd from the parent: re-initialise them. */
		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		/* Inherit the user locks except SOCK_BINDPORT_LOCK. */
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		/* The filter pointer was copied; charge it to the clone too. */
		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		/* The clone is not attached to any struct socket yet. */
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		/* Timestamp flags were inherited through the copied sk_flags. */
		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
1505 EXPORT_SYMBOL_GPL(sk_clone_lock);
1506
sk_setup_caps(struct sock * sk,struct dst_entry * dst)1507 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1508 {
1509 __sk_dst_set(sk, dst);
1510 sk->sk_route_caps = dst->dev->features;
1511 if (sk->sk_route_caps & NETIF_F_GSO)
1512 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1513 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1514 if (sk_can_gso(sk)) {
1515 if (dst->header_len) {
1516 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1517 } else {
1518 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1519 sk->sk_gso_max_size = dst->dev->gso_max_size;
1520 sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1521 }
1522 }
1523 }
1524 EXPORT_SYMBOL_GPL(sk_setup_caps);
1525
1526 /*
1527 * Simple resource managers for sockets.
1528 */
1529
1530
1531 /*
1532 * Write buffer destructor automatically called from kfree_skb.
1533 */
sock_wfree(struct sk_buff * skb)1534 void sock_wfree(struct sk_buff *skb)
1535 {
1536 struct sock *sk = skb->sk;
1537 unsigned int len = skb->truesize;
1538
1539 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540 /*
1541 * Keep a reference on sk_wmem_alloc, this will be released
1542 * after sk_write_space() call
1543 */
1544 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1545 sk->sk_write_space(sk);
1546 len = 1;
1547 }
1548 /*
1549 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550 * could not do because of in-flight packets
1551 */
1552 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553 __sk_free(sk);
1554 }
1555 EXPORT_SYMBOL(sock_wfree);
1556
1557 /*
1558 * Read buffer destructor automatically called from kfree_skb.
1559 */
sock_rfree(struct sk_buff * skb)1560 void sock_rfree(struct sk_buff *skb)
1561 {
1562 struct sock *sk = skb->sk;
1563 unsigned int len = skb->truesize;
1564
1565 atomic_sub(len, &sk->sk_rmem_alloc);
1566 sk_mem_uncharge(sk, len);
1567 }
1568 EXPORT_SYMBOL(sock_rfree);
1569
sock_edemux(struct sk_buff * skb)1570 void sock_edemux(struct sk_buff *skb)
1571 {
1572 struct sock *sk = skb->sk;
1573
1574 #ifdef CONFIG_INET
1575 if (sk->sk_state == TCP_TIME_WAIT)
1576 inet_twsk_put(inet_twsk(sk));
1577 else
1578 #endif
1579 sock_put(sk);
1580 }
1581 EXPORT_SYMBOL(sock_edemux);
1582
sock_i_uid(struct sock * sk)1583 kuid_t sock_i_uid(struct sock *sk)
1584 {
1585 kuid_t uid;
1586
1587 read_lock_bh(&sk->sk_callback_lock);
1588 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1589 read_unlock_bh(&sk->sk_callback_lock);
1590 return uid;
1591 }
1592 EXPORT_SYMBOL(sock_i_uid);
1593
sock_i_ino(struct sock * sk)1594 unsigned long sock_i_ino(struct sock *sk)
1595 {
1596 unsigned long ino;
1597
1598 read_lock_bh(&sk->sk_callback_lock);
1599 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1600 read_unlock_bh(&sk->sk_callback_lock);
1601 return ino;
1602 }
1603 EXPORT_SYMBOL(sock_i_ino);
1604
1605 /*
1606 * Allocate a skb from the socket's send buffer.
1607 */
sock_wmalloc(struct sock * sk,unsigned long size,int force,gfp_t priority)1608 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1609 gfp_t priority)
1610 {
1611 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1612 struct sk_buff *skb = alloc_skb(size, priority);
1613 if (skb) {
1614 skb_set_owner_w(skb, sk);
1615 return skb;
1616 }
1617 }
1618 return NULL;
1619 }
1620 EXPORT_SYMBOL(sock_wmalloc);
1621
1622 /*
1623 * Allocate a skb from the socket's receive buffer.
1624 */
sock_rmalloc(struct sock * sk,unsigned long size,int force,gfp_t priority)1625 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1626 gfp_t priority)
1627 {
1628 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1629 struct sk_buff *skb = alloc_skb(size, priority);
1630 if (skb) {
1631 skb_set_owner_r(skb, sk);
1632 return skb;
1633 }
1634 }
1635 return NULL;
1636 }
1637
1638 /*
1639 * Allocate a memory block from the socket's option memory buffer.
1640 */
sock_kmalloc(struct sock * sk,int size,gfp_t priority)1641 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1642 {
1643 if ((unsigned int)size <= sysctl_optmem_max &&
1644 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1645 void *mem;
1646 /* First do the add, to avoid the race if kmalloc
1647 * might sleep.
1648 */
1649 atomic_add(size, &sk->sk_omem_alloc);
1650 mem = kmalloc(size, priority);
1651 if (mem)
1652 return mem;
1653 atomic_sub(size, &sk->sk_omem_alloc);
1654 }
1655 return NULL;
1656 }
1657 EXPORT_SYMBOL(sock_kmalloc);
1658
1659 /*
1660 * Free an option memory block.
1661 */
sock_kfree_s(struct sock * sk,void * mem,int size)1662 void sock_kfree_s(struct sock *sk, void *mem, int size)
1663 {
1664 kfree(mem);
1665 atomic_sub(size, &sk->sk_omem_alloc);
1666 }
1667 EXPORT_SYMBOL(sock_kfree_s);
1668
1669 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1670 I think, these locks should be removed for datagram sockets.
1671 */
sock_wait_for_wmem(struct sock * sk,long timeo)1672 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1673 {
1674 DEFINE_WAIT(wait);
1675
1676 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1677 for (;;) {
1678 if (!timeo)
1679 break;
1680 if (signal_pending(current))
1681 break;
1682 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1683 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1684 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1685 break;
1686 if (sk->sk_shutdown & SEND_SHUTDOWN)
1687 break;
1688 if (sk->sk_err)
1689 break;
1690 timeo = schedule_timeout(timeo);
1691 }
1692 finish_wait(sk_sleep(sk), &wait);
1693 return timeo;
1694 }
1695
1696
1697 /*
1698 * Generic send/receive buffer handlers
1699 */
1700
sock_alloc_send_pskb(struct sock * sk,unsigned long header_len,unsigned long data_len,int noblock,int * errcode)1701 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1702 unsigned long data_len, int noblock,
1703 int *errcode)
1704 {
1705 struct sk_buff *skb;
1706 gfp_t gfp_mask;
1707 long timeo;
1708 int err;
1709 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1710
1711 err = -EMSGSIZE;
1712 if (npages > MAX_SKB_FRAGS)
1713 goto failure;
1714
1715 gfp_mask = sk->sk_allocation;
1716 if (gfp_mask & __GFP_WAIT)
1717 gfp_mask |= __GFP_REPEAT;
1718
1719 timeo = sock_sndtimeo(sk, noblock);
1720 while (1) {
1721 err = sock_error(sk);
1722 if (err != 0)
1723 goto failure;
1724
1725 err = -EPIPE;
1726 if (sk->sk_shutdown & SEND_SHUTDOWN)
1727 goto failure;
1728
1729 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1730 skb = alloc_skb(header_len, gfp_mask);
1731 if (skb) {
1732 int i;
1733
1734 /* No pages, we're done... */
1735 if (!data_len)
1736 break;
1737
1738 skb->truesize += data_len;
1739 skb_shinfo(skb)->nr_frags = npages;
1740 for (i = 0; i < npages; i++) {
1741 struct page *page;
1742
1743 page = alloc_pages(sk->sk_allocation, 0);
1744 if (!page) {
1745 err = -ENOBUFS;
1746 skb_shinfo(skb)->nr_frags = i;
1747 kfree_skb(skb);
1748 goto failure;
1749 }
1750
1751 __skb_fill_page_desc(skb, i,
1752 page, 0,
1753 (data_len >= PAGE_SIZE ?
1754 PAGE_SIZE :
1755 data_len));
1756 data_len -= PAGE_SIZE;
1757 }
1758
1759 /* Full success... */
1760 break;
1761 }
1762 err = -ENOBUFS;
1763 goto failure;
1764 }
1765 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1766 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1767 err = -EAGAIN;
1768 if (!timeo)
1769 goto failure;
1770 if (signal_pending(current))
1771 goto interrupted;
1772 timeo = sock_wait_for_wmem(sk, timeo);
1773 }
1774
1775 skb_set_owner_w(skb, sk);
1776 return skb;
1777
1778 interrupted:
1779 err = sock_intr_errno(timeo);
1780 failure:
1781 *errcode = err;
1782 return NULL;
1783 }
1784 EXPORT_SYMBOL(sock_alloc_send_pskb);
1785
/*
 * Allocate a send skb with a linear area of @size bytes and no page
 * fragments; a thin wrapper around sock_alloc_send_pskb() with a data
 * length of 0.  Returns the skb, or NULL with *@errcode set.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1791 EXPORT_SYMBOL(sock_alloc_send_skb);
1792
1793 /* On 32bit arches, an skb frag is limited to 2^15 */
1794 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1795
sk_page_frag_refill(struct sock * sk,struct page_frag * pfrag)1796 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797 {
1798 int order;
1799
1800 if (pfrag->page) {
1801 if (atomic_read(&pfrag->page->_count) == 1) {
1802 pfrag->offset = 0;
1803 return true;
1804 }
1805 if (pfrag->offset < pfrag->size)
1806 return true;
1807 put_page(pfrag->page);
1808 }
1809
1810 /* We restrict high order allocations to users that can afford to wait */
1811 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813 do {
1814 gfp_t gfp = sk->sk_allocation;
1815
1816 if (order)
1817 gfp |= __GFP_COMP | __GFP_NOWARN;
1818 pfrag->page = alloc_pages(gfp, order);
1819 if (likely(pfrag->page)) {
1820 pfrag->offset = 0;
1821 pfrag->size = PAGE_SIZE << order;
1822 return true;
1823 }
1824 } while (--order >= 0);
1825
1826 sk_enter_memory_pressure(sk);
1827 sk_stream_moderate_sndbuf(sk);
1828 return false;
1829 }
1830 EXPORT_SYMBOL(sk_page_frag_refill);
1831
__lock_sock(struct sock * sk)1832 static void __lock_sock(struct sock *sk)
1833 __releases(&sk->sk_lock.slock)
1834 __acquires(&sk->sk_lock.slock)
1835 {
1836 DEFINE_WAIT(wait);
1837
1838 for (;;) {
1839 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1840 TASK_UNINTERRUPTIBLE);
1841 spin_unlock_bh(&sk->sk_lock.slock);
1842 schedule();
1843 spin_lock_bh(&sk->sk_lock.slock);
1844 if (!sock_owned_by_user(sk))
1845 break;
1846 }
1847 finish_wait(&sk->sk_lock.wq, &wait);
1848 }
1849
/*
 * Drain the socket backlog.
 *
 * The whole backlog list is detached while the bh lock is held, then each
 * skb is handed to sk_backlog_rcv() with the bh lock released.  This
 * repeats for as long as new skbs have been queued by the time the lock is
 * re-taken.  Entered and left with the bh lock held.
 *
 * NOTE(review): the first skb is dereferenced without a NULL check, so the
 * caller presumably only calls this with a non-empty backlog - confirm.
 */
static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		/* Take the current list private before dropping the lock. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			/* Unlink the skb before handing it over. */
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
1888
1889 /**
1890 * sk_wait_data - wait for data to arrive at sk_receive_queue
1891 * @sk: sock to wait on
1892 * @timeo: for how long
1893 *
1894 * Now socket state including sk->sk_err is changed only under lock,
1895 * hence we may omit checks after joining wait queue.
1896 * We check receive queue before schedule() only as optimization;
1897 * it is very likely that release_sock() added new data.
1898 */
sk_wait_data(struct sock * sk,long * timeo)1899 int sk_wait_data(struct sock *sk, long *timeo)
1900 {
1901 int rc;
1902 DEFINE_WAIT(wait);
1903
1904 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1905 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1906 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1907 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1908 finish_wait(sk_sleep(sk), &wait);
1909 return rc;
1910 }
1911 EXPORT_SYMBOL(sk_wait_data);
1912
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *	Returns 1 when the charge is accepted (sk_forward_alloc and the
 *	protocol's allocated counter stay increased), or 0 when it is
 *	suppressed, in which case both increases are undone.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	/* Charge optimistically; rolled back at the end on refusal. */
	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
			allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
			allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
			(allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		/* Stream sockets are judged on sk_wmem_queued, others on sk_wmem_alloc. */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		/*
		 * Accept while the hard limit exceeds (number of sockets) x
		 * (pages this socket currently accounts for).
		 */
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	return 0;
}
1999 EXPORT_SYMBOL(__sk_mem_schedule);
2000
2001 /**
2002 * __sk_reclaim - reclaim memory_allocated
2003 * @sk: socket
2004 */
__sk_mem_reclaim(struct sock * sk)2005 void __sk_mem_reclaim(struct sock *sk)
2006 {
2007 sk_memory_allocated_sub(sk,
2008 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2009 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2010
2011 if (sk_under_memory_pressure(sk) &&
2012 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2013 sk_leave_memory_pressure(sk);
2014 }
2015 EXPORT_SYMBOL(__sk_mem_reclaim);
2016
2017
2018 /*
2019 * Set of default routines for initialising struct proto_ops when
2020 * the protocol does not support a particular function. In certain
2021 * cases where it makes no sense for a protocol to have a "do nothing"
2022 * function, some default processing is provided.
2023 */
2024
/*
 * Default bind handler for protocols without bind support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2029 EXPORT_SYMBOL(sock_no_bind);
2030
/*
 * Default connect handler for protocols without connect support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2036 EXPORT_SYMBOL(sock_no_connect);
2037
/*
 * Default socketpair handler for protocols without socketpair support.
 * Both sockets are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2042 EXPORT_SYMBOL(sock_no_socketpair);
2043
/*
 * Default accept handler for protocols without accept support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
2048 EXPORT_SYMBOL(sock_no_accept);
2049
/*
 * Default getname handler for protocols without getname support.
 * Nothing is written to @saddr or @len; always returns -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
2055 EXPORT_SYMBOL(sock_no_getname);
2056
/*
 * Default poll handler for protocols without poll support.
 * All arguments are ignored; always returns 0 (an empty event mask).
 */
unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
2061 EXPORT_SYMBOL(sock_no_poll);
2062
/*
 * Default ioctl handler for protocols without ioctl support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2067 EXPORT_SYMBOL(sock_no_ioctl);
2068
/*
 * Default listen handler for protocols without listen support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2073 EXPORT_SYMBOL(sock_no_listen);
2074
/*
 * Default shutdown handler for protocols without shutdown support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2079 EXPORT_SYMBOL(sock_no_shutdown);
2080
/*
 * Default setsockopt handler for protocols without socket options.
 * @optval is never read; always returns -EOPNOTSUPP.
 */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
2086 EXPORT_SYMBOL(sock_no_setsockopt);
2087
/*
 * Default getsockopt handler for protocols without socket options.
 * Nothing is written to @optval or @optlen; always returns -EOPNOTSUPP.
 */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
2093 EXPORT_SYMBOL(sock_no_getsockopt);
2094
/*
 * Default sendmsg handler for protocols without sendmsg support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
2100 EXPORT_SYMBOL(sock_no_sendmsg);
2101
/*
 * Default recvmsg handler for protocols without recvmsg support.
 * All arguments are ignored; always returns -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
2107 EXPORT_SYMBOL(sock_no_recvmsg);
2108
/*
 * Default mmap handler for protocols without mmap support.
 * All arguments are ignored; always returns -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
2114 EXPORT_SYMBOL(sock_no_mmap);
2115
sock_no_sendpage(struct socket * sock,struct page * page,int offset,size_t size,int flags)2116 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2117 {
2118 ssize_t res;
2119 struct msghdr msg = {.msg_flags = flags};
2120 struct kvec iov;
2121 char *kaddr = kmap(page);
2122 iov.iov_base = kaddr + offset;
2123 iov.iov_len = size;
2124 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2125 kunmap(page);
2126 return res;
2127 }
2128 EXPORT_SYMBOL(sock_no_sendpage);
2129
2130 /*
2131 * Default Socket Callbacks
2132 */
2133
sock_def_wakeup(struct sock * sk)2134 static void sock_def_wakeup(struct sock *sk)
2135 {
2136 struct socket_wq *wq;
2137
2138 rcu_read_lock();
2139 wq = rcu_dereference(sk->sk_wq);
2140 if (wq_has_sleeper(wq))
2141 wake_up_interruptible_all(&wq->wait);
2142 rcu_read_unlock();
2143 }
2144
sock_def_error_report(struct sock * sk)2145 static void sock_def_error_report(struct sock *sk)
2146 {
2147 struct socket_wq *wq;
2148
2149 rcu_read_lock();
2150 wq = rcu_dereference(sk->sk_wq);
2151 if (wq_has_sleeper(wq))
2152 wake_up_interruptible_poll(&wq->wait, POLLERR);
2153 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2154 rcu_read_unlock();
2155 }
2156
sock_def_readable(struct sock * sk,int len)2157 static void sock_def_readable(struct sock *sk, int len)
2158 {
2159 struct socket_wq *wq;
2160
2161 rcu_read_lock();
2162 wq = rcu_dereference(sk->sk_wq);
2163 if (wq_has_sleeper(wq))
2164 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2165 POLLRDNORM | POLLRDBAND);
2166 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2167 rcu_read_unlock();
2168 }
2169
sock_def_write_space(struct sock * sk)2170 static void sock_def_write_space(struct sock *sk)
2171 {
2172 struct socket_wq *wq;
2173
2174 rcu_read_lock();
2175
2176 /* Do not wake up a writer until he can make "significant"
2177 * progress. --DaveM
2178 */
2179 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2180 wq = rcu_dereference(sk->sk_wq);
2181 if (wq_has_sleeper(wq))
2182 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2183 POLLWRNORM | POLLWRBAND);
2184
2185 /* Should agree with poll, otherwise some programs break */
2186 if (sock_writeable(sk))
2187 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2188 }
2189
2190 rcu_read_unlock();
2191 }
2192
sock_def_destruct(struct sock * sk)2193 static void sock_def_destruct(struct sock *sk)
2194 {
2195 kfree(sk->sk_protinfo);
2196 }
2197
sk_send_sigurg(struct sock * sk)2198 void sk_send_sigurg(struct sock *sk)
2199 {
2200 if (sk->sk_socket && sk->sk_socket->file)
2201 if (send_sigurg(&sk->sk_socket->file->f_owner))
2202 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2203 }
2204 EXPORT_SYMBOL(sk_send_sigurg);
2205
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A reference on @sk is taken only when mod_timer() returns zero; the
 * matching drop is in sk_stop_timer().
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2213
/*
 * Cancel @timer on behalf of @sk.
 *
 * When del_timer() returns non-zero, one reference on @sk is dropped with
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2220
/*
 * sock_init_data - initialise the generic part of a struct sock
 * @sock: socket to attach @sk to, may be NULL
 * @sk: sock being initialised
 *
 * Sets up the skb queues, the timer, default buffer sizes, locks, default
 * callbacks and timeouts of @sk.  When @sock is non-NULL the two objects are
 * linked to each other and @sk inherits the socket's type, wait queue and
 * the uid of the socket inode; otherwise @sk gets no wait queue and uid 0 of
 * its network namespace's user namespace.
 *
 * The reference count is set to 1 last, after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	/* Defaults: sleeping allocations, sysctl buffer sizes, closed state. */
	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		/* Cross-link sock and socket. */
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
		sk->sk_uid = SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq = NULL;
		sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
	}

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	/* The lockdep class of the callback lock is selected by sk_family. */
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks; protocols may replace them afterwards. */
	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	/* tv_sec == -1 is what sock_get_timestamp() reports as -ENOENT. */
	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
2286
/*
 * lock_sock_nested - take ownership of the socket lock
 * @sk: socket to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * May sleep.  If the lock is already owned, __lock_sock() is called before
 * ownership is claimed.  On return sk_lock.owned is 1, the spinlock has been
 * released and bottom halves are enabled again.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Plain unlock: BH is re-enabled only after the lockdep annotation. */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
2302
/*
 * release_sock - give up ownership of the socket lock
 * @sk: socket to unlock
 *
 * With the spinlock held and BH disabled: runs __release_sock() if the
 * backlog is non-empty, calls the protocol's release_cb() if it has one,
 * clears sk_lock.owned and wakes anybody waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	/* A non-NULL tail means packets were queued on the backlog. */
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Optional per-protocol hook, called before ownership is dropped. */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
2323
2324 /**
2325 * lock_sock_fast - fast version of lock_sock
2326 * @sk: socket
2327 *
2328 * This version should be used for very small section, where process wont block
2329 * return false if fast path is taken
2330 * sk_lock.slock locked, owned = 0, BH disabled
2331 * return true if slow path is taken
2332 * sk_lock.slock unlocked, owned = 1, BH enabled
2333 */
lock_sock_fast(struct sock * sk)2334 bool lock_sock_fast(struct sock *sk)
2335 {
2336 might_sleep();
2337 spin_lock_bh(&sk->sk_lock.slock);
2338
2339 if (!sk->sk_lock.owned)
2340 /*
2341 * Note : We must disable BH
2342 */
2343 return false;
2344
2345 __lock_sock(sk);
2346 sk->sk_lock.owned = 1;
2347 spin_unlock(&sk->sk_lock.slock);
2348 /*
2349 * The sk_lock has mutex_lock() semantics here:
2350 */
2351 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2352 local_bh_enable();
2353 return true;
2354 }
2355 EXPORT_SYMBOL(lock_sock_fast);
2356
/*
 * Copy the socket's last timestamp to user space as a struct timeval.
 *
 * Enables SOCK_TIMESTAMP on @sk if it is not set yet.  A stored stamp whose
 * seconds field is -1 yields -ENOENT; one whose seconds field is 0 is first
 * replaced by the current real time.
 *
 * Returns 0 on success, -ENOENT as described above, or -EFAULT when the copy
 * to @userstamp fails.
 */
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);

	tv = ktime_to_timeval(sk->sk_stamp);
	switch (tv.tv_sec) {
	case -1:
		return -ENOENT;
	case 0:
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
		break;
	default:
		break;
	}

	if (copy_to_user(userstamp, &tv, sizeof(tv)))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
2372
/*
 * Copy the socket's last timestamp to user space as a struct timespec.
 *
 * Enables SOCK_TIMESTAMP on @sk if it is not set yet.  A stored stamp whose
 * seconds field is -1 yields -ENOENT; one whose seconds field is 0 is first
 * replaced by the current real time.
 *
 * Returns 0 on success, -ENOENT as described above, or -EFAULT when the copy
 * to @userstamp fails.
 */
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);

	ts = ktime_to_timespec(sk->sk_stamp);
	switch (ts.tv_sec) {
	case -1:
		return -ENOENT;
	case 0:
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
		break;
	default:
		break;
	}

	if (copy_to_user(userstamp, &ts, sizeof(ts)))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
2388
sock_enable_timestamp(struct sock * sk,int flag)2389 void sock_enable_timestamp(struct sock *sk, int flag)
2390 {
2391 if (!sock_flag(sk, flag)) {
2392 unsigned long previous_flags = sk->sk_flags;
2393
2394 sock_set_flag(sk, flag);
2395 /*
2396 * we just set one of the two flags which require net
2397 * time stamping, but time stamping might have been on
2398 * already because of the other one
2399 */
2400 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2401 net_enable_timestamp();
2402 }
2403 }
2404
2405 /*
2406 * Get a socket option on an socket.
2407 *
2408 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2409 * asynchronous errors should be reported by getsockopt. We assume
2410 * this means if you specify SO_ERROR (otherwise whats the point of it).
2411 */
sock_common_getsockopt(struct socket * sock,int level,int optname,char __user * optval,int __user * optlen)2412 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2413 char __user *optval, int __user *optlen)
2414 {
2415 struct sock *sk = sock->sk;
2416
2417 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2418 }
2419 EXPORT_SYMBOL(sock_common_getsockopt);
2420
2421 #ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt method when it provides one and falls back to the
 * regular getsockopt method otherwise.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
2433 #endif
2434
/*
 * Generic recvmsg: forwards to the recvmsg method of the sock's protocol.
 *
 * MSG_DONTWAIT is split out of @flags and passed as a separate argument; the
 * remaining flags are passed without it.  When the protocol method does not
 * fail, the address length it reported is stored in msg->msg_namelen.
 *
 * Returns the protocol method's return value.
 */
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int noblock = flags & MSG_DONTWAIT;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, noblock,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err < 0)
		return err;

	msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
2449
2450 /*
2451 * Set socket options on an inet socket.
2452 */
sock_common_setsockopt(struct socket * sock,int level,int optname,char __user * optval,unsigned int optlen)2453 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2454 char __user *optval, unsigned int optlen)
2455 {
2456 struct sock *sk = sock->sk;
2457
2458 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2459 }
2460 EXPORT_SYMBOL(sock_common_setsockopt);
2461
2462 #ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_setsockopt(): uses the protocol's
 * compat_setsockopt method when it provides one and falls back to the
 * regular setsockopt method otherwise.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
2474 #endif
2475
/*
 * sk_common_release - common teardown of a sock being released
 * @sk: sock to release
 *
 * Calls the protocol's optional destroy hook, unhashes and orphans the sock,
 * frees its xfrm policy, releases the cached page fragment if there is one
 * and finally drops one reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
	/* destroy is optional; unhash below is called unconditionally. */
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket any more, but the network stack still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs the
	 * receiver and did the hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	/* Drop the page reference held in sk_frag, if any. */
	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
2517
2518 #ifdef CONFIG_PROC_FS
/* Number of per-protocol in-use counter slots. */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One set of in-use counters; val[] is indexed by proto->inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of the inuse_idx slots handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2525
2526 #ifdef CONFIG_NET_NS
sock_prot_inuse_add(struct net * net,struct proto * prot,int val)2527 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2528 {
2529 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2530 }
2531 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2532
sock_prot_inuse_get(struct net * net,struct proto * prot)2533 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2534 {
2535 int cpu, idx = prot->inuse_idx;
2536 int res = 0;
2537
2538 for_each_possible_cpu(cpu)
2539 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2540
2541 return res >= 0 ? res : 0;
2542 }
2543 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2544
sock_inuse_init_net(struct net * net)2545 static int __net_init sock_inuse_init_net(struct net *net)
2546 {
2547 net->core.inuse = alloc_percpu(struct prot_inuse);
2548 return net->core.inuse ? 0 : -ENOMEM;
2549 }
2550
sock_inuse_exit_net(struct net * net)2551 static void __net_exit sock_inuse_exit_net(struct net *net)
2552 {
2553 free_percpu(net->core.inuse);
2554 }
2555
/* Per-namespace hooks that allocate and free the in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
2560
net_inuse_init(void)2561 static __init int net_inuse_init(void)
2562 {
2563 if (register_pernet_subsys(&net_inuse_ops))
2564 panic("Cannot initialize net inuse counters");
2565
2566 return 0;
2567 }
2568
2569 core_initcall(net_inuse_init);
2570 #else
/* Without CONFIG_NET_NS a single global set of per-cpu counters is used. */
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2572
sock_prot_inuse_add(struct net * net,struct proto * prot,int val)2573 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2574 {
2575 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2576 }
2577 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2578
sock_prot_inuse_get(struct net * net,struct proto * prot)2579 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2580 {
2581 int cpu, idx = prot->inuse_idx;
2582 int res = 0;
2583
2584 for_each_possible_cpu(cpu)
2585 res += per_cpu(prot_inuse, cpu).val[idx];
2586
2587 return res >= 0 ? res : 0;
2588 }
2589 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2590 #endif
2591
assign_proto_idx(struct proto * prot)2592 static void assign_proto_idx(struct proto *prot)
2593 {
2594 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2595
2596 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2597 pr_err("PROTO_INUSE_NR exhausted\n");
2598 return;
2599 }
2600
2601 set_bit(prot->inuse_idx, proto_inuse_idx);
2602 }
2603
release_proto_idx(struct proto * prot)2604 static void release_proto_idx(struct proto *prot)
2605 {
2606 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2607 clear_bit(prot->inuse_idx, proto_inuse_idx);
2608 }
2609 #else
/* Without CONFIG_PROC_FS there are no in-use counters: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2613
/* Without CONFIG_PROC_FS there are no in-use counters: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
2617 #endif
2618
/*
 * proto_register - add a protocol to the global protocol list
 * @prot: protocol to register
 * @alloc_slab: when non-zero, create the slab caches for @prot first
 *
 * With @alloc_slab set, a cache for the socks themselves is created and,
 * when @prot has rsk_prot and/or twsk_prot, a named cache for each of those
 * as well.  The protocol is then put on proto_list and given an in-use index
 * under proto_list_mutex.
 *
 * Returns 0 on success.  If any allocation fails, everything created so far
 * is torn down again and -ENOBUFS is returned.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/* Optional cache for request socks. */
		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				pr_crit("%s: Can't create request sock SLAB cache!\n",
					prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		/* Optional cache for timewait socks. */
		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

	/*
	 * Error unwinding: each label undoes one step and falls through to
	 * the labels for the steps that preceded it.
	 */
out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	/* Reached from the timewait steps too, where rsk_prot may be NULL. */
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
2689
proto_unregister(struct proto * prot)2690 void proto_unregister(struct proto *prot)
2691 {
2692 mutex_lock(&proto_list_mutex);
2693 release_proto_idx(prot);
2694 list_del(&prot->node);
2695 mutex_unlock(&proto_list_mutex);
2696
2697 if (prot->slab != NULL) {
2698 kmem_cache_destroy(prot->slab);
2699 prot->slab = NULL;
2700 }
2701
2702 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2703 kmem_cache_destroy(prot->rsk_prot->slab);
2704 kfree(prot->rsk_prot->slab_name);
2705 prot->rsk_prot->slab = NULL;
2706 }
2707
2708 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2709 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2710 kfree(prot->twsk_prot->twsk_slab_name);
2711 prot->twsk_prot->twsk_slab = NULL;
2712 }
2713 }
2714 EXPORT_SYMBOL(proto_unregister);
2715
2716 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *@pos in proto_list, list head included.  The mutex is dropped again in
 * proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	void *entry;

	mutex_lock(&proto_list_mutex);
	entry = seq_list_start_head(&proto_list, *pos);

	return entry;
}
2723
/* seq_file ->next: step from @v to the following entry of proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *next = seq_list_next(v, &proto_list, pos);

	return next;
}
2728
proto_seq_stop(struct seq_file * seq,void * v)2729 static void proto_seq_stop(struct seq_file *seq, void *v)
2730 __releases(proto_list_mutex)
2731 {
2732 mutex_unlock(&proto_list_mutex);
2733 }
2734
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
sock_prot_memory_allocated(struct proto * proto)2739 static long sock_prot_memory_allocated(struct proto *proto)
2740 {
2741 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2742 }
2743
sock_prot_memory_pressure(struct proto * proto)2744 static char *sock_prot_memory_pressure(struct proto *proto)
2745 {
2746 return proto->memory_pressure != NULL ?
2747 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2748 }
2749
/*
 * Print one line describing @proto to @seq.
 *
 * The columns are, in order: name, object size, sockets in use in the
 * seq_file's network namespace, allocated memory, memory pressure, maximum
 * header size, whether a slab cache exists, owning module, and then one
 * y/n flag per protocol method, in the order of the argument list below.
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   /* One %2c per method: 'y' if set, 'n' if NULL. */
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
2783
proto_seq_show(struct seq_file * seq,void * v)2784 static int proto_seq_show(struct seq_file *seq, void *v)
2785 {
2786 if (v == &proto_list)
2787 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2788 "protocol",
2789 "size",
2790 "sockets",
2791 "memory",
2792 "press",
2793 "maxhdr",
2794 "slab",
2795 "module",
2796 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2797 else
2798 proto_seq_printf(seq, list_entry(v, struct proto, node));
2799 return 0;
2800 }
2801
/* seq_file iterator over proto_list; start/stop hold proto_list_mutex. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
2808
proto_seq_open(struct inode * inode,struct file * file)2809 static int proto_seq_open(struct inode *inode, struct file *file)
2810 {
2811 return seq_open_net(inode, file, &proto_seq_ops,
2812 sizeof(struct seq_net_private));
2813 }
2814
/* File operations of the per-namespace "protocols" proc entry. */
static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
2822
proto_init_net(struct net * net)2823 static __net_init int proto_init_net(struct net *net)
2824 {
2825 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2826 return -ENOMEM;
2827
2828 return 0;
2829 }
2830
proto_exit_net(struct net * net)2831 static __net_exit void proto_exit_net(struct net *net)
2832 {
2833 remove_proc_entry("protocols", net->proc_net);
2834 }
2835
2836
/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
2841
proto_init(void)2842 static int __init proto_init(void)
2843 {
2844 return register_pernet_subsys(&proto_net_ops);
2845 }
2846
2847 subsys_initcall(proto_init);
2848
2849 #endif /* PROC_FS */
2850