1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117
118 #include <linux/uaccess.h>
119
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136
137 #include <trace/events/sock.h>
138 #include <trace/hooks/sched.h>
139
140 #include <net/tcp.h>
141 #include <net/busy_poll.h>
142
143 static DEFINE_MUTEX(proto_list_mutex);
144 static LIST_HEAD(proto_list);
145
146 static void sock_inuse_add(struct net *net, int val);
147
148 /**
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
153 *
154 * Test to see if the opener of the socket had the capability @cap when
155 * the socket was created and the current process has the capability @cap
156 * in the user namespace @user_ns.
157 */
158 bool sk_ns_capable(const struct sock *sk,
159 struct user_namespace *user_ns, int cap)
160 {
161 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165
166 /**
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
170 *
171 * Test to see if the opener of the socket had the capability @cap when
172 * the socket was created and the current process has the capability @cap
173 * in all user namespaces.
174 */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177 return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180
181 /**
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
185 *
186 * Test to see if the opener of the socket had the capability @cap when the
187 * socket was created and the current process has the capability @cap over
188 * the network namespace the socket is a member of.
189 */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
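
/* Illustrative sketch (not from this file): a protocol implementation might
 * gate a privileged per-socket option on the capability helpers above. The
 * function, option and foo_sk() accessor names here are hypothetical and
 * only show the intended calling pattern.
 *
 *	static int foo_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		WRITE_ONCE(foo_sk(sk)->priv_opt, val);
 *		return 0;
 *	}
 */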
195
196 /*
197 * Each address family might have different locking rules, so we have
198 * one slock key per address family and separate keys for internal and
199 * userspace sockets.
200 */
201 static struct lock_class_key af_family_keys[AF_MAX];
202 static struct lock_class_key af_family_kern_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205
206 /*
207 * Make lock validator output more readable. (we pre-construct these
208 * strings build-time, so that runtime initialization of socket
209 * locks is fast):
210 */
211
212 #define _sock_locks(x) \
213 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
214 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
215 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
216 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
217 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
218 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
219 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
220 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
221 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
222 x "27" , x "28" , x "AF_CAN" , \
223 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
224 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
225 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
226 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
227 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
228 x "AF_MAX"
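
/* For reference (worked expansion, assuming the AF_* layout above):
 * _sock_locks("slock-") expands to the string list
 * "slock-AF_UNSPEC", "slock-AF_UNIX", ..., "slock-AF_MAX",
 * which are the per-family class names lockdep prints for the socket locks.
 */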
229
230 static const char *const af_family_key_strings[AF_MAX+1] = {
231 _sock_locks("sk_lock-")
232 };
233 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 _sock_locks("slock-")
235 };
236 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 _sock_locks("clock-")
238 };
239
240 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 _sock_locks("k-sk_lock-")
242 };
243 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 _sock_locks("k-slock-")
245 };
246 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-clock-")
248 };
249 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 _sock_locks("rlock-")
251 };
252 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 _sock_locks("wlock-")
254 };
255 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 _sock_locks("elock-")
257 };
258
259 /*
260 * sk_callback_lock and sk queues locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264 static struct lock_class_key af_rlock_keys[AF_MAX];
265 static struct lock_class_key af_wlock_keys[AF_MAX];
266 static struct lock_class_key af_elock_keys[AF_MAX];
267 static struct lock_class_key af_kern_callback_keys[AF_MAX];
268
269 /* Run time adjustable parameters. */
270 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271 EXPORT_SYMBOL(sysctl_wmem_max);
272 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273 EXPORT_SYMBOL(sysctl_rmem_max);
274 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276
277 /* Maximal space eaten by iovec or ancillary data plus some space */
278 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279 EXPORT_SYMBOL(sysctl_optmem_max);
280
281 int sysctl_tstamp_allow_data __read_mostly = 1;
282
283 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284 EXPORT_SYMBOL_GPL(memalloc_socks_key);
285
286 /**
287 * sk_set_memalloc - sets %SOCK_MEMALLOC
288 * @sk: socket to set it on
289 *
290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291 * It's the responsibility of the admin to adjust min_free_kbytes
292 * to meet the requirements
293 */
294 void sk_set_memalloc(struct sock *sk)
295 {
296 sock_set_flag(sk, SOCK_MEMALLOC);
297 sk->sk_allocation |= __GFP_MEMALLOC;
298 static_branch_inc(&memalloc_socks_key);
299 }
300 EXPORT_SYMBOL_GPL(sk_set_memalloc);
301
302 void sk_clear_memalloc(struct sock *sk)
303 {
304 sock_reset_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation &= ~__GFP_MEMALLOC;
306 static_branch_dec(&memalloc_socks_key);
307
308 /*
309 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 * it has rmem allocations due to the last swapfile being deactivated
312 * but there is a risk that the socket is unusable due to exceeding
313 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 */
315 sk_mem_reclaim(sk);
316 }
317 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
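
/* Illustrative sketch (assumption, not from this file): a swap-over-network
 * backend would typically mark its transport socket when a swapfile is
 * activated and clear it again on deactivation, e.g.:
 *
 *	sk_set_memalloc(xprt_sk);	// emergency reserves usable for rx
 *	...
 *	sk_clear_memalloc(xprt_sk);	// back to normal rmem limits
 *
 * "xprt_sk" is a hypothetical struct sock pointer used only for the example.
 */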
318
319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320 {
321 int ret;
322 unsigned int noreclaim_flag;
323
324 /* these should have been dropped before queueing */
325 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326
327 noreclaim_flag = memalloc_noreclaim_save();
328 ret = sk->sk_backlog_rcv(sk, skb);
329 memalloc_noreclaim_restore(noreclaim_flag);
330
331 return ret;
332 }
333 EXPORT_SYMBOL(__sk_backlog_rcv);
334
335 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
336 {
337 struct __kernel_sock_timeval tv;
338
339 if (timeo == MAX_SCHEDULE_TIMEOUT) {
340 tv.tv_sec = 0;
341 tv.tv_usec = 0;
342 } else {
343 tv.tv_sec = timeo / HZ;
344 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
345 }
346
347 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
348 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
349 *(struct old_timeval32 *)optval = tv32;
350 return sizeof(tv32);
351 }
352
353 if (old_timeval) {
354 struct __kernel_old_timeval old_tv;
355 old_tv.tv_sec = tv.tv_sec;
356 old_tv.tv_usec = tv.tv_usec;
357 *(struct __kernel_old_timeval *)optval = old_tv;
358 return sizeof(old_tv);
359 }
360
361 *(struct __kernel_sock_timeval *)optval = tv;
362 return sizeof(tv);
363 }
364
365 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
366 bool old_timeval)
367 {
368 struct __kernel_sock_timeval tv;
369
370 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
371 struct old_timeval32 tv32;
372
373 if (optlen < sizeof(tv32))
374 return -EINVAL;
375
376 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
377 return -EFAULT;
378 tv.tv_sec = tv32.tv_sec;
379 tv.tv_usec = tv32.tv_usec;
380 } else if (old_timeval) {
381 struct __kernel_old_timeval old_tv;
382
383 if (optlen < sizeof(old_tv))
384 return -EINVAL;
385 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
386 return -EFAULT;
387 tv.tv_sec = old_tv.tv_sec;
388 tv.tv_usec = old_tv.tv_usec;
389 } else {
390 if (optlen < sizeof(tv))
391 return -EINVAL;
392 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
393 return -EFAULT;
394 }
395 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
396 return -EDOM;
397
398 if (tv.tv_sec < 0) {
399 static int warned __read_mostly;
400
401 *timeo_p = 0;
402 if (warned < 10 && net_ratelimit()) {
403 warned++;
404 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
405 __func__, current->comm, task_pid_nr(current));
406 }
407 return 0;
408 }
409 *timeo_p = MAX_SCHEDULE_TIMEOUT;
410 if (tv.tv_sec == 0 && tv.tv_usec == 0)
411 return 0;
412 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
413 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
414 return 0;
415 }
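
/* Worked example of the conversion above (values assumed, HZ=250):
 * a SO_RCVTIMEO of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 250 + DIV_ROUND_UP(500000, 1000000 / 250) = 500 + 125 = 625 jiffies,
 * while { 0, 0 } keeps MAX_SCHEDULE_TIMEOUT, i.e. block indefinitely.
 */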
416
417 static bool sock_needs_netstamp(const struct sock *sk)
418 {
419 switch (sk->sk_family) {
420 case AF_UNSPEC:
421 case AF_UNIX:
422 return false;
423 default:
424 return true;
425 }
426 }
427
428 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
429 {
430 if (sk->sk_flags & flags) {
431 sk->sk_flags &= ~flags;
432 if (sock_needs_netstamp(sk) &&
433 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
434 net_disable_timestamp();
435 }
436 }
437
438
439 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
440 {
441 unsigned long flags;
442 struct sk_buff_head *list = &sk->sk_receive_queue;
443
444 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
445 atomic_inc(&sk->sk_drops);
446 trace_sock_rcvqueue_full(sk, skb);
447 return -ENOMEM;
448 }
449
450 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
451 atomic_inc(&sk->sk_drops);
452 return -ENOBUFS;
453 }
454
455 skb->dev = NULL;
456 skb_set_owner_r(skb, sk);
457
458 /* we escape from the RCU-protected region, make sure we don't leak
459 * a non-refcounted dst
460 */
461 skb_dst_force(skb);
462
463 spin_lock_irqsave(&list->lock, flags);
464 sock_skb_set_dropcount(sk, skb);
465 __skb_queue_tail(list, skb);
466 spin_unlock_irqrestore(&list->lock, flags);
467
468 if (!sock_flag(sk, SOCK_DEAD))
469 sk->sk_data_ready(sk);
470 return 0;
471 }
472 EXPORT_SYMBOL(__sock_queue_rcv_skb);
473
474 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
475 {
476 int err;
477
478 err = sk_filter(sk, skb);
479 if (err)
480 return err;
481
482 return __sock_queue_rcv_skb(sk, skb);
483 }
484 EXPORT_SYMBOL(sock_queue_rcv_skb);
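
/* Illustrative sketch (hypothetical protocol, not from this file): a typical
 * datagram receive path hands packets to the socket layer via
 * sock_queue_rcv_skb() and is responsible for freeing the skb on failure:
 *
 *	static int foo_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */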
485
486 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
487 const int nested, unsigned int trim_cap, bool refcounted)
488 {
489 int rc = NET_RX_SUCCESS;
490
491 if (sk_filter_trim_cap(sk, skb, trim_cap))
492 goto discard_and_relse;
493
494 skb->dev = NULL;
495
496 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
497 atomic_inc(&sk->sk_drops);
498 goto discard_and_relse;
499 }
500 if (nested)
501 bh_lock_sock_nested(sk);
502 else
503 bh_lock_sock(sk);
504 if (!sock_owned_by_user(sk)) {
505 /*
506 * trylock + unlock semantics:
507 */
508 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
509
510 rc = sk_backlog_rcv(sk, skb);
511
512 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
513 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
514 bh_unlock_sock(sk);
515 atomic_inc(&sk->sk_drops);
516 goto discard_and_relse;
517 }
518
519 bh_unlock_sock(sk);
520 out:
521 if (refcounted)
522 sock_put(sk);
523 return rc;
524 discard_and_relse:
525 kfree_skb(skb);
526 goto out;
527 }
528 EXPORT_SYMBOL(__sk_receive_skb);
529
530 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
531 {
532 struct dst_entry *dst = __sk_dst_get(sk);
533
534 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
535 sk_tx_queue_clear(sk);
536 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
537 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
538 dst_release(dst);
539 return NULL;
540 }
541
542 return dst;
543 }
544 EXPORT_SYMBOL(__sk_dst_check);
545
546 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
547 {
548 struct dst_entry *dst = sk_dst_get(sk);
549
550 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
551 sk_dst_reset(sk);
552 dst_release(dst);
553 return NULL;
554 }
555
556 return dst;
557 }
558 EXPORT_SYMBOL(sk_dst_check);
559
560 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
561 {
562 int ret = -ENOPROTOOPT;
563 #ifdef CONFIG_NETDEVICES
564 struct net *net = sock_net(sk);
565
566 /* Sorry... */
567 ret = -EPERM;
568 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
569 goto out;
570
571 ret = -EINVAL;
572 if (ifindex < 0)
573 goto out;
574
575 sk->sk_bound_dev_if = ifindex;
576 if (sk->sk_prot->rehash)
577 sk->sk_prot->rehash(sk);
578 sk_dst_reset(sk);
579
580 ret = 0;
581
582 out:
583 #endif
584
585 return ret;
586 }
587
588 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
589 {
590 int ret;
591
592 if (lock_sk)
593 lock_sock(sk);
594 ret = sock_bindtoindex_locked(sk, ifindex);
595 if (lock_sk)
596 release_sock(sk);
597
598 return ret;
599 }
600 EXPORT_SYMBOL(sock_bindtoindex);
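
/* Illustrative sketch (assumption): an in-kernel user that already knows the
 * interface index can bind its socket directly, without going through the
 * SO_BINDTODEVICE name lookup; "out_release" is a hypothetical error label.
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *	if (err)
 *		goto out_release;
 */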
601
602 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
603 {
604 int ret = -ENOPROTOOPT;
605 #ifdef CONFIG_NETDEVICES
606 struct net *net = sock_net(sk);
607 char devname[IFNAMSIZ];
608 int index;
609
610 ret = -EINVAL;
611 if (optlen < 0)
612 goto out;
613
614 /* Bind this socket to a particular device like "eth0",
615 * as specified in the passed interface name. If the
616 * name is "" or the option length is zero the socket
617 * is not bound.
618 */
619 if (optlen > IFNAMSIZ - 1)
620 optlen = IFNAMSIZ - 1;
621 memset(devname, 0, sizeof(devname));
622
623 ret = -EFAULT;
624 if (copy_from_sockptr(devname, optval, optlen))
625 goto out;
626
627 index = 0;
628 if (devname[0] != '\0') {
629 struct net_device *dev;
630
631 rcu_read_lock();
632 dev = dev_get_by_name_rcu(net, devname);
633 if (dev)
634 index = dev->ifindex;
635 rcu_read_unlock();
636 ret = -ENODEV;
637 if (!dev)
638 goto out;
639 }
640
641 return sock_bindtoindex(sk, index, true);
642 out:
643 #endif
644
645 return ret;
646 }
647
648 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
649 sockptr_t optlen, int len)
650 {
651 int ret = -ENOPROTOOPT;
652 #ifdef CONFIG_NETDEVICES
653 struct net *net = sock_net(sk);
654 char devname[IFNAMSIZ];
655
656 if (sk->sk_bound_dev_if == 0) {
657 len = 0;
658 goto zero;
659 }
660
661 ret = -EINVAL;
662 if (len < IFNAMSIZ)
663 goto out;
664
665 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
666 if (ret)
667 goto out;
668
669 len = strlen(devname) + 1;
670
671 ret = -EFAULT;
672 if (copy_to_sockptr(optval, devname, len))
673 goto out;
674
675 zero:
676 ret = -EFAULT;
677 if (copy_to_sockptr(optlen, &len, sizeof(int)))
678 goto out;
679
680 ret = 0;
681
682 out:
683 #endif
684
685 return ret;
686 }
687
688 bool sk_mc_loop(struct sock *sk)
689 {
690 if (dev_recursion_level())
691 return false;
692 if (!sk)
693 return true;
694 /* IPV6_ADDRFORM can change sk->sk_family under us. */
695 switch (READ_ONCE(sk->sk_family)) {
696 case AF_INET:
697 return inet_sk(sk)->mc_loop;
698 #if IS_ENABLED(CONFIG_IPV6)
699 case AF_INET6:
700 return inet6_sk(sk)->mc_loop;
701 #endif
702 }
703 WARN_ON_ONCE(1);
704 return true;
705 }
706 EXPORT_SYMBOL(sk_mc_loop);
707
708 void sock_set_reuseaddr(struct sock *sk)
709 {
710 lock_sock(sk);
711 sk->sk_reuse = SK_CAN_REUSE;
712 release_sock(sk);
713 }
714 EXPORT_SYMBOL(sock_set_reuseaddr);
715
716 void sock_set_reuseport(struct sock *sk)
717 {
718 lock_sock(sk);
719 sk->sk_reuseport = true;
720 release_sock(sk);
721 }
722 EXPORT_SYMBOL(sock_set_reuseport);
723
724 void sock_no_linger(struct sock *sk)
725 {
726 lock_sock(sk);
727 sk->sk_lingertime = 0;
728 sock_set_flag(sk, SOCK_LINGER);
729 release_sock(sk);
730 }
731 EXPORT_SYMBOL(sock_no_linger);
732
733 void sock_set_priority(struct sock *sk, u32 priority)
734 {
735 lock_sock(sk);
736 sk->sk_priority = priority;
737 release_sock(sk);
738 }
739 EXPORT_SYMBOL(sock_set_priority);
740
741 void sock_set_sndtimeo(struct sock *sk, s64 secs)
742 {
743 lock_sock(sk);
744 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
745 sk->sk_sndtimeo = secs * HZ;
746 else
747 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
748 release_sock(sk);
749 }
750 EXPORT_SYMBOL(sock_set_sndtimeo);
751
752 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
753 {
754 if (val) {
755 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
756 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
757 sock_set_flag(sk, SOCK_RCVTSTAMP);
758 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
759 } else {
760 sock_reset_flag(sk, SOCK_RCVTSTAMP);
761 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
762 }
763 }
764
765 void sock_enable_timestamps(struct sock *sk)
766 {
767 lock_sock(sk);
768 __sock_set_timestamps(sk, true, false, true);
769 release_sock(sk);
770 }
771 EXPORT_SYMBOL(sock_enable_timestamps);
772
773 void sock_set_keepalive(struct sock *sk)
774 {
775 lock_sock(sk);
776 if (sk->sk_prot->keepalive)
777 sk->sk_prot->keepalive(sk, true);
778 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
779 release_sock(sk);
780 }
781 EXPORT_SYMBOL(sock_set_keepalive);
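
/* Illustrative sketch (assumption): kernel-space socket users are expected to
 * call these lock-taking helpers directly instead of emulating setsockopt(),
 * e.g. when configuring a service socket:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_set_keepalive(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 30);	// 30 second send timeout
 */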
782
783 static void __sock_set_rcvbuf(struct sock *sk, int val)
784 {
785 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
786 * as a negative value.
787 */
788 val = min_t(int, val, INT_MAX / 2);
789 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
790
791 /* We double it on the way in to account for "struct sk_buff" etc.
792 * overhead. Applications assume that the SO_RCVBUF setting they make
793 * will allow that much actual data to be received on that socket.
794 *
795 * Applications are unaware that "struct sk_buff" and other overheads
796 * allocate from the receive buffer during socket buffer allocation.
797 *
798 * And after considering the possible alternatives, returning the value
799 * we actually used in getsockopt is the most desirable behavior.
800 */
801 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
802 }
803
804 void sock_set_rcvbuf(struct sock *sk, int val)
805 {
806 lock_sock(sk);
807 __sock_set_rcvbuf(sk, val);
808 release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_set_rcvbuf);
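
/* Worked example of the doubling above (values assumed, request within
 * sysctl_rmem_max): a user request of SO_RCVBUF = 65536 is stored as
 * sk_rcvbuf = 131072, and that doubled value is what a later
 * getsockopt(SO_RCVBUF) reports; requests whose doubled value is below
 * SOCK_MIN_RCVBUF are rounded up to SOCK_MIN_RCVBUF.
 */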
811
812 static void __sock_set_mark(struct sock *sk, u32 val)
813 {
814 if (val != sk->sk_mark) {
815 sk->sk_mark = val;
816 sk_dst_reset(sk);
817 }
818 }
819
820 void sock_set_mark(struct sock *sk, u32 val)
821 {
822 lock_sock(sk);
823 __sock_set_mark(sk, val);
824 release_sock(sk);
825 }
826 EXPORT_SYMBOL(sock_set_mark);
827
828 /*
829 * This is meant for all protocols to use and covers goings on
830 * at the socket level. Everything here is generic.
831 */
832
833 int sock_setsockopt(struct socket *sock, int level, int optname,
834 sockptr_t optval, unsigned int optlen)
835 {
836 struct sock_txtime sk_txtime;
837 struct sock *sk = sock->sk;
838 int val;
839 int valbool;
840 struct linger ling;
841 int ret = 0;
842
843 /*
844 * Options without arguments
845 */
846
847 if (optname == SO_BINDTODEVICE)
848 return sock_setbindtodevice(sk, optval, optlen);
849
850 if (optlen < sizeof(int))
851 return -EINVAL;
852
853 if (copy_from_sockptr(&val, optval, sizeof(val)))
854 return -EFAULT;
855
856 valbool = val ? 1 : 0;
857
858 lock_sock(sk);
859
860 switch (optname) {
861 case SO_DEBUG:
862 if (val && !capable(CAP_NET_ADMIN))
863 ret = -EACCES;
864 else
865 sock_valbool_flag(sk, SOCK_DBG, valbool);
866 break;
867 case SO_REUSEADDR:
868 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
869 break;
870 case SO_REUSEPORT:
871 sk->sk_reuseport = valbool;
872 break;
873 case SO_TYPE:
874 case SO_PROTOCOL:
875 case SO_DOMAIN:
876 case SO_ERROR:
877 ret = -ENOPROTOOPT;
878 break;
879 case SO_DONTROUTE:
880 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
881 sk_dst_reset(sk);
882 break;
883 case SO_BROADCAST:
884 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
885 break;
886 case SO_SNDBUF:
887 /* Don't error on this; BSD doesn't, and if you think
888 * about it, this is right. Otherwise apps have to
889 * play 'guess the biggest size' games. RCVBUF/SNDBUF
890 * are treated in BSD as hints.
891 */
892 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
893 set_sndbuf:
894 /* Ensure val * 2 fits into an int, to prevent max_t()
895 * from treating it as a negative value.
896 */
897 val = min_t(int, val, INT_MAX / 2);
898 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
899 WRITE_ONCE(sk->sk_sndbuf,
900 max_t(int, val * 2, SOCK_MIN_SNDBUF));
901 /* Wake up sending tasks if we upped the value. */
902 sk->sk_write_space(sk);
903 break;
904
905 case SO_SNDBUFFORCE:
906 if (!capable(CAP_NET_ADMIN)) {
907 ret = -EPERM;
908 break;
909 }
910
911 /* No negative values (to prevent underflow, as val will be
912 * multiplied by 2).
913 */
914 if (val < 0)
915 val = 0;
916 goto set_sndbuf;
917
918 case SO_RCVBUF:
919 /* Don't error on this; BSD doesn't, and if you think
920 * about it, this is right. Otherwise apps have to
921 * play 'guess the biggest size' games. RCVBUF/SNDBUF
922 * are treated in BSD as hints.
923 */
924 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
925 break;
926
927 case SO_RCVBUFFORCE:
928 if (!capable(CAP_NET_ADMIN)) {
929 ret = -EPERM;
930 break;
931 }
932
933 /* No negative values (to prevent underflow, as val will be
934 * multiplied by 2).
935 */
936 __sock_set_rcvbuf(sk, max(val, 0));
937 break;
938
939 case SO_KEEPALIVE:
940 if (sk->sk_prot->keepalive)
941 sk->sk_prot->keepalive(sk, valbool);
942 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
943 break;
944
945 case SO_OOBINLINE:
946 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
947 break;
948
949 case SO_NO_CHECK:
950 sk->sk_no_check_tx = valbool;
951 break;
952
953 case SO_PRIORITY:
954 if ((val >= 0 && val <= 6) ||
955 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
956 sk->sk_priority = val;
957 else
958 ret = -EPERM;
959 break;
960
961 case SO_LINGER:
962 if (optlen < sizeof(ling)) {
963 ret = -EINVAL; /* 1003.1g */
964 break;
965 }
966 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
967 ret = -EFAULT;
968 break;
969 }
970 if (!ling.l_onoff)
971 sock_reset_flag(sk, SOCK_LINGER);
972 else {
973 #if (BITS_PER_LONG == 32)
974 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
975 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
976 else
977 #endif
978 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
979 sock_set_flag(sk, SOCK_LINGER);
980 }
981 break;
982
983 case SO_BSDCOMPAT:
984 break;
985
986 case SO_PASSCRED:
987 if (valbool)
988 set_bit(SOCK_PASSCRED, &sock->flags);
989 else
990 clear_bit(SOCK_PASSCRED, &sock->flags);
991 break;
992
993 case SO_TIMESTAMP_OLD:
994 __sock_set_timestamps(sk, valbool, false, false);
995 break;
996 case SO_TIMESTAMP_NEW:
997 __sock_set_timestamps(sk, valbool, true, false);
998 break;
999 case SO_TIMESTAMPNS_OLD:
1000 __sock_set_timestamps(sk, valbool, false, true);
1001 break;
1002 case SO_TIMESTAMPNS_NEW:
1003 __sock_set_timestamps(sk, valbool, true, true);
1004 break;
1005 case SO_TIMESTAMPING_NEW:
1006 case SO_TIMESTAMPING_OLD:
1007 if (val & ~SOF_TIMESTAMPING_MASK) {
1008 ret = -EINVAL;
1009 break;
1010 }
1011
1012 if (val & SOF_TIMESTAMPING_OPT_ID &&
1013 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1014 if (sk->sk_protocol == IPPROTO_TCP &&
1015 sk->sk_type == SOCK_STREAM) {
1016 if ((1 << sk->sk_state) &
1017 (TCPF_CLOSE | TCPF_LISTEN)) {
1018 ret = -EINVAL;
1019 break;
1020 }
1021 sk->sk_tskey = tcp_sk(sk)->snd_una;
1022 } else {
1023 sk->sk_tskey = 0;
1024 }
1025 }
1026
1027 if (val & SOF_TIMESTAMPING_OPT_STATS &&
1028 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1029 ret = -EINVAL;
1030 break;
1031 }
1032
1033 sk->sk_tsflags = val;
1034 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1035
1036 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1037 sock_enable_timestamp(sk,
1038 SOCK_TIMESTAMPING_RX_SOFTWARE);
1039 else
1040 sock_disable_timestamp(sk,
1041 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1042 break;
1043
1044 case SO_RCVLOWAT:
1045 if (val < 0)
1046 val = INT_MAX;
1047 if (sock->ops->set_rcvlowat)
1048 ret = sock->ops->set_rcvlowat(sk, val);
1049 else
1050 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1051 break;
1052
1053 case SO_RCVTIMEO_OLD:
1054 case SO_RCVTIMEO_NEW:
1055 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1056 optlen, optname == SO_RCVTIMEO_OLD);
1057 break;
1058
1059 case SO_SNDTIMEO_OLD:
1060 case SO_SNDTIMEO_NEW:
1061 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1062 optlen, optname == SO_SNDTIMEO_OLD);
1063 break;
1064
1065 case SO_ATTACH_FILTER: {
1066 struct sock_fprog fprog;
1067
1068 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1069 if (!ret)
1070 ret = sk_attach_filter(&fprog, sk);
1071 break;
1072 }
1073 case SO_ATTACH_BPF:
1074 ret = -EINVAL;
1075 if (optlen == sizeof(u32)) {
1076 u32 ufd;
1077
1078 ret = -EFAULT;
1079 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1080 break;
1081
1082 ret = sk_attach_bpf(ufd, sk);
1083 }
1084 break;
1085
1086 case SO_ATTACH_REUSEPORT_CBPF: {
1087 struct sock_fprog fprog;
1088
1089 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1090 if (!ret)
1091 ret = sk_reuseport_attach_filter(&fprog, sk);
1092 break;
1093 }
1094 case SO_ATTACH_REUSEPORT_EBPF:
1095 ret = -EINVAL;
1096 if (optlen == sizeof(u32)) {
1097 u32 ufd;
1098
1099 ret = -EFAULT;
1100 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1101 break;
1102
1103 ret = sk_reuseport_attach_bpf(ufd, sk);
1104 }
1105 break;
1106
1107 case SO_DETACH_REUSEPORT_BPF:
1108 ret = reuseport_detach_prog(sk);
1109 break;
1110
1111 case SO_DETACH_FILTER:
1112 ret = sk_detach_filter(sk);
1113 break;
1114
1115 case SO_LOCK_FILTER:
1116 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1117 ret = -EPERM;
1118 else
1119 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1120 break;
1121
1122 case SO_PASSSEC:
1123 if (valbool)
1124 set_bit(SOCK_PASSSEC, &sock->flags);
1125 else
1126 clear_bit(SOCK_PASSSEC, &sock->flags);
1127 break;
1128 case SO_MARK:
1129 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1130 ret = -EPERM;
1131 break;
1132 }
1133
1134 __sock_set_mark(sk, val);
1135 break;
1136
1137 case SO_RXQ_OVFL:
1138 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1139 break;
1140
1141 case SO_WIFI_STATUS:
1142 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1143 break;
1144
1145 case SO_PEEK_OFF:
1146 if (sock->ops->set_peek_off)
1147 ret = sock->ops->set_peek_off(sk, val);
1148 else
1149 ret = -EOPNOTSUPP;
1150 break;
1151
1152 case SO_NOFCS:
1153 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1154 break;
1155
1156 case SO_SELECT_ERR_QUEUE:
1157 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1158 break;
1159
1160 #ifdef CONFIG_NET_RX_BUSY_POLL
1161 case SO_BUSY_POLL:
1162 /* allow unprivileged users to decrease the value */
1163 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1164 ret = -EPERM;
1165 else {
1166 if (val < 0)
1167 ret = -EINVAL;
1168 else
1169 WRITE_ONCE(sk->sk_ll_usec, val);
1170 }
1171 break;
1172 #endif
1173
1174 case SO_MAX_PACING_RATE:
1175 {
1176 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1177
1178 if (sizeof(ulval) != sizeof(val) &&
1179 optlen >= sizeof(ulval) &&
1180 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1181 ret = -EFAULT;
1182 break;
1183 }
1184 if (ulval != ~0UL)
1185 cmpxchg(&sk->sk_pacing_status,
1186 SK_PACING_NONE,
1187 SK_PACING_NEEDED);
1188 /* Pairs with READ_ONCE() from sk_getsockopt() */
1189 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1190 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1191 break;
1192 }
1193 case SO_INCOMING_CPU:
1194 WRITE_ONCE(sk->sk_incoming_cpu, val);
1195 break;
1196
1197 case SO_CNX_ADVICE:
1198 if (val == 1)
1199 dst_negative_advice(sk);
1200 break;
1201
1202 case SO_ZEROCOPY:
1203 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1204 if (!((sk->sk_type == SOCK_STREAM &&
1205 sk->sk_protocol == IPPROTO_TCP) ||
1206 (sk->sk_type == SOCK_DGRAM &&
1207 sk->sk_protocol == IPPROTO_UDP)))
1208 ret = -ENOTSUPP;
1209 } else if (sk->sk_family != PF_RDS) {
1210 ret = -ENOTSUPP;
1211 }
1212 if (!ret) {
1213 if (val < 0 || val > 1)
1214 ret = -EINVAL;
1215 else
1216 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1217 }
1218 break;
1219
1220 case SO_TXTIME:
1221 if (optlen != sizeof(struct sock_txtime)) {
1222 ret = -EINVAL;
1223 break;
1224 } else if (copy_from_sockptr(&sk_txtime, optval,
1225 sizeof(struct sock_txtime))) {
1226 ret = -EFAULT;
1227 break;
1228 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1229 ret = -EINVAL;
1230 break;
1231 }
1232 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1233 * scheduler has enough safeguards.
1234 */
1235 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1236 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1237 ret = -EPERM;
1238 break;
1239 }
1240 sock_valbool_flag(sk, SOCK_TXTIME, true);
1241 sk->sk_clockid = sk_txtime.clockid;
1242 sk->sk_txtime_deadline_mode =
1243 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1244 sk->sk_txtime_report_errors =
1245 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1246 break;
1247
1248 case SO_BINDTOIFINDEX:
1249 ret = sock_bindtoindex_locked(sk, val);
1250 break;
1251
1252 default:
1253 ret = -ENOPROTOOPT;
1254 break;
1255 }
1256 release_sock(sk);
1257 return ret;
1258 }
1259 EXPORT_SYMBOL(sock_setsockopt);
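
/* Illustrative userspace sketch (assumption, not part of this file): the
 * SO_MARK branch above is what makes the following fail with EPERM for a
 * caller lacking CAP_NET_ADMIN in the socket's network namespace:
 *
 *	int mark = 42;
 *	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
 *		perror("SO_MARK needs CAP_NET_ADMIN");
 */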
1260
1261 static const struct cred *sk_get_peer_cred(struct sock *sk)
1262 {
1263 const struct cred *cred;
1264
1265 spin_lock(&sk->sk_peer_lock);
1266 cred = get_cred(sk->sk_peer_cred);
1267 spin_unlock(&sk->sk_peer_lock);
1268
1269 return cred;
1270 }
1271
1272 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1273 struct ucred *ucred)
1274 {
1275 ucred->pid = pid_vnr(pid);
1276 ucred->uid = ucred->gid = -1;
1277 if (cred) {
1278 struct user_namespace *current_ns = current_user_ns();
1279
1280 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1281 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1282 }
1283 }
1284
1285 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1286 {
1287 struct user_namespace *user_ns = current_user_ns();
1288 int i;
1289
1290 for (i = 0; i < src->ngroups; i++) {
1291 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1292
1293 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1294 return -EFAULT;
1295 }
1296
1297 return 0;
1298 }
1299
1300 static int sk_getsockopt(struct sock *sk, int level, int optname,
1301 sockptr_t optval, sockptr_t optlen)
1302 {
1303 struct socket *sock = sk->sk_socket;
1304
1305 union {
1306 int val;
1307 u64 val64;
1308 unsigned long ulval;
1309 struct linger ling;
1310 struct old_timeval32 tm32;
1311 struct __kernel_old_timeval tm;
1312 struct __kernel_sock_timeval stm;
1313 struct sock_txtime txtime;
1314 } v;
1315
1316 int lv = sizeof(int);
1317 int len;
1318
1319 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1320 return -EFAULT;
1321 if (len < 0)
1322 return -EINVAL;
1323
1324 memset(&v, 0, sizeof(v));
1325
1326 switch (optname) {
1327 case SO_DEBUG:
1328 v.val = sock_flag(sk, SOCK_DBG);
1329 break;
1330
1331 case SO_DONTROUTE:
1332 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1333 break;
1334
1335 case SO_BROADCAST:
1336 v.val = sock_flag(sk, SOCK_BROADCAST);
1337 break;
1338
1339 case SO_SNDBUF:
1340 v.val = READ_ONCE(sk->sk_sndbuf);
1341 break;
1342
1343 case SO_RCVBUF:
1344 v.val = READ_ONCE(sk->sk_rcvbuf);
1345 break;
1346
1347 case SO_REUSEADDR:
1348 v.val = sk->sk_reuse;
1349 break;
1350
1351 case SO_REUSEPORT:
1352 v.val = sk->sk_reuseport;
1353 break;
1354
1355 case SO_KEEPALIVE:
1356 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1357 break;
1358
1359 case SO_TYPE:
1360 v.val = sk->sk_type;
1361 break;
1362
1363 case SO_PROTOCOL:
1364 v.val = sk->sk_protocol;
1365 break;
1366
1367 case SO_DOMAIN:
1368 v.val = sk->sk_family;
1369 break;
1370
1371 case SO_ERROR:
1372 v.val = -sock_error(sk);
1373 if (v.val == 0)
1374 v.val = xchg(&sk->sk_err_soft, 0);
1375 break;
1376
1377 case SO_OOBINLINE:
1378 v.val = sock_flag(sk, SOCK_URGINLINE);
1379 break;
1380
1381 case SO_NO_CHECK:
1382 v.val = sk->sk_no_check_tx;
1383 break;
1384
1385 case SO_PRIORITY:
1386 v.val = sk->sk_priority;
1387 break;
1388
1389 case SO_LINGER:
1390 lv = sizeof(v.ling);
1391 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1392 v.ling.l_linger = sk->sk_lingertime / HZ;
1393 break;
1394
1395 case SO_BSDCOMPAT:
1396 break;
1397
1398 case SO_TIMESTAMP_OLD:
1399 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1400 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1401 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1402 break;
1403
1404 case SO_TIMESTAMPNS_OLD:
1405 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1406 break;
1407
1408 case SO_TIMESTAMP_NEW:
1409 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1410 break;
1411
1412 case SO_TIMESTAMPNS_NEW:
1413 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1414 break;
1415
1416 case SO_TIMESTAMPING_OLD:
1417 v.val = sk->sk_tsflags;
1418 break;
1419
1420 case SO_RCVTIMEO_OLD:
1421 case SO_RCVTIMEO_NEW:
1422 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1423 break;
1424
1425 case SO_SNDTIMEO_OLD:
1426 case SO_SNDTIMEO_NEW:
1427 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1428 break;
1429
1430 case SO_RCVLOWAT:
1431 v.val = READ_ONCE(sk->sk_rcvlowat);
1432 break;
1433
1434 case SO_SNDLOWAT:
1435 v.val = 1;
1436 break;
1437
1438 case SO_PASSCRED:
1439 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1440 break;
1441
1442 case SO_PEERCRED:
1443 {
1444 struct ucred peercred;
1445 if (len > sizeof(peercred))
1446 len = sizeof(peercred);
1447
1448 spin_lock(&sk->sk_peer_lock);
1449 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1450 spin_unlock(&sk->sk_peer_lock);
1451
1452 if (copy_to_sockptr(optval, &peercred, len))
1453 return -EFAULT;
1454 goto lenout;
1455 }
1456
1457 case SO_PEERGROUPS:
1458 {
1459 const struct cred *cred;
1460 int ret, n;
1461
1462 cred = sk_get_peer_cred(sk);
1463 if (!cred)
1464 return -ENODATA;
1465
1466 n = cred->group_info->ngroups;
1467 if (len < n * sizeof(gid_t)) {
1468 len = n * sizeof(gid_t);
1469 put_cred(cred);
1470 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1471 }
1472 len = n * sizeof(gid_t);
1473
1474 ret = groups_to_user(optval, cred->group_info);
1475 put_cred(cred);
1476 if (ret)
1477 return ret;
1478 goto lenout;
1479 }
1480
1481 case SO_PEERNAME:
1482 {
1483 char address[128];
1484
1485 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1486 if (lv < 0)
1487 return -ENOTCONN;
1488 if (lv < len)
1489 return -EINVAL;
1490 if (copy_to_sockptr(optval, address, len))
1491 return -EFAULT;
1492 goto lenout;
1493 }
1494
1495 /* Dubious BSD thing... Probably nobody even uses it, but
1496 * the UNIX standard wants it for whatever reason... -DaveM
1497 */
1498 case SO_ACCEPTCONN:
1499 v.val = sk->sk_state == TCP_LISTEN;
1500 break;
1501
1502 case SO_PASSSEC:
1503 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1504 break;
1505
1506 case SO_PEERSEC:
1507 return security_socket_getpeersec_stream(sock,
1508 optval, optlen, len);
1509
1510 case SO_MARK:
1511 v.val = sk->sk_mark;
1512 break;
1513
1514 case SO_RXQ_OVFL:
1515 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1516 break;
1517
1518 case SO_WIFI_STATUS:
1519 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1520 break;
1521
1522 case SO_PEEK_OFF:
1523 if (!sock->ops->set_peek_off)
1524 return -EOPNOTSUPP;
1525
1526 v.val = READ_ONCE(sk->sk_peek_off);
1527 break;
1528 case SO_NOFCS:
1529 v.val = sock_flag(sk, SOCK_NOFCS);
1530 break;
1531
1532 case SO_BINDTODEVICE:
1533 return sock_getbindtodevice(sk, optval, optlen, len);
1534
1535 case SO_GET_FILTER:
1536 len = sk_get_filter(sk, optval, len);
1537 if (len < 0)
1538 return len;
1539
1540 goto lenout;
1541
1542 case SO_LOCK_FILTER:
1543 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1544 break;
1545
1546 case SO_BPF_EXTENSIONS:
1547 v.val = bpf_tell_extensions();
1548 break;
1549
1550 case SO_SELECT_ERR_QUEUE:
1551 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1552 break;
1553
1554 #ifdef CONFIG_NET_RX_BUSY_POLL
1555 case SO_BUSY_POLL:
1556 v.val = READ_ONCE(sk->sk_ll_usec);
1557 break;
1558 #endif
1559
1560 case SO_MAX_PACING_RATE:
1561 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1562 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1563 lv = sizeof(v.ulval);
1564 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1565 } else {
1566 /* 32bit version */
1567 v.val = min_t(unsigned long, ~0U,
1568 READ_ONCE(sk->sk_max_pacing_rate));
1569 }
1570 break;
1571
1572 case SO_INCOMING_CPU:
1573 v.val = READ_ONCE(sk->sk_incoming_cpu);
1574 break;
1575
1576 case SO_MEMINFO:
1577 {
1578 u32 meminfo[SK_MEMINFO_VARS];
1579
1580 sk_get_meminfo(sk, meminfo);
1581
1582 len = min_t(unsigned int, len, sizeof(meminfo));
1583 if (copy_to_sockptr(optval, &meminfo, len))
1584 return -EFAULT;
1585
1586 goto lenout;
1587 }
1588
1589 #ifdef CONFIG_NET_RX_BUSY_POLL
1590 case SO_INCOMING_NAPI_ID:
1591 v.val = READ_ONCE(sk->sk_napi_id);
1592
1593 /* aggregate non-NAPI IDs down to 0 */
1594 if (v.val < MIN_NAPI_ID)
1595 v.val = 0;
1596
1597 break;
1598 #endif
1599
1600 case SO_COOKIE:
1601 lv = sizeof(u64);
1602 if (len < lv)
1603 return -EINVAL;
1604 v.val64 = sock_gen_cookie(sk);
1605 break;
1606
1607 case SO_ZEROCOPY:
1608 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1609 break;
1610
1611 case SO_TXTIME:
1612 lv = sizeof(v.txtime);
1613 v.txtime.clockid = sk->sk_clockid;
1614 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1615 SOF_TXTIME_DEADLINE_MODE : 0;
1616 v.txtime.flags |= sk->sk_txtime_report_errors ?
1617 SOF_TXTIME_REPORT_ERRORS : 0;
1618 break;
1619
1620 case SO_BINDTOIFINDEX:
1621 v.val = sk->sk_bound_dev_if;
1622 break;
1623
1624 case SO_NETNS_COOKIE:
1625 lv = sizeof(u64);
1626 if (len != lv)
1627 return -EINVAL;
1628 v.val64 = atomic64_read(&sock_net(sk)->net_cookie);
1629 break;
1630
1631 default:
1632 /* We implement the SO_SNDLOWAT etc to not be settable
1633 * (1003.1g 7).
1634 */
1635 return -ENOPROTOOPT;
1636 }
1637
1638 if (len > lv)
1639 len = lv;
1640 if (copy_to_sockptr(optval, &v, len))
1641 return -EFAULT;
1642 lenout:
1643 if (copy_to_sockptr(optlen, &len, sizeof(int)))
1644 return -EFAULT;
1645 return 0;
1646 }
1647
1648 int sock_getsockopt(struct socket *sock, int level, int optname,
1649 char __user *optval, int __user *optlen)
1650 {
1651 return sk_getsockopt(sock->sk, level, optname,
1652 USER_SOCKPTR(optval),
1653 USER_SOCKPTR(optlen));
1654 }
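
/* Illustrative userspace sketch (assumption): the SO_PEERGROUPS handling in
 * sk_getsockopt() reports the required length via -ERANGE, so a caller on a
 * connected AF_UNIX socket is expected to size the buffer and retry:
 *
 *	socklen_t len = 0;
 *	getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, NULL, &len); // ERANGE, len set
 *	gid_t *gids = malloc(len);
 *	getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, gids, &len);
 */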
1655
1656 /*
1657 * Initialize an sk_lock.
1658 *
1659 * (We also register the sk_lock with the lock validator.)
1660 */
1661 static inline void sock_lock_init(struct sock *sk)
1662 {
1663 if (sk->sk_kern_sock)
1664 sock_lock_init_class_and_name(
1665 sk,
1666 af_family_kern_slock_key_strings[sk->sk_family],
1667 af_family_kern_slock_keys + sk->sk_family,
1668 af_family_kern_key_strings[sk->sk_family],
1669 af_family_kern_keys + sk->sk_family);
1670 else
1671 sock_lock_init_class_and_name(
1672 sk,
1673 af_family_slock_key_strings[sk->sk_family],
1674 af_family_slock_keys + sk->sk_family,
1675 af_family_key_strings[sk->sk_family],
1676 af_family_keys + sk->sk_family);
1677 }
1678
1679 /*
1680 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1681 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1682 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1683 */
1684 static void sock_copy(struct sock *nsk, const struct sock *osk)
1685 {
1686 const struct proto *prot = READ_ONCE(osk->sk_prot);
1687 #ifdef CONFIG_SECURITY_NETWORK
1688 void *sptr = nsk->sk_security;
1689 #endif
1690 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1691
1692 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1693 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1694
1695 #ifdef CONFIG_SECURITY_NETWORK
1696 nsk->sk_security = sptr;
1697 security_sk_clone(osk, nsk);
1698 #endif
1699 }
1700
1701 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1702 int family)
1703 {
1704 struct sock *sk;
1705 struct kmem_cache *slab;
1706
1707 slab = prot->slab;
1708 if (slab != NULL) {
1709 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1710 if (!sk)
1711 return sk;
1712 if (want_init_on_alloc(priority))
1713 sk_prot_clear_nulls(sk, prot->obj_size);
1714 } else
1715 sk = kmalloc(prot->obj_size, priority);
1716
1717 if (sk != NULL) {
1718 if (security_sk_alloc(sk, family, priority))
1719 goto out_free;
1720
1721 if (!try_module_get(prot->owner))
1722 goto out_free_sec;
1723 sk_tx_queue_clear(sk);
1724 }
1725
1726 return sk;
1727
1728 out_free_sec:
1729 security_sk_free(sk);
1730 out_free:
1731 if (slab != NULL)
1732 kmem_cache_free(slab, sk);
1733 else
1734 kfree(sk);
1735 return NULL;
1736 }
1737
1738 static void sk_prot_free(struct proto *prot, struct sock *sk)
1739 {
1740 struct kmem_cache *slab;
1741 struct module *owner;
1742
1743 owner = prot->owner;
1744 slab = prot->slab;
1745
1746 cgroup_sk_free(&sk->sk_cgrp_data);
1747 mem_cgroup_sk_free(sk);
1748 security_sk_free(sk);
1749 if (slab != NULL)
1750 kmem_cache_free(slab, sk);
1751 else
1752 kfree(sk);
1753 module_put(owner);
1754 }
1755
1756 /**
1757 * sk_alloc - All socket objects are allocated here
1758 * @net: the applicable net namespace
1759 * @family: protocol family
1760 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1761 * @prot: struct proto associated with this new sock instance
1762 * @kern: is this to be a kernel socket?
1763 */
1764 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1765 struct proto *prot, int kern)
1766 {
1767 struct sock *sk;
1768
1769 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1770 if (sk) {
1771 sk->sk_family = family;
1772 /*
1773 * See comment in struct sock definition to understand
1774 * why we need sk_prot_creator -acme
1775 */
1776 sk->sk_prot = sk->sk_prot_creator = prot;
1777 sk->sk_kern_sock = kern;
1778 sock_lock_init(sk);
1779 sk->sk_net_refcnt = kern ? 0 : 1;
1780 if (likely(sk->sk_net_refcnt)) {
1781 get_net(net);
1782 sock_inuse_add(net, 1);
1783 }
1784
1785 sock_net_set(sk, net);
1786 refcount_set(&sk->sk_wmem_alloc, 1);
1787
1788 mem_cgroup_sk_alloc(sk);
1789 cgroup_sk_alloc(&sk->sk_cgrp_data);
1790 sock_update_classid(&sk->sk_cgrp_data);
1791 sock_update_netprioidx(&sk->sk_cgrp_data);
1792 sk_tx_queue_clear(sk);
1793 }
1794
1795 return sk;
1796 }
1797 EXPORT_SYMBOL(sk_alloc);
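
/* Illustrative sketch (hypothetical protocol): a family's ->create() handler
 * typically allocates its sock with sk_alloc() and finishes the generic setup
 * with sock_init_data(); PF_FOO, foo_proto and foo_sock_destruct are made-up
 * names for the example.
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = foo_sock_destruct;
 */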
1798
1799 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1800 * grace period. This is the case for UDP sockets and TCP listeners.
1801 */
1802 static void __sk_destruct(struct rcu_head *head)
1803 {
1804 struct sock *sk = container_of(head, struct sock, sk_rcu);
1805 struct sk_filter *filter;
1806
1807 if (sk->sk_destruct)
1808 sk->sk_destruct(sk);
1809
1810 filter = rcu_dereference_check(sk->sk_filter,
1811 refcount_read(&sk->sk_wmem_alloc) == 0);
1812 if (filter) {
1813 sk_filter_uncharge(sk, filter);
1814 RCU_INIT_POINTER(sk->sk_filter, NULL);
1815 }
1816
1817 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1818
1819 #ifdef CONFIG_BPF_SYSCALL
1820 bpf_sk_storage_free(sk);
1821 #endif
1822
1823 if (atomic_read(&sk->sk_omem_alloc))
1824 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1825 __func__, atomic_read(&sk->sk_omem_alloc));
1826
1827 if (sk->sk_frag.page) {
1828 put_page(sk->sk_frag.page);
1829 sk->sk_frag.page = NULL;
1830 }
1831
1832 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1833 put_cred(sk->sk_peer_cred);
1834 put_pid(sk->sk_peer_pid);
1835
1836 if (likely(sk->sk_net_refcnt))
1837 put_net(sock_net(sk));
1838 sk_prot_free(sk->sk_prot_creator, sk);
1839 }
1840
1841 void sk_destruct(struct sock *sk)
1842 {
1843 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1844
1845 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1846 reuseport_detach_sock(sk);
1847 use_call_rcu = true;
1848 }
1849
1850 if (use_call_rcu)
1851 call_rcu(&sk->sk_rcu, __sk_destruct);
1852 else
1853 __sk_destruct(&sk->sk_rcu);
1854 }
1855
1856 static void __sk_free(struct sock *sk)
1857 {
1858 if (likely(sk->sk_net_refcnt))
1859 sock_inuse_add(sock_net(sk), -1);
1860
1861 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1862 sock_diag_broadcast_destroy(sk);
1863 else
1864 sk_destruct(sk);
1865 }
1866
1867 void sk_free(struct sock *sk)
1868 {
1869 /*
1870 * We subtract one from sk_wmem_alloc and can know if
1871 * some packets are still in some tx queue.
1872 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1873 */
1874 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1875 __sk_free(sk);
1876 }
1877 EXPORT_SYMBOL(sk_free);
1878
1879 static void sk_init_common(struct sock *sk)
1880 {
1881 skb_queue_head_init(&sk->sk_receive_queue);
1882 skb_queue_head_init(&sk->sk_write_queue);
1883 skb_queue_head_init(&sk->sk_error_queue);
1884
1885 rwlock_init(&sk->sk_callback_lock);
1886 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1887 af_rlock_keys + sk->sk_family,
1888 af_family_rlock_key_strings[sk->sk_family]);
1889 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1890 af_wlock_keys + sk->sk_family,
1891 af_family_wlock_key_strings[sk->sk_family]);
1892 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1893 af_elock_keys + sk->sk_family,
1894 af_family_elock_key_strings[sk->sk_family]);
1895 lockdep_set_class_and_name(&sk->sk_callback_lock,
1896 af_callback_keys + sk->sk_family,
1897 af_family_clock_key_strings[sk->sk_family]);
1898 }
1899
1900 /**
1901 * sk_clone_lock - clone a socket, and lock its clone
1902 * @sk: the socket to clone
1903 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1904 *
1905 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1906 */
1907 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1908 {
1909 struct proto *prot = READ_ONCE(sk->sk_prot);
1910 struct sk_filter *filter;
1911 bool is_charged = true;
1912 struct sock *newsk;
1913
1914 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1915 if (!newsk)
1916 goto out;
1917
1918 sock_copy(newsk, sk);
1919
1920 newsk->sk_prot_creator = prot;
1921
1922 /* SANITY */
1923 if (likely(newsk->sk_net_refcnt)) {
1924 get_net(sock_net(newsk));
1925 sock_inuse_add(sock_net(newsk), 1);
1926 }
1927 sk_node_init(&newsk->sk_node);
1928 sock_lock_init(newsk);
1929 bh_lock_sock(newsk);
1930 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1931 newsk->sk_backlog.len = 0;
1932
1933 atomic_set(&newsk->sk_rmem_alloc, 0);
1934
1935 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
1936 refcount_set(&newsk->sk_wmem_alloc, 1);
1937
1938 atomic_set(&newsk->sk_omem_alloc, 0);
1939 sk_init_common(newsk);
1940
1941 newsk->sk_dst_cache = NULL;
1942 newsk->sk_dst_pending_confirm = 0;
1943 newsk->sk_wmem_queued = 0;
1944 newsk->sk_forward_alloc = 0;
1945 atomic_set(&newsk->sk_drops, 0);
1946 newsk->sk_send_head = NULL;
1947 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1948 atomic_set(&newsk->sk_zckey, 0);
1949
1950 sock_reset_flag(newsk, SOCK_DONE);
1951
1952 /* sk->sk_memcg will be populated at accept() time */
1953 newsk->sk_memcg = NULL;
1954
1955 cgroup_sk_clone(&newsk->sk_cgrp_data);
1956
1957 rcu_read_lock();
1958 filter = rcu_dereference(sk->sk_filter);
1959 if (filter != NULL)
1960 /* though it's an empty new sock, the charging may fail
1961 * if sysctl_optmem_max was changed between creation of
1962 * original socket and cloning
1963 */
1964 is_charged = sk_filter_charge(newsk, filter);
1965 RCU_INIT_POINTER(newsk->sk_filter, filter);
1966 rcu_read_unlock();
1967
1968 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1969 /* We need to make sure that we don't uncharge the new
1970 * socket if we couldn't charge it in the first place
1971 * as otherwise we uncharge the parent's filter.
1972 */
1973 if (!is_charged)
1974 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1975 sk_free_unlock_clone(newsk);
1976 newsk = NULL;
1977 goto out;
1978 }
1979 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1980
1981 if (bpf_sk_storage_clone(sk, newsk)) {
1982 sk_free_unlock_clone(newsk);
1983 newsk = NULL;
1984 goto out;
1985 }
1986
1987 /* Clear sk_user_data if parent had the pointer tagged
1988 * as not suitable for copying when cloning.
1989 */
1990 if (sk_user_data_is_nocopy(newsk))
1991 newsk->sk_user_data = NULL;
1992
1993 newsk->sk_err = 0;
1994 newsk->sk_err_soft = 0;
1995 newsk->sk_priority = 0;
1996 newsk->sk_incoming_cpu = raw_smp_processor_id();
1997
1998 /* Before updating sk_refcnt, we must commit prior changes to memory
1999 * (Documentation/RCU/rculist_nulls.rst for details)
2000 */
2001 smp_wmb();
2002 refcount_set(&newsk->sk_refcnt, 2);
2003
2004 /* Increment the counter in the same struct proto as the master
2005 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
2006 * is the same as sk->sk_prot->socks, as this field was copied
2007 * with memcpy).
2008 *
2009 * This _changes_ the previous behaviour, where
2010 * tcp_create_openreq_child was always incrementing the
2011 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2012 * to be taken into account in all callers. -acme
2013 */
2014 sk_refcnt_debug_inc(newsk);
2015 sk_set_socket(newsk, NULL);
2016 sk_tx_queue_clear(newsk);
2017 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2018
2019 if (newsk->sk_prot->sockets_allocated)
2020 sk_sockets_allocated_inc(newsk);
2021
2022 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2023 net_enable_timestamp();
2024 out:
2025 return newsk;
2026 }
2027 EXPORT_SYMBOL_GPL(sk_clone_lock);
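/* Usage sketch (illustrative only, not part of this file's API): a typical
 * caller clones with GFP_ATOMIC and, once protocol-specific setup is done,
 * drops the BH lock taken above:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific initialisation ...
 *		bh_unlock_sock(newsk);
 *	}
 */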
2028
2029 void sk_free_unlock_clone(struct sock *sk)
2030 {
2031 /* It is still a raw copy of the parent, so invalidate
2032 * the destructor and do a plain sk_free() */
2033 sk->sk_destruct = NULL;
2034 bh_unlock_sock(sk);
2035 sk_free(sk);
2036 }
2037 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2038
2039 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2040 {
2041 u32 max_segs = 1;
2042
2043 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2044 if (sk->sk_route_caps & NETIF_F_GSO)
2045 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2046 sk->sk_route_caps &= ~sk->sk_route_nocaps;
2047 if (sk_can_gso(sk)) {
2048 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2049 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2050 } else {
2051 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2052 sk->sk_gso_max_size = dst->dev->gso_max_size;
2053 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2054 }
2055 }
2056 sk->sk_gso_max_segs = max_segs;
2057 sk_dst_set(sk, dst);
2058 }
2059 EXPORT_SYMBOL_GPL(sk_setup_caps);
2060
2061 /*
2062 * Simple resource managers for sockets.
2063 */
2064
2065
2066 /*
2067 * Write buffer destructor automatically called from kfree_skb.
2068 */
2069 void sock_wfree(struct sk_buff *skb)
2070 {
2071 struct sock *sk = skb->sk;
2072 unsigned int len = skb->truesize;
2073
2074 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2075 /*
2076 * Keep a reference on sk_wmem_alloc; it will be released
2077 * after the sk_write_space() call
2078 */
2079 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2080 sk->sk_write_space(sk);
2081 len = 1;
2082 }
2083 /*
2084 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2085 * could not do because of in-flight packets
2086 */
2087 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2088 __sk_free(sk);
2089 }
2090 EXPORT_SYMBOL(sock_wfree);
2091
2092 /* This variant of sock_wfree() is used by TCP,
2093 * since it sets SOCK_USE_WRITE_QUEUE.
2094 */
2095 void __sock_wfree(struct sk_buff *skb)
2096 {
2097 struct sock *sk = skb->sk;
2098
2099 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2100 __sk_free(sk);
2101 }
2102
2103 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2104 {
2105 skb_orphan(skb);
2106 skb->sk = sk;
2107 #ifdef CONFIG_INET
2108 if (unlikely(!sk_fullsock(sk))) {
2109 skb->destructor = sock_edemux;
2110 sock_hold(sk);
2111 return;
2112 }
2113 #endif
2114 skb->destructor = sock_wfree;
2115 skb_set_hash_from_sk(skb, sk);
2116 /*
2117 * We used to take a refcount on sk, but the following operation
2118 * is enough to guarantee sk_free() won't free this sock until
2119 * all in-flight packets are completed
2120 */
2121 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2122 }
2123 EXPORT_SYMBOL(skb_set_owner_w);
2124
2125 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2126 {
2127 #ifdef CONFIG_TLS_DEVICE
2128 /* Drivers depend on in-order delivery for crypto offload;
2129 * a partial orphan would break the out-of-order-OK logic.
2130 */
2131 if (skb->decrypted)
2132 return false;
2133 #endif
2134 return (skb->destructor == sock_wfree ||
2135 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2136 }
2137
2138 /* This helper is used by netem, as it can hold packets in its
2139 * delay queue. We want to allow the owner socket to send more
2140 * packets, as if they were already TX completed by a typical driver.
2141 * But we also want to keep skb->sk set because some packet schedulers
2142 * rely on it (sch_fq for example).
2143 */
2144 void skb_orphan_partial(struct sk_buff *skb)
2145 {
2146 if (skb_is_tcp_pure_ack(skb))
2147 return;
2148
2149 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2150 return;
2151
2152 skb_orphan(skb);
2153 }
2154 EXPORT_SYMBOL(skb_orphan_partial);
2155
2156 /*
2157 * Read buffer destructor automatically called from kfree_skb.
2158 */
2159 void sock_rfree(struct sk_buff *skb)
2160 {
2161 struct sock *sk = skb->sk;
2162 unsigned int len = skb->truesize;
2163
2164 atomic_sub(len, &sk->sk_rmem_alloc);
2165 sk_mem_uncharge(sk, len);
2166 }
2167 EXPORT_SYMBOL(sock_rfree);
2168
2169 /*
2170 * Buffer destructor for skbs that are not used directly in read or write
2171 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2172 */
2173 void sock_efree(struct sk_buff *skb)
2174 {
2175 sock_put(skb->sk);
2176 }
2177 EXPORT_SYMBOL(sock_efree);
2178
2179 /* Buffer destructor for prefetch/receive path where reference count may
2180 * not be held, e.g. for listen sockets.
2181 */
2182 #ifdef CONFIG_INET
2183 void sock_pfree(struct sk_buff *skb)
2184 {
2185 if (sk_is_refcounted(skb->sk))
2186 sock_gen_put(skb->sk);
2187 }
2188 EXPORT_SYMBOL(sock_pfree);
2189 #endif /* CONFIG_INET */
2190
2191 kuid_t sock_i_uid(struct sock *sk)
2192 {
2193 kuid_t uid;
2194
2195 read_lock_bh(&sk->sk_callback_lock);
2196 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2197 read_unlock_bh(&sk->sk_callback_lock);
2198 return uid;
2199 }
2200 EXPORT_SYMBOL(sock_i_uid);
2201
2202 unsigned long __sock_i_ino(struct sock *sk)
2203 {
2204 unsigned long ino;
2205
2206 read_lock(&sk->sk_callback_lock);
2207 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2208 read_unlock(&sk->sk_callback_lock);
2209 return ino;
2210 }
2211 EXPORT_SYMBOL(__sock_i_ino);
2212
2213 unsigned long sock_i_ino(struct sock *sk)
2214 {
2215 unsigned long ino;
2216
2217 local_bh_disable();
2218 ino = __sock_i_ino(sk);
2219 local_bh_enable();
2220 return ino;
2221 }
2222 EXPORT_SYMBOL(sock_i_ino);
2223
2224 /*
2225 * Allocate a skb from the socket's send buffer.
2226 */
2227 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2228 gfp_t priority)
2229 {
2230 if (force ||
2231 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2232 struct sk_buff *skb = alloc_skb(size, priority);
2233
2234 if (skb) {
2235 skb_set_owner_w(skb, sk);
2236 return skb;
2237 }
2238 }
2239 return NULL;
2240 }
2241 EXPORT_SYMBOL(sock_wmalloc);
2242
2243 static void sock_ofree(struct sk_buff *skb)
2244 {
2245 struct sock *sk = skb->sk;
2246
2247 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2248 }
2249
2250 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2251 gfp_t priority)
2252 {
2253 struct sk_buff *skb;
2254
2255 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2256 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2257 READ_ONCE(sysctl_optmem_max))
2258 return NULL;
2259
2260 skb = alloc_skb(size, priority);
2261 if (!skb)
2262 return NULL;
2263
2264 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2265 skb->sk = sk;
2266 skb->destructor = sock_ofree;
2267 return skb;
2268 }
2269
2270 /*
2271 * Allocate a memory block from the socket's option memory buffer.
2272 */
2273 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2274 {
2275 int optmem_max = READ_ONCE(sysctl_optmem_max);
2276
2277 if ((unsigned int)size <= optmem_max &&
2278 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2279 void *mem;
2280 /* Do the add first, to avoid the race in case kmalloc
2281 * might sleep.
2282 */
2283 atomic_add(size, &sk->sk_omem_alloc);
2284 mem = kmalloc(size, priority);
2285 if (mem)
2286 return mem;
2287 atomic_sub(size, &sk->sk_omem_alloc);
2288 }
2289 return NULL;
2290 }
2291 EXPORT_SYMBOL(sock_kmalloc);
2292
2293 /* Free an option memory block. Note, we actually want the inline
2294 * here as this allows gcc to detect the nullify and fold away the
2295 * condition entirely.
2296 */
2297 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2298 const bool nullify)
2299 {
2300 if (WARN_ON_ONCE(!mem))
2301 return;
2302 if (nullify)
2303 kfree_sensitive(mem);
2304 else
2305 kfree(mem);
2306 atomic_sub(size, &sk->sk_omem_alloc);
2307 }
2308
2309 void sock_kfree_s(struct sock *sk, void *mem, int size)
2310 {
2311 __sock_kfree_s(sk, mem, size, false);
2312 }
2313 EXPORT_SYMBOL(sock_kfree_s);
2314
2315 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2316 {
2317 __sock_kfree_s(sk, mem, size, true);
2318 }
2319 EXPORT_SYMBOL(sock_kzfree_s);
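/* Sketch of the expected pairing (illustrative): the size passed to
 * sock_kfree_s()/sock_kzfree_s() must match the size used at allocation
 * time so sk_omem_alloc stays balanced:
 *
 *	opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */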
2320
2321 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2322 I think these locks should be removed for datagram sockets.
2323 */
2324 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2325 {
2326 DEFINE_WAIT(wait);
2327
2328 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2329 for (;;) {
2330 if (!timeo)
2331 break;
2332 if (signal_pending(current))
2333 break;
2334 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2335 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2336 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2337 break;
2338 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2339 break;
2340 if (READ_ONCE(sk->sk_err))
2341 break;
2342 timeo = schedule_timeout(timeo);
2343 }
2344 finish_wait(sk_sleep(sk), &wait);
2345 return timeo;
2346 }
2347
2348
2349 /*
2350 * Generic send/receive buffer handlers
2351 */
2352
2353 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2354 unsigned long data_len, int noblock,
2355 int *errcode, int max_page_order)
2356 {
2357 struct sk_buff *skb;
2358 long timeo;
2359 int err;
2360
2361 timeo = sock_sndtimeo(sk, noblock);
2362 for (;;) {
2363 err = sock_error(sk);
2364 if (err != 0)
2365 goto failure;
2366
2367 err = -EPIPE;
2368 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2369 goto failure;
2370
2371 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2372 break;
2373
2374 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2375 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2376 err = -EAGAIN;
2377 if (!timeo)
2378 goto failure;
2379 if (signal_pending(current))
2380 goto interrupted;
2381 timeo = sock_wait_for_wmem(sk, timeo);
2382 }
2383 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2384 errcode, sk->sk_allocation);
2385 if (skb)
2386 skb_set_owner_w(skb, sk);
2387 return skb;
2388
2389 interrupted:
2390 err = sock_intr_errno(timeo);
2391 failure:
2392 *errcode = err;
2393 return NULL;
2394 }
2395 EXPORT_SYMBOL(sock_alloc_send_pskb);
2396
2397 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2398 int noblock, int *errcode)
2399 {
2400 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2401 }
2402 EXPORT_SYMBOL(sock_alloc_send_skb);
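/* Typical datagram-style usage (sketch only): on failure, err receives
 * -EAGAIN, -EPIPE or a restart/interrupt error:
 *
 *	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 */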
2403
2404 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2405 struct sockcm_cookie *sockc)
2406 {
2407 u32 tsflags;
2408
2409 switch (cmsg->cmsg_type) {
2410 case SO_MARK:
2411 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2412 return -EPERM;
2413 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2414 return -EINVAL;
2415 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2416 break;
2417 case SO_TIMESTAMPING_OLD:
2418 case SO_TIMESTAMPING_NEW:
2419 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2420 return -EINVAL;
2421
2422 tsflags = *(u32 *)CMSG_DATA(cmsg);
2423 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2424 return -EINVAL;
2425
2426 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2427 sockc->tsflags |= tsflags;
2428 break;
2429 case SCM_TXTIME:
2430 if (!sock_flag(sk, SOCK_TXTIME))
2431 return -EINVAL;
2432 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2433 return -EINVAL;
2434 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2435 break;
2436 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2437 case SCM_RIGHTS:
2438 case SCM_CREDENTIALS:
2439 break;
2440 default:
2441 return -EINVAL;
2442 }
2443 return 0;
2444 }
2445 EXPORT_SYMBOL(__sock_cmsg_send);
2446
2447 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2448 struct sockcm_cookie *sockc)
2449 {
2450 struct cmsghdr *cmsg;
2451 int ret;
2452
2453 for_each_cmsghdr(cmsg, msg) {
2454 if (!CMSG_OK(msg, cmsg))
2455 return -EINVAL;
2456 if (cmsg->cmsg_level != SOL_SOCKET)
2457 continue;
2458 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2459 if (ret)
2460 return ret;
2461 }
2462 return 0;
2463 }
2464 EXPORT_SYMBOL(sock_cmsg_send);
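/* Caller sketch (illustrative; initialisation shown matches the fields of
 * struct sockcm_cookie used above):
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */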
2465
2466 static void sk_enter_memory_pressure(struct sock *sk)
2467 {
2468 if (!sk->sk_prot->enter_memory_pressure)
2469 return;
2470
2471 sk->sk_prot->enter_memory_pressure(sk);
2472 }
2473
2474 static void sk_leave_memory_pressure(struct sock *sk)
2475 {
2476 if (sk->sk_prot->leave_memory_pressure) {
2477 sk->sk_prot->leave_memory_pressure(sk);
2478 } else {
2479 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2480
2481 if (memory_pressure && READ_ONCE(*memory_pressure))
2482 WRITE_ONCE(*memory_pressure, 0);
2483 }
2484 }
2485
2486 #define SKB_FRAG_PAGE_ORDER get_order(32768)
2487 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2488
2489 /**
2490 * skb_page_frag_refill - check that a page_frag contains enough room
2491 * @sz: minimum size of the fragment we want to get
2492 * @pfrag: pointer to page_frag
2493 * @gfp: priority for memory allocation
2494 *
2495 * Note: While this allocator tries to use high order pages, there is
2496 * no guarantee that allocations succeed. Therefore, @sz MUST be
2497 * less than or equal to PAGE_SIZE.
2498 */
2499 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2500 {
2501 if (pfrag->page) {
2502 if (page_ref_count(pfrag->page) == 1) {
2503 pfrag->offset = 0;
2504 return true;
2505 }
2506 if (pfrag->offset + sz <= pfrag->size)
2507 return true;
2508 put_page(pfrag->page);
2509 }
2510
2511 pfrag->offset = 0;
2512 if (SKB_FRAG_PAGE_ORDER &&
2513 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2514 /* Avoid direct reclaim but allow kswapd to wake */
2515 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2516 __GFP_COMP | __GFP_NOWARN |
2517 __GFP_NORETRY,
2518 SKB_FRAG_PAGE_ORDER);
2519 if (likely(pfrag->page)) {
2520 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2521 return true;
2522 }
2523 }
2524 pfrag->page = alloc_page(gfp);
2525 if (likely(pfrag->page)) {
2526 pfrag->size = PAGE_SIZE;
2527 return true;
2528 }
2529 return false;
2530 }
2531 EXPORT_SYMBOL(skb_page_frag_refill);
2532
2533 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2534 {
2535 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2536 return true;
2537
2538 sk_enter_memory_pressure(sk);
2539 sk_stream_moderate_sndbuf(sk);
2540 return false;
2541 }
2542 EXPORT_SYMBOL(sk_page_frag_refill);
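/* Sketch of the common producer pattern (illustrative), using the
 * per-socket fragment returned by sk_page_frag():
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	... copy into page_address(pfrag->page) + pfrag->offset ...
 *	pfrag->offset += copy;
 */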
2543
2544 static void __lock_sock(struct sock *sk)
2545 __releases(&sk->sk_lock.slock)
2546 __acquires(&sk->sk_lock.slock)
2547 {
2548 DEFINE_WAIT(wait);
2549
2550 for (;;) {
2551 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2552 TASK_UNINTERRUPTIBLE);
2553 spin_unlock_bh(&sk->sk_lock.slock);
2554 schedule();
2555 spin_lock_bh(&sk->sk_lock.slock);
2556 if (!sock_owned_by_user(sk))
2557 break;
2558 }
2559 finish_wait(&sk->sk_lock.wq, &wait);
2560 }
2561
2562 void __release_sock(struct sock *sk)
2563 __releases(&sk->sk_lock.slock)
2564 __acquires(&sk->sk_lock.slock)
2565 {
2566 struct sk_buff *skb, *next;
2567
2568 while ((skb = sk->sk_backlog.head) != NULL) {
2569 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2570
2571 spin_unlock_bh(&sk->sk_lock.slock);
2572
2573 do {
2574 next = skb->next;
2575 prefetch(next);
2576 WARN_ON_ONCE(skb_dst_is_noref(skb));
2577 skb_mark_not_on_list(skb);
2578 sk_backlog_rcv(sk, skb);
2579
2580 cond_resched();
2581
2582 skb = next;
2583 } while (skb != NULL);
2584
2585 spin_lock_bh(&sk->sk_lock.slock);
2586 }
2587
2588 /*
2589 * Doing the zeroing here guarantees we cannot loop forever
2590 * while a wild producer attempts to flood us.
2591 */
2592 sk->sk_backlog.len = 0;
2593 }
2594
2595 void __sk_flush_backlog(struct sock *sk)
2596 {
2597 spin_lock_bh(&sk->sk_lock.slock);
2598 __release_sock(sk);
2599 spin_unlock_bh(&sk->sk_lock.slock);
2600 }
2601
2602 /**
2603 * sk_wait_data - wait for data to arrive at sk_receive_queue
2604 * @sk: sock to wait on
2605 * @timeo: for how long
2606 * @skb: last skb seen on sk_receive_queue
2607 *
2608 * Now the socket state, including sk->sk_err, is changed only under the lock,
2609 * hence we may omit checks after joining the wait queue.
2610 * We check the receive queue before schedule() only as an optimization;
2611 * it is very likely that release_sock() added new data.
2612 */
2613 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2614 {
2615 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2616 int rc;
2617
2618 add_wait_queue(sk_sleep(sk), &wait);
2619 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2620 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2621 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2622 remove_wait_queue(sk_sleep(sk), &wait);
2623 return rc;
2624 }
2625 EXPORT_SYMBOL(sk_wait_data);
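/* Receive-path sketch (illustrative): callers pass the last skb they saw
 * so that a wakeup only counts when something new was queued:
 *
 *	last = skb_peek_tail(&sk->sk_receive_queue);
 *	...
 *	sk_wait_data(sk, &timeo, last);
 */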
2626
2627 /**
2628 * __sk_mem_raise_allocated - increase memory_allocated
2629 * @sk: socket
2630 * @size: memory size to allocate
2631 * @amt: pages to allocate
2632 * @kind: allocation type
2633 *
2634 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2635 */
2636 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2637 {
2638 struct proto *prot = sk->sk_prot;
2639 long allocated = sk_memory_allocated_add(sk, amt);
2640 bool charged = true;
2641
2642 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2643 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2644 goto suppress_allocation;
2645
2646 /* Under limit. */
2647 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2648 sk_leave_memory_pressure(sk);
2649 return 1;
2650 }
2651
2652 /* Under pressure. */
2653 if (allocated > sk_prot_mem_limits(sk, 1))
2654 sk_enter_memory_pressure(sk);
2655
2656 /* Over hard limit. */
2657 if (allocated > sk_prot_mem_limits(sk, 2))
2658 goto suppress_allocation;
2659
2660 /* guarantee minimum buffer size under pressure */
2661 if (kind == SK_MEM_RECV) {
2662 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2663 return 1;
2664
2665 } else { /* SK_MEM_SEND */
2666 int wmem0 = sk_get_wmem0(sk, prot);
2667
2668 if (sk->sk_type == SOCK_STREAM) {
2669 if (sk->sk_wmem_queued < wmem0)
2670 return 1;
2671 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2672 return 1;
2673 }
2674 }
2675
2676 if (sk_has_memory_pressure(sk)) {
2677 u64 alloc;
2678
2679 if (!sk_under_memory_pressure(sk))
2680 return 1;
2681 alloc = sk_sockets_allocated_read_positive(sk);
2682 if (sk_prot_mem_limits(sk, 2) > alloc *
2683 sk_mem_pages(sk->sk_wmem_queued +
2684 atomic_read(&sk->sk_rmem_alloc) +
2685 sk->sk_forward_alloc))
2686 return 1;
2687 }
2688
2689 suppress_allocation:
2690
2691 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2692 sk_stream_moderate_sndbuf(sk);
2693
2694 /* Fail only if socket is _under_ its sndbuf.
2695 * In this case we cannot block, so we have to fail.
2696 */
2697 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2698 return 1;
2699 }
2700
2701 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2702 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2703
2704 sk_memory_allocated_sub(sk, amt);
2705
2706 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2707 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2708
2709 return 0;
2710 }
2711 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2712
2713 /**
2714 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2715 * @sk: socket
2716 * @size: memory size to allocate
2717 * @kind: allocation type
2718 *
2719 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2720 * rmem allocation. This function assumes that protocols which have
2721 * memory_pressure use sk_wmem_queued as write buffer accounting.
2722 */
2723 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2724 {
2725 int ret, amt = sk_mem_pages(size);
2726
2727 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2728 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2729 if (!ret)
2730 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2731 return ret;
2732 }
2733 EXPORT_SYMBOL(__sk_mem_schedule);
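/* Note: most protocol code does not call this directly; the inline wrappers
 * sk_wmem_schedule()/sk_rmem_schedule() in include/net/sock.h check
 * sk_forward_alloc first and fall back to this slow path, e.g. (sketch):
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;
 */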
2734
2735 /**
2736 * __sk_mem_reduce_allocated - reclaim memory_allocated
2737 * @sk: socket
2738 * @amount: number of quanta
2739 *
2740 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2741 */
2742 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2743 {
2744 sk_memory_allocated_sub(sk, amount);
2745
2746 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2747 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2748
2749 if (sk_under_global_memory_pressure(sk) &&
2750 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2751 sk_leave_memory_pressure(sk);
2752 }
2753 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2754
2755 /**
2756 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2757 * @sk: socket
2758 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2759 */
2760 void __sk_mem_reclaim(struct sock *sk, int amount)
2761 {
2762 amount >>= SK_MEM_QUANTUM_SHIFT;
2763 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2764 __sk_mem_reduce_allocated(sk, amount);
2765 }
2766 EXPORT_SYMBOL(__sk_mem_reclaim);
2767
2768 int sk_set_peek_off(struct sock *sk, int val)
2769 {
2770 WRITE_ONCE(sk->sk_peek_off, val);
2771 return 0;
2772 }
2773 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2774
2775 /*
2776 * Set of default routines for initialising struct proto_ops when
2777 * the protocol does not support a particular function. In certain
2778 * cases where it makes no sense for a protocol to have a "do nothing"
2779 * function, some default processing is provided.
2780 */
2781
2782 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2783 {
2784 return -EOPNOTSUPP;
2785 }
2786 EXPORT_SYMBOL(sock_no_bind);
2787
2788 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2789 int len, int flags)
2790 {
2791 return -EOPNOTSUPP;
2792 }
2793 EXPORT_SYMBOL(sock_no_connect);
2794
2795 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2796 {
2797 return -EOPNOTSUPP;
2798 }
2799 EXPORT_SYMBOL(sock_no_socketpair);
2800
2801 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2802 bool kern)
2803 {
2804 return -EOPNOTSUPP;
2805 }
2806 EXPORT_SYMBOL(sock_no_accept);
2807
2808 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2809 int peer)
2810 {
2811 return -EOPNOTSUPP;
2812 }
2813 EXPORT_SYMBOL(sock_no_getname);
2814
2815 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2816 {
2817 return -EOPNOTSUPP;
2818 }
2819 EXPORT_SYMBOL(sock_no_ioctl);
2820
2821 int sock_no_listen(struct socket *sock, int backlog)
2822 {
2823 return -EOPNOTSUPP;
2824 }
2825 EXPORT_SYMBOL(sock_no_listen);
2826
2827 int sock_no_shutdown(struct socket *sock, int how)
2828 {
2829 return -EOPNOTSUPP;
2830 }
2831 EXPORT_SYMBOL(sock_no_shutdown);
2832
2833 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2834 {
2835 return -EOPNOTSUPP;
2836 }
2837 EXPORT_SYMBOL(sock_no_sendmsg);
2838
2839 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2840 {
2841 return -EOPNOTSUPP;
2842 }
2843 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2844
2845 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2846 int flags)
2847 {
2848 return -EOPNOTSUPP;
2849 }
2850 EXPORT_SYMBOL(sock_no_recvmsg);
2851
2852 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2853 {
2854 /* Mirror missing mmap method error code */
2855 return -ENODEV;
2856 }
2857 EXPORT_SYMBOL(sock_no_mmap);
2858
2859 /*
2860 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2861 * various sock-based usage counts.
2862 */
2863 void __receive_sock(struct file *file)
2864 {
2865 struct socket *sock;
2866 int error;
2867
2868 /*
2869 * The resulting value of "error" is ignored here since we only
2870 * need to take action when the file is a socket and testing
2871 * "sock" for NULL is sufficient.
2872 */
2873 sock = sock_from_file(file, &error);
2874 if (sock) {
2875 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2876 sock_update_classid(&sock->sk->sk_cgrp_data);
2877 }
2878 }
2879
2880 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2881 {
2882 ssize_t res;
2883 struct msghdr msg = {.msg_flags = flags};
2884 struct kvec iov;
2885 char *kaddr = kmap(page);
2886 iov.iov_base = kaddr + offset;
2887 iov.iov_len = size;
2888 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2889 kunmap(page);
2890 return res;
2891 }
2892 EXPORT_SYMBOL(sock_no_sendpage);
2893
2894 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2895 int offset, size_t size, int flags)
2896 {
2897 ssize_t res;
2898 struct msghdr msg = {.msg_flags = flags};
2899 struct kvec iov;
2900 char *kaddr = kmap(page);
2901
2902 iov.iov_base = kaddr + offset;
2903 iov.iov_len = size;
2904 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2905 kunmap(page);
2906 return res;
2907 }
2908 EXPORT_SYMBOL(sock_no_sendpage_locked);
2909
2910 /*
2911 * Default Socket Callbacks
2912 */
2913
2914 static void sock_def_wakeup(struct sock *sk)
2915 {
2916 struct socket_wq *wq;
2917
2918 rcu_read_lock();
2919 wq = rcu_dereference(sk->sk_wq);
2920 if (skwq_has_sleeper(wq))
2921 wake_up_interruptible_all(&wq->wait);
2922 rcu_read_unlock();
2923 }
2924
2925 static void sock_def_error_report(struct sock *sk)
2926 {
2927 struct socket_wq *wq;
2928
2929 rcu_read_lock();
2930 wq = rcu_dereference(sk->sk_wq);
2931 if (skwq_has_sleeper(wq))
2932 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2933 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2934 rcu_read_unlock();
2935 }
2936
2937 void sock_def_readable(struct sock *sk)
2938 {
2939 struct socket_wq *wq;
2940
2941 rcu_read_lock();
2942 wq = rcu_dereference(sk->sk_wq);
2943
2944 if (skwq_has_sleeper(wq)) {
2945 int done = 0;
2946
2947 trace_android_vh_do_wake_up_sync(&wq->wait, &done);
2948 if (done)
2949 goto out;
2950
2951 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2952 EPOLLRDNORM | EPOLLRDBAND);
2953 }
2954
2955 out:
2956 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2957 rcu_read_unlock();
2958 }
2959
2960 static void sock_def_write_space(struct sock *sk)
2961 {
2962 struct socket_wq *wq;
2963
2964 rcu_read_lock();
2965
2966 /* Do not wake up a writer until he can make "significant"
2967 * progress. --DaveM
2968 */
2969 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2970 wq = rcu_dereference(sk->sk_wq);
2971 if (skwq_has_sleeper(wq))
2972 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2973 EPOLLWRNORM | EPOLLWRBAND);
2974
2975 /* Should agree with poll, otherwise some programs break */
2976 if (sock_writeable(sk))
2977 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2978 }
2979
2980 rcu_read_unlock();
2981 }
2982
2983 static void sock_def_destruct(struct sock *sk)
2984 {
2985 }
2986
2987 void sk_send_sigurg(struct sock *sk)
2988 {
2989 if (sk->sk_socket && sk->sk_socket->file)
2990 if (send_sigurg(&sk->sk_socket->file->f_owner))
2991 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2992 }
2993 EXPORT_SYMBOL(sk_send_sigurg);
2994
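/* Socket timer helpers (sketch, timer field shown for illustration):
 * sk_reset_timer() takes a reference on the socket when it arms a timer
 * that was not already pending, and sk_stop_timer() drops that reference
 * when it deletes a pending timer. The timer callback itself is expected
 * to release its reference with sock_put() when it fires:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */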
2995 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2996 unsigned long expires)
2997 {
2998 if (!mod_timer(timer, expires))
2999 sock_hold(sk);
3000 }
3001 EXPORT_SYMBOL(sk_reset_timer);
3002
3003 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3004 {
3005 if (del_timer(timer))
3006 __sock_put(sk);
3007 }
3008 EXPORT_SYMBOL(sk_stop_timer);
3009
3010 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3011 {
3012 if (del_timer_sync(timer))
3013 __sock_put(sk);
3014 }
3015 EXPORT_SYMBOL(sk_stop_timer_sync);
3016
3017 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3018 {
3019 sk_init_common(sk);
3020 sk->sk_send_head = NULL;
3021
3022 timer_setup(&sk->sk_timer, NULL, 0);
3023
3024 sk->sk_allocation = GFP_KERNEL;
3025 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3026 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3027 sk->sk_state = TCP_CLOSE;
3028 sk_set_socket(sk, sock);
3029
3030 sock_set_flag(sk, SOCK_ZAPPED);
3031
3032 if (sock) {
3033 sk->sk_type = sock->type;
3034 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3035 sock->sk = sk;
3036 } else {
3037 RCU_INIT_POINTER(sk->sk_wq, NULL);
3038 }
3039 sk->sk_uid = uid;
3040
3041 rwlock_init(&sk->sk_callback_lock);
3042 if (sk->sk_kern_sock)
3043 lockdep_set_class_and_name(
3044 &sk->sk_callback_lock,
3045 af_kern_callback_keys + sk->sk_family,
3046 af_family_kern_clock_key_strings[sk->sk_family]);
3047 else
3048 lockdep_set_class_and_name(
3049 &sk->sk_callback_lock,
3050 af_callback_keys + sk->sk_family,
3051 af_family_clock_key_strings[sk->sk_family]);
3052
3053 sk->sk_state_change = sock_def_wakeup;
3054 sk->sk_data_ready = sock_def_readable;
3055 sk->sk_write_space = sock_def_write_space;
3056 sk->sk_error_report = sock_def_error_report;
3057 sk->sk_destruct = sock_def_destruct;
3058
3059 sk->sk_frag.page = NULL;
3060 sk->sk_frag.offset = 0;
3061 sk->sk_peek_off = -1;
3062
3063 sk->sk_peer_pid = NULL;
3064 sk->sk_peer_cred = NULL;
3065 spin_lock_init(&sk->sk_peer_lock);
3066
3067 sk->sk_write_pending = 0;
3068 sk->sk_rcvlowat = 1;
3069 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3070 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3071
3072 sk->sk_stamp = SK_DEFAULT_STAMP;
3073 #if BITS_PER_LONG==32
3074 seqlock_init(&sk->sk_stamp_seq);
3075 #endif
3076 atomic_set(&sk->sk_zckey, 0);
3077
3078 #ifdef CONFIG_NET_RX_BUSY_POLL
3079 sk->sk_napi_id = 0;
3080 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3081 #endif
3082
3083 sk->sk_max_pacing_rate = ~0UL;
3084 sk->sk_pacing_rate = ~0UL;
3085 WRITE_ONCE(sk->sk_pacing_shift, 10);
3086 sk->sk_incoming_cpu = -1;
3087
3088 sk_rx_queue_clear(sk);
3089 /*
3090 * Before updating sk_refcnt, we must commit prior changes to memory
3091 * (Documentation/RCU/rculist_nulls.rst for details)
3092 */
3093 smp_wmb();
3094 refcount_set(&sk->sk_refcnt, 1);
3095 atomic_set(&sk->sk_drops, 0);
3096 }
3097 EXPORT_SYMBOL(sock_init_data_uid);
3098
3099 void sock_init_data(struct socket *sock, struct sock *sk)
3100 {
3101 kuid_t uid = sock ?
3102 SOCK_INODE(sock)->i_uid :
3103 make_kuid(sock_net(sk)->user_ns, 0);
3104
3105 sock_init_data_uid(sock, sk, uid);
3106 }
3107 EXPORT_SYMBOL(sock_init_data);
3108
3109 void lock_sock_nested(struct sock *sk, int subclass)
3110 {
3111 might_sleep();
3112 spin_lock_bh(&sk->sk_lock.slock);
3113 if (sk->sk_lock.owned)
3114 __lock_sock(sk);
3115 sk->sk_lock.owned = 1;
3116 spin_unlock(&sk->sk_lock.slock);
3117 /*
3118 * The sk_lock has mutex_lock() semantics here:
3119 */
3120 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3121 local_bh_enable();
3122 }
3123 EXPORT_SYMBOL(lock_sock_nested);
3124
3125 void release_sock(struct sock *sk)
3126 {
3127 spin_lock_bh(&sk->sk_lock.slock);
3128 if (sk->sk_backlog.tail)
3129 __release_sock(sk);
3130
3131 /* Warning : release_cb() might need to release sk ownership,
3132 * i.e. call sock_release_ownership(sk) before us.
3133 */
3134 if (sk->sk_prot->release_cb)
3135 sk->sk_prot->release_cb(sk);
3136
3137 sock_release_ownership(sk);
3138 if (waitqueue_active(&sk->sk_lock.wq))
3139 wake_up(&sk->sk_lock.wq);
3140 spin_unlock_bh(&sk->sk_lock.slock);
3141 }
3142 EXPORT_SYMBOL(release_sock);
3143
3144 /**
3145 * lock_sock_fast - fast version of lock_sock
3146 * @sk: socket
3147 *
3148 * This version should be used for very small sections, where the process won't block.
3149 * Return false if the fast path is taken:
3150 *
3151 * sk_lock.slock locked, owned = 0, BH disabled
3152 *
3153 * Return true if the slow path is taken:
3154 *
3155 * sk_lock.slock unlocked, owned = 1, BH enabled
3156 */
3157 bool lock_sock_fast(struct sock *sk)
3158 {
3159 might_sleep();
3160 spin_lock_bh(&sk->sk_lock.slock);
3161
3162 if (!sk->sk_lock.owned)
3163 /*
3164 * Note : We must disable BH
3165 */
3166 return false;
3167
3168 __lock_sock(sk);
3169 sk->sk_lock.owned = 1;
3170 spin_unlock(&sk->sk_lock.slock);
3171 /*
3172 * The sk_lock has mutex_lock() semantics here:
3173 */
3174 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3175 local_bh_enable();
3176 return true;
3177 }
3178 EXPORT_SYMBOL(lock_sock_fast);
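/* Typical usage (sketch): keep the critical section tiny and pair with
 * unlock_sock_fast(), which undoes whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *	...
 *	unlock_sock_fast(sk, slow);
 */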
3179
3180 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3181 bool timeval, bool time32)
3182 {
3183 struct sock *sk = sock->sk;
3184 struct timespec64 ts;
3185
3186 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3187 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3188 if (ts.tv_sec == -1)
3189 return -ENOENT;
3190 if (ts.tv_sec == 0) {
3191 ktime_t kt = ktime_get_real();
3192 sock_write_timestamp(sk, kt);
3193 ts = ktime_to_timespec64(kt);
3194 }
3195
3196 if (timeval)
3197 ts.tv_nsec /= 1000;
3198
3199 #ifdef CONFIG_COMPAT_32BIT_TIME
3200 if (time32)
3201 return put_old_timespec32(&ts, userstamp);
3202 #endif
3203 #ifdef CONFIG_SPARC64
3204 /* beware of padding in sparc64 timeval */
3205 if (timeval && !in_compat_syscall()) {
3206 struct __kernel_old_timeval __user tv = {
3207 .tv_sec = ts.tv_sec,
3208 .tv_usec = ts.tv_nsec,
3209 };
3210 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3211 return -EFAULT;
3212 return 0;
3213 }
3214 #endif
3215 return put_timespec64(&ts, userstamp);
3216 }
3217 EXPORT_SYMBOL(sock_gettstamp);
3218
3219 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3220 {
3221 if (!sock_flag(sk, flag)) {
3222 unsigned long previous_flags = sk->sk_flags;
3223
3224 sock_set_flag(sk, flag);
3225 /*
3226 * we just set one of the two flags which require net
3227 * time stamping, but time stamping might have been on
3228 * already because of the other one
3229 */
3230 if (sock_needs_netstamp(sk) &&
3231 !(previous_flags & SK_FLAGS_TIMESTAMP))
3232 net_enable_timestamp();
3233 }
3234 }
3235
3236 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3237 int level, int type)
3238 {
3239 struct sock_exterr_skb *serr;
3240 struct sk_buff *skb;
3241 int copied, err;
3242
3243 err = -EAGAIN;
3244 skb = sock_dequeue_err_skb(sk);
3245 if (skb == NULL)
3246 goto out;
3247
3248 copied = skb->len;
3249 if (copied > len) {
3250 msg->msg_flags |= MSG_TRUNC;
3251 copied = len;
3252 }
3253 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3254 if (err)
3255 goto out_free_skb;
3256
3257 sock_recv_timestamp(msg, sk, skb);
3258
3259 serr = SKB_EXT_ERR(skb);
3260 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3261
3262 msg->msg_flags |= MSG_ERRQUEUE;
3263 err = copied;
3264
3265 out_free_skb:
3266 kfree_skb(skb);
3267 out:
3268 return err;
3269 }
3270 EXPORT_SYMBOL(sock_recv_errqueue);
3271
3272 /*
3273 * Get a socket option on a socket.
3274 *
3275 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3276 * asynchronous errors should be reported by getsockopt. We assume
3277 * this means if you specify SO_ERROR (otherwise what's the point of it).
3278 */
3279 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3280 char __user *optval, int __user *optlen)
3281 {
3282 struct sock *sk = sock->sk;
3283
3284 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3285 }
3286 EXPORT_SYMBOL(sock_common_getsockopt);
3287
3288 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3289 int flags)
3290 {
3291 struct sock *sk = sock->sk;
3292 int addr_len = 0;
3293 int err;
3294
3295 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3296 flags & ~MSG_DONTWAIT, &addr_len);
3297 if (err >= 0)
3298 msg->msg_namelen = addr_len;
3299 return err;
3300 }
3301 EXPORT_SYMBOL(sock_common_recvmsg);
3302
3303 /*
3304 * Set socket options on an inet socket.
3305 */
3306 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3307 sockptr_t optval, unsigned int optlen)
3308 {
3309 struct sock *sk = sock->sk;
3310
3311 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3312 }
3313 EXPORT_SYMBOL(sock_common_setsockopt);
3314
3315 void sk_common_release(struct sock *sk)
3316 {
3317 if (sk->sk_prot->destroy)
3318 sk->sk_prot->destroy(sk);
3319
3320 /*
3321 * Observation: when sk_common_release is called, processes have
3322 * no access to the socket. But the network stack still does.
3323 * Step one, detach it from networking:
3324 *
3325 * A. Remove from hash tables.
3326 */
3327
3328 sk->sk_prot->unhash(sk);
3329
3330 /*
3331 * At this point the socket cannot receive new packets, but it is possible
3332 * that some packets are in flight because some CPU runs the receiver and
3333 * did a hash table lookup before we unhashed the socket. They will reach the
3334 * receive queue and will be purged by the socket destructor.
3335 *
3336 * Also we still have packets pending on the receive queue and probably
3337 * our own packets waiting in device queues. sock_destroy will drain the
3338 * receive queue, but transmitted packets will delay socket destruction
3339 * until the last reference is released.
3340 */
3341
3342 sock_orphan(sk);
3343
3344 xfrm_sk_free_policy(sk);
3345
3346 sk_refcnt_debug_release(sk);
3347
3348 sock_put(sk);
3349 }
3350 EXPORT_SYMBOL(sk_common_release);
3351
3352 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3353 {
3354 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3355
3356 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3357 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3358 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3359 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3360 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3361 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3362 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3363 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3364 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3365 }
3366
3367 #ifdef CONFIG_PROC_FS
3368 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3369 struct prot_inuse {
3370 int val[PROTO_INUSE_NR];
3371 };
3372
3373 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3374
3375 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3376 {
3377 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3378 }
3379 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3380
3381 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3382 {
3383 int cpu, idx = prot->inuse_idx;
3384 int res = 0;
3385
3386 for_each_possible_cpu(cpu)
3387 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3388
3389 return res >= 0 ? res : 0;
3390 }
3391 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3392
3393 static void sock_inuse_add(struct net *net, int val)
3394 {
3395 this_cpu_add(*net->core.sock_inuse, val);
3396 }
3397
3398 int sock_inuse_get(struct net *net)
3399 {
3400 int cpu, res = 0;
3401
3402 for_each_possible_cpu(cpu)
3403 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3404
3405 return res;
3406 }
3407
3408 EXPORT_SYMBOL_GPL(sock_inuse_get);
3409
3410 static int __net_init sock_inuse_init_net(struct net *net)
3411 {
3412 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3413 if (net->core.prot_inuse == NULL)
3414 return -ENOMEM;
3415
3416 net->core.sock_inuse = alloc_percpu(int);
3417 if (net->core.sock_inuse == NULL)
3418 goto out;
3419
3420 return 0;
3421
3422 out:
3423 free_percpu(net->core.prot_inuse);
3424 return -ENOMEM;
3425 }
3426
3427 static void __net_exit sock_inuse_exit_net(struct net *net)
3428 {
3429 free_percpu(net->core.prot_inuse);
3430 free_percpu(net->core.sock_inuse);
3431 }
3432
3433 static struct pernet_operations net_inuse_ops = {
3434 .init = sock_inuse_init_net,
3435 .exit = sock_inuse_exit_net,
3436 };
3437
3438 static __init int net_inuse_init(void)
3439 {
3440 if (register_pernet_subsys(&net_inuse_ops))
3441 panic("Cannot initialize net inuse counters");
3442
3443 return 0;
3444 }
3445
3446 core_initcall(net_inuse_init);
3447
3448 static int assign_proto_idx(struct proto *prot)
3449 {
3450 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3451
3452 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3453 pr_err("PROTO_INUSE_NR exhausted\n");
3454 return -ENOSPC;
3455 }
3456
3457 set_bit(prot->inuse_idx, proto_inuse_idx);
3458 return 0;
3459 }
3460
3461 static void release_proto_idx(struct proto *prot)
3462 {
3463 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3464 clear_bit(prot->inuse_idx, proto_inuse_idx);
3465 }
3466 #else
3467 static inline int assign_proto_idx(struct proto *prot)
3468 {
3469 return 0;
3470 }
3471
3472 static inline void release_proto_idx(struct proto *prot)
3473 {
3474 }
3475
3476 static void sock_inuse_add(struct net *net, int val)
3477 {
3478 }
3479 #endif
3480
3481 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3482 {
3483 if (!twsk_prot)
3484 return;
3485 kfree(twsk_prot->twsk_slab_name);
3486 twsk_prot->twsk_slab_name = NULL;
3487 kmem_cache_destroy(twsk_prot->twsk_slab);
3488 twsk_prot->twsk_slab = NULL;
3489 }
3490
3491 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3492 {
3493 if (!rsk_prot)
3494 return;
3495 kfree(rsk_prot->slab_name);
3496 rsk_prot->slab_name = NULL;
3497 kmem_cache_destroy(rsk_prot->slab);
3498 rsk_prot->slab = NULL;
3499 }
3500
3501 static int req_prot_init(const struct proto *prot)
3502 {
3503 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3504
3505 if (!rsk_prot)
3506 return 0;
3507
3508 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3509 prot->name);
3510 if (!rsk_prot->slab_name)
3511 return -ENOMEM;
3512
3513 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3514 rsk_prot->obj_size, 0,
3515 SLAB_ACCOUNT | prot->slab_flags,
3516 NULL);
3517
3518 if (!rsk_prot->slab) {
3519 pr_crit("%s: Can't create request sock SLAB cache!\n",
3520 prot->name);
3521 return -ENOMEM;
3522 }
3523 return 0;
3524 }
3525
3526 int proto_register(struct proto *prot, int alloc_slab)
3527 {
3528 int ret = -ENOBUFS;
3529
3530 if (alloc_slab) {
3531 prot->slab = kmem_cache_create_usercopy(prot->name,
3532 prot->obj_size, 0,
3533 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3534 prot->slab_flags,
3535 prot->useroffset, prot->usersize,
3536 NULL);
3537
3538 if (prot->slab == NULL) {
3539 pr_crit("%s: Can't create sock SLAB cache!\n",
3540 prot->name);
3541 goto out;
3542 }
3543
3544 if (req_prot_init(prot))
3545 goto out_free_request_sock_slab;
3546
3547 if (prot->twsk_prot != NULL) {
3548 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3549
3550 if (prot->twsk_prot->twsk_slab_name == NULL)
3551 goto out_free_request_sock_slab;
3552
3553 prot->twsk_prot->twsk_slab =
3554 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3555 prot->twsk_prot->twsk_obj_size,
3556 0,
3557 SLAB_ACCOUNT |
3558 prot->slab_flags,
3559 NULL);
3560 if (prot->twsk_prot->twsk_slab == NULL)
3561 goto out_free_timewait_sock_slab;
3562 }
3563 }
3564
3565 mutex_lock(&proto_list_mutex);
3566 ret = assign_proto_idx(prot);
3567 if (ret) {
3568 mutex_unlock(&proto_list_mutex);
3569 goto out_free_timewait_sock_slab;
3570 }
3571 list_add(&prot->node, &proto_list);
3572 mutex_unlock(&proto_list_mutex);
3573 return ret;
3574
3575 out_free_timewait_sock_slab:
3576 if (alloc_slab && prot->twsk_prot)
3577 tw_prot_cleanup(prot->twsk_prot);
3578 out_free_request_sock_slab:
3579 if (alloc_slab) {
3580 req_prot_cleanup(prot->rsk_prot);
3581
3582 kmem_cache_destroy(prot->slab);
3583 prot->slab = NULL;
3584 }
3585 out:
3586 return ret;
3587 }
3588 EXPORT_SYMBOL(proto_register);
3589
3590 void proto_unregister(struct proto *prot)
3591 {
3592 mutex_lock(&proto_list_mutex);
3593 release_proto_idx(prot);
3594 list_del(&prot->node);
3595 mutex_unlock(&proto_list_mutex);
3596
3597 kmem_cache_destroy(prot->slab);
3598 prot->slab = NULL;
3599
3600 req_prot_cleanup(prot->rsk_prot);
3601 tw_prot_cleanup(prot->twsk_prot);
3602 }
3603 EXPORT_SYMBOL(proto_unregister);
3604
3605 int sock_load_diag_module(int family, int protocol)
3606 {
3607 if (!protocol) {
3608 if (!sock_is_registered(family))
3609 return -ENOENT;
3610
3611 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3612 NETLINK_SOCK_DIAG, family);
3613 }
3614
3615 #ifdef CONFIG_INET
3616 if (family == AF_INET &&
3617 protocol != IPPROTO_RAW &&
3618 protocol < MAX_INET_PROTOS &&
3619 !rcu_access_pointer(inet_protos[protocol]))
3620 return -ENOENT;
3621 #endif
3622
3623 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3624 NETLINK_SOCK_DIAG, family, protocol);
3625 }
3626 EXPORT_SYMBOL(sock_load_diag_module);
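/* Example (values assumed from the uapi headers): family == AF_INET (2)
 * with protocol == IPPROTO_TCP (6) requests "net-pf-16-proto-4-type-2-6",
 * i.e. PF_NETLINK/NETLINK_SOCK_DIAG plus family and protocol, which the
 * tcp_diag module advertises as a module alias.
 */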
3627
3628 #ifdef CONFIG_PROC_FS
3629 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3630 __acquires(proto_list_mutex)
3631 {
3632 mutex_lock(&proto_list_mutex);
3633 return seq_list_start_head(&proto_list, *pos);
3634 }
3635
3636 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3637 {
3638 return seq_list_next(v, &proto_list, pos);
3639 }
3640
3641 static void proto_seq_stop(struct seq_file *seq, void *v)
3642 __releases(proto_list_mutex)
3643 {
3644 mutex_unlock(&proto_list_mutex);
3645 }
3646
3647 static char proto_method_implemented(const void *method)
3648 {
3649 return method == NULL ? 'n' : 'y';
3650 }
3651 static long sock_prot_memory_allocated(struct proto *proto)
3652 {
3653 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3654 }
3655
3656 static const char *sock_prot_memory_pressure(struct proto *proto)
3657 {
3658 return proto->memory_pressure != NULL ?
3659 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3660 }
3661
3662 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3663 {
3664
3665 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3666 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3667 proto->name,
3668 proto->obj_size,
3669 sock_prot_inuse_get(seq_file_net(seq), proto),
3670 sock_prot_memory_allocated(proto),
3671 sock_prot_memory_pressure(proto),
3672 proto->max_header,
3673 proto->slab == NULL ? "no" : "yes",
3674 module_name(proto->owner),
3675 proto_method_implemented(proto->close),
3676 proto_method_implemented(proto->connect),
3677 proto_method_implemented(proto->disconnect),
3678 proto_method_implemented(proto->accept),
3679 proto_method_implemented(proto->ioctl),
3680 proto_method_implemented(proto->init),
3681 proto_method_implemented(proto->destroy),
3682 proto_method_implemented(proto->shutdown),
3683 proto_method_implemented(proto->setsockopt),
3684 proto_method_implemented(proto->getsockopt),
3685 proto_method_implemented(proto->sendmsg),
3686 proto_method_implemented(proto->recvmsg),
3687 proto_method_implemented(proto->sendpage),
3688 proto_method_implemented(proto->bind),
3689 proto_method_implemented(proto->backlog_rcv),
3690 proto_method_implemented(proto->hash),
3691 proto_method_implemented(proto->unhash),
3692 proto_method_implemented(proto->get_port),
3693 proto_method_implemented(proto->enter_memory_pressure));
3694 }
3695
3696 static int proto_seq_show(struct seq_file *seq, void *v)
3697 {
3698 if (v == &proto_list)
3699 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3700 "protocol",
3701 "size",
3702 "sockets",
3703 "memory",
3704 "press",
3705 "maxhdr",
3706 "slab",
3707 "module",
3708 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3709 else
3710 proto_seq_printf(seq, list_entry(v, struct proto, node));
3711 return 0;
3712 }
3713
3714 static const struct seq_operations proto_seq_ops = {
3715 .start = proto_seq_start,
3716 .next = proto_seq_next,
3717 .stop = proto_seq_stop,
3718 .show = proto_seq_show,
3719 };
3720
3721 static __net_init int proto_init_net(struct net *net)
3722 {
3723 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3724 sizeof(struct seq_net_private)))
3725 return -ENOMEM;
3726
3727 return 0;
3728 }
3729
3730 static __net_exit void proto_exit_net(struct net *net)
3731 {
3732 remove_proc_entry("protocols", net->proc_net);
3733 }
3734
3735
3736 static __net_initdata struct pernet_operations proto_net_ops = {
3737 .init = proto_init_net,
3738 .exit = proto_exit_net,
3739 };
3740
3741 static int __init proto_init(void)
3742 {
3743 return register_pernet_subsys(&proto_net_ops);
3744 }
3745
3746 subsys_initcall(proto_init);
3747
3748 #endif /* PROC_FS */
3749
3750 #ifdef CONFIG_NET_RX_BUSY_POLL
3751 bool sk_busy_loop_end(void *p, unsigned long start_time)
3752 {
3753 struct sock *sk = p;
3754
3755 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3756 sk_busy_loop_timeout(sk, start_time);
3757 }
3758 EXPORT_SYMBOL(sk_busy_loop_end);
3759 #endif /* CONFIG_NET_RX_BUSY_POLL */
3760
3761 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3762 {
3763 if (!sk->sk_prot->bind_add)
3764 return -EOPNOTSUPP;
3765 return sk->sk_prot->bind_add(sk, addr, addr_len);
3766 }
3767 EXPORT_SYMBOL(sock_bind_add);
3768