1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by the above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listening socket
32 * has been reached. This won't break
33 * old apps and it avoids a huge number
34 * of sockets being hashed (for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skbs queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lots of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * starting with 0, so that this name space does not intersect
75 * with BSD names.
76 */
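/* Illustrative userspace sketch (not part of this kernel file) of the
 * abstract name space described above: the address begins with a zero
 * byte, is not zero terminated, and its length is carried only in the
 * addrlen argument.  The name "example-abstract" and the helper are
 * made up for illustration.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_abstract_example(int fd)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *		static const char name[] = "example-abstract";
 *		socklen_t len;
 *
 *		sun.sun_path[0] = '\0';		// abstract marker
 *		memcpy(sun.sun_path + 1, name, sizeof(name) - 1);
 *		len = offsetof(struct sockaddr_un, sun_path) +
 *		      1 + (sizeof(name) - 1);
 *
 *		return bind(fd, (struct sockaddr *)&sun, len);
 *	}
 */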
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124
125 /* SMP locking strategy:
126 * hash table is protected with spinlock.
127 * each socket state is protected by separate spinlock.
128 */
129 #ifdef CONFIG_PROVE_LOCKING
130 #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r)))
131
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133 const struct lockdep_map *b)
134 {
135 return cmp_ptr(a, b);
136 }
137
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139 const struct lockdep_map *_b)
140 {
141 const struct unix_sock *a, *b;
142
143 a = container_of(_a, struct unix_sock, lock.dep_map);
144 b = container_of(_b, struct unix_sock, lock.dep_map);
145
146 if (a->sk.sk_state == TCP_LISTEN) {
147 /* unix_stream_connect(): Before the 2nd unix_state_lock(),
148 *
149 * 1. a is TCP_LISTEN.
150 * 2. b is not a.
151 * 3. concurrent connect(b -> a) must fail.
152 *
153 * Except for 2. & 3., the b's state can be any possible
154 * value due to concurrent connect() or listen().
155 *
156 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157 * be expressed as lock_cmp_fn.
158 */
159 switch (b->sk.sk_state) {
160 case TCP_CLOSE:
161 case TCP_ESTABLISHED:
162 case TCP_LISTEN:
163 return -1;
164 default:
165 /* Invalid case. */
166 return 0;
167 }
168 }
169
170 /* Should never happen. Just to be symmetric. */
171 if (b->sk.sk_state == TCP_LISTEN) {
172 switch (a->sk.sk_state) {
173 case TCP_CLOSE:
174 case TCP_ESTABLISHED:
175 return 1;
176 default:
177 return 0;
178 }
179 }
180
181 /* unix_state_double_lock(): ascending address order. */
182 return cmp_ptr(a, b);
183 }
184
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186 const struct lockdep_map *_b)
187 {
188 const struct sock *a, *b;
189
190 a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191 b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192
193 /* unix_collect_skb(): listener -> embryo order. */
194 if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195 return -1;
196
197 /* Should never happen. Just to be symmetric. */
198 if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199 return 1;
200
201 return 0;
202 }
203 #endif
204
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207 unsigned long hash = (unsigned long)sk;
208
209 hash ^= hash >> 16;
210 hash ^= hash >> 8;
211 hash ^= sk->sk_type;
212
213 return hash & UNIX_HASH_MOD;
214 }
215
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218 return i->i_ino & UNIX_HASH_MOD;
219 }
220
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222 int addr_len, int type)
223 {
224 __wsum csum = csum_partial(sunaddr, addr_len, 0);
225 unsigned int hash;
226
227 hash = (__force unsigned int)csum_fold(csum);
228 hash ^= hash >> 8;
229 hash ^= type;
230
231 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
233
234 static void unix_table_double_lock(struct net *net,
235 unsigned int hash1, unsigned int hash2)
236 {
237 if (hash1 == hash2) {
238 spin_lock(&net->unx.table.locks[hash1]);
239 return;
240 }
241
242 if (hash1 > hash2)
243 swap(hash1, hash2);
244
245 spin_lock(&net->unx.table.locks[hash1]);
246 spin_lock(&net->unx.table.locks[hash2]);
247 }
248
249 static void unix_table_double_unlock(struct net *net,
250 unsigned int hash1, unsigned int hash2)
251 {
252 if (hash1 == hash2) {
253 spin_unlock(&net->unx.table.locks[hash1]);
254 return;
255 }
256
257 spin_unlock(&net->unx.table.locks[hash1]);
258 spin_unlock(&net->unx.table.locks[hash2]);
259 }
260
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264 UNIXCB(skb).secid = scm->secid;
265 }
266
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269 scm->secid = UNIXCB(skb).secid;
270 }
271
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274 return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285 return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288
289 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
290 {
291 return unix_peer(osk) == sk;
292 }
293
294 static inline int unix_may_send(struct sock *sk, struct sock *osk)
295 {
296 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
297 }
298
299 static inline int unix_recvq_full_lockless(const struct sock *sk)
300 {
301 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
302 }
303
304 struct sock *unix_peer_get(struct sock *s)
305 {
306 struct sock *peer;
307
308 unix_state_lock(s);
309 peer = unix_peer(s);
310 if (peer)
311 sock_hold(peer);
312 unix_state_unlock(s);
313 return peer;
314 }
315 EXPORT_SYMBOL_GPL(unix_peer_get);
316
317 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
318 int addr_len)
319 {
320 struct unix_address *addr;
321
322 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
323 if (!addr)
324 return NULL;
325
326 refcount_set(&addr->refcnt, 1);
327 addr->len = addr_len;
328 memcpy(addr->name, sunaddr, addr_len);
329
330 return addr;
331 }
332
333 static inline void unix_release_addr(struct unix_address *addr)
334 {
335 if (refcount_dec_and_test(&addr->refcnt))
336 kfree(addr);
337 }
338
339 /*
340 * Check unix socket name:
341 * - it should not be zero length.
342 * - if it does not start with a zero byte, it should be NUL terminated (FS object)
343 * - if it starts with a zero byte, it is an abstract name.
344 */
345
346 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
347 {
348 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
349 addr_len > sizeof(*sunaddr))
350 return -EINVAL;
351
352 if (sunaddr->sun_family != AF_UNIX)
353 return -EINVAL;
354
355 return 0;
356 }
357
358 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
359 {
360 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
361 short offset = offsetof(struct sockaddr_storage, __data);
362
363 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
364
365 /* This may look like an off by one error but it is a bit more
366 * subtle. 108 is the longest valid AF_UNIX path for a binding.
367 * sun_path[108] doesn't as such exist. However in kernel space
368 * we are guaranteed that it is a valid memory location in our
369 * kernel address buffer because syscall functions always pass
370 * a pointer of struct sockaddr_storage which has a bigger buffer
371 * than 108. Also, we must terminate sun_path for strlen() in
372 * getname_kernel().
373 */
374 addr->__data[addr_len - offset] = 0;
375
376 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
377 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
378 * know the actual buffer.
379 */
380 return strlen(addr->__data) + offset + 1;
381 }
382
383 static void __unix_remove_socket(struct sock *sk)
384 {
385 sk_del_node_init(sk);
386 }
387
388 static void __unix_insert_socket(struct net *net, struct sock *sk)
389 {
390 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
391 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
392 }
393
394 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
395 struct unix_address *addr, unsigned int hash)
396 {
397 __unix_remove_socket(sk);
398 smp_store_release(&unix_sk(sk)->addr, addr);
399
400 sk->sk_hash = hash;
401 __unix_insert_socket(net, sk);
402 }
403
404 static void unix_remove_socket(struct net *net, struct sock *sk)
405 {
406 spin_lock(&net->unx.table.locks[sk->sk_hash]);
407 __unix_remove_socket(sk);
408 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
409 }
410
411 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
412 {
413 spin_lock(&net->unx.table.locks[sk->sk_hash]);
414 __unix_insert_socket(net, sk);
415 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
416 }
417
418 static void unix_insert_bsd_socket(struct sock *sk)
419 {
420 spin_lock(&bsd_socket_locks[sk->sk_hash]);
421 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
422 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
423 }
424
425 static void unix_remove_bsd_socket(struct sock *sk)
426 {
427 if (!hlist_unhashed(&sk->sk_bind_node)) {
428 spin_lock(&bsd_socket_locks[sk->sk_hash]);
429 __sk_del_bind_node(sk);
430 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
431
432 sk_node_init(&sk->sk_bind_node);
433 }
434 }
435
436 static struct sock *__unix_find_socket_byname(struct net *net,
437 struct sockaddr_un *sunname,
438 int len, unsigned int hash)
439 {
440 struct sock *s;
441
442 sk_for_each(s, &net->unx.table.buckets[hash]) {
443 struct unix_sock *u = unix_sk(s);
444
445 if (u->addr->len == len &&
446 !memcmp(u->addr->name, sunname, len))
447 return s;
448 }
449 return NULL;
450 }
451
452 static inline struct sock *unix_find_socket_byname(struct net *net,
453 struct sockaddr_un *sunname,
454 int len, unsigned int hash)
455 {
456 struct sock *s;
457
458 spin_lock(&net->unx.table.locks[hash]);
459 s = __unix_find_socket_byname(net, sunname, len, hash);
460 if (s)
461 sock_hold(s);
462 spin_unlock(&net->unx.table.locks[hash]);
463 return s;
464 }
465
466 static struct sock *unix_find_socket_byinode(struct inode *i)
467 {
468 unsigned int hash = unix_bsd_hash(i);
469 struct sock *s;
470
471 spin_lock(&bsd_socket_locks[hash]);
472 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
473 struct dentry *dentry = unix_sk(s)->path.dentry;
474
475 if (dentry && d_backing_inode(dentry) == i) {
476 sock_hold(s);
477 spin_unlock(&bsd_socket_locks[hash]);
478 return s;
479 }
480 }
481 spin_unlock(&bsd_socket_locks[hash]);
482 return NULL;
483 }
484
485 /* Support code for asymmetrically connected dgram sockets
486 *
487 * If a datagram socket is connected to a socket not itself connected
488 * to the first socket (e.g., /dev/log), clients may only enqueue more
489 * messages if the present receive queue of the server socket is not
490 * "too large". This means there's a second writeability condition
491 * poll and sendmsg need to test. The dgram recv code will do a wake
492 * up on the peer_wait wait queue of a socket upon reception of a
493 * datagram which needs to be propagated to sleeping would-be writers
494 * since these might not have sent anything so far. This can't be
495 * accomplished via poll_wait because the lifetime of the server
496 * socket might be less than that of its clients if these break their
497 * association with it or if the server socket is closed while clients
498 * are still connected to it and there's no way to inform "a polling
499 * implementation" that it should let go of a certain wait queue
500 *
501 * In order to propagate a wake up, a wait_queue_entry_t of the client
502 * socket is enqueued on the peer_wait queue of the server socket
503 * whose wake function does a wake_up on the ordinary client socket
504 * wait queue. This connection is established whenever a write (or
505 * poll for write) hit the flow control condition and broken when the
506 * association to the server socket is dissolved or after a wake up
507 * was relayed.
508 */
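/* Illustrative userspace view of the above (not kernel code; the fd is
 * assumed to be an AF_UNIX SOCK_DGRAM socket already connect()ed to a
 * possibly busy receiver, and the helper name is made up): a sender
 * that hits the receiver's queue limit gets EAGAIN and can wait in
 * poll() for writability, which the relay below eventually signals
 * once the receiver drains its queue or goes away.
 *
 *	#include <errno.h>
 *	#include <poll.h>
 *	#include <stddef.h>
 *	#include <sys/socket.h>
 *
 *	static int send_when_writable(int fd, const void *buf, size_t len)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *		for (;;) {
 *			if (send(fd, buf, len, MSG_DONTWAIT) >= 0)
 *				return 0;
 *			if (errno != EAGAIN && errno != EWOULDBLOCK)
 *				return -1;
 *			if (poll(&pfd, 1, -1) < 0)	// woken via peer_wait relay
 *				return -1;
 *		}
 *	}
 */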
509
510 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
511 void *key)
512 {
513 struct unix_sock *u;
514 wait_queue_head_t *u_sleep;
515
516 u = container_of(q, struct unix_sock, peer_wake);
517
518 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
519 q);
520 u->peer_wake.private = NULL;
521
522 /* relaying can only happen while the wq still exists */
523 u_sleep = sk_sleep(&u->sk);
524 if (u_sleep)
525 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
526
527 return 0;
528 }
529
530 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
531 {
532 struct unix_sock *u, *u_other;
533 int rc;
534
535 u = unix_sk(sk);
536 u_other = unix_sk(other);
537 rc = 0;
538 spin_lock(&u_other->peer_wait.lock);
539
540 if (!u->peer_wake.private) {
541 u->peer_wake.private = other;
542 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
543
544 rc = 1;
545 }
546
547 spin_unlock(&u_other->peer_wait.lock);
548 return rc;
549 }
550
551 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
552 struct sock *other)
553 {
554 struct unix_sock *u, *u_other;
555
556 u = unix_sk(sk);
557 u_other = unix_sk(other);
558 spin_lock(&u_other->peer_wait.lock);
559
560 if (u->peer_wake.private == other) {
561 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
562 u->peer_wake.private = NULL;
563 }
564
565 spin_unlock(&u_other->peer_wait.lock);
566 }
567
568 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
569 struct sock *other)
570 {
571 unix_dgram_peer_wake_disconnect(sk, other);
572 wake_up_interruptible_poll(sk_sleep(sk),
573 EPOLLOUT |
574 EPOLLWRNORM |
575 EPOLLWRBAND);
576 }
577
578 /* preconditions:
579 * - unix_peer(sk) == other
580 * - association is stable
581 */
582 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
583 {
584 int connected;
585
586 connected = unix_dgram_peer_wake_connect(sk, other);
587
588 /* If other is SOCK_DEAD, we want to make sure we signal
589 * POLLOUT, such that a subsequent write() can get a
590 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
591 * to other and it is full, we will hang waiting for POLLOUT.
592 */
593 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
594 return 1;
595
596 if (connected)
597 unix_dgram_peer_wake_disconnect(sk, other);
598
599 return 0;
600 }
601
602 static int unix_writable(const struct sock *sk, unsigned char state)
603 {
604 return state != TCP_LISTEN &&
605 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
606 }
607
608 static void unix_write_space(struct sock *sk)
609 {
610 struct socket_wq *wq;
611
612 rcu_read_lock();
613 if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
614 wq = rcu_dereference(sk->sk_wq);
615 if (skwq_has_sleeper(wq))
616 wake_up_interruptible_sync_poll(&wq->wait,
617 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
618 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
619 }
620 rcu_read_unlock();
621 }
622
623 /* When a dgram socket disconnects (or changes its peer), we clear its receive
624 * queue of packets that arrived from the previous peer. First, this allows
625 * flow control based only on wmem_alloc; second, an sk connected to a peer
626 * may receive messages only from that peer. */
627 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
628 {
629 if (!skb_queue_empty(&sk->sk_receive_queue)) {
630 skb_queue_purge(&sk->sk_receive_queue);
631 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
632
633 /* If one link of a bidirectional dgram pipe is disconnected,
634 * we signal an error. Messages are lost. Do not do this
635 * when the peer was not connected to us.
636 */
637 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
638 WRITE_ONCE(other->sk_err, ECONNRESET);
639 sk_error_report(other);
640 }
641 }
642 }
643
644 static void unix_sock_destructor(struct sock *sk)
645 {
646 struct unix_sock *u = unix_sk(sk);
647
648 skb_queue_purge(&sk->sk_receive_queue);
649
650 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
651 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
652 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
653 if (!sock_flag(sk, SOCK_DEAD)) {
654 pr_info("Attempt to release alive unix socket: %p\n", sk);
655 return;
656 }
657
658 if (u->addr)
659 unix_release_addr(u->addr);
660
661 atomic_long_dec(&unix_nr_socks);
662 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
663 #ifdef UNIX_REFCNT_DEBUG
664 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
665 atomic_long_read(&unix_nr_socks));
666 #endif
667 }
668
669 static unsigned int unix_skb_len(const struct sk_buff *skb)
670 {
671 return skb->len - UNIXCB(skb).consumed;
672 }
673
674 static void unix_release_sock(struct sock *sk, int embrion)
675 {
676 struct unix_sock *u = unix_sk(sk);
677 struct sock *skpair;
678 struct sk_buff *skb;
679 struct path path;
680 int state;
681
682 unix_remove_socket(sock_net(sk), sk);
683 unix_remove_bsd_socket(sk);
684
685 /* Clear state */
686 unix_state_lock(sk);
687 sock_orphan(sk);
688 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
689 path = u->path;
690 u->path.dentry = NULL;
691 u->path.mnt = NULL;
692 state = sk->sk_state;
693 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
694
695 skpair = unix_peer(sk);
696 unix_peer(sk) = NULL;
697
698 unix_state_unlock(sk);
699
700 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
701 u->oob_skb = NULL;
702 #endif
703
704 wake_up_interruptible_all(&u->peer_wait);
705
706 if (skpair != NULL) {
707 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
708 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
709
710 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
711 if (skb && !unix_skb_len(skb))
712 skb = skb_peek_next(skb, &sk->sk_receive_queue);
713 #endif
714 unix_state_lock(skpair);
715 /* No more writes */
716 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
717 if (skb || embrion)
718 WRITE_ONCE(skpair->sk_err, ECONNRESET);
719 unix_state_unlock(skpair);
720 skpair->sk_state_change(skpair);
721 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
722 }
723
724 unix_dgram_peer_wake_disconnect(sk, skpair);
725 sock_put(skpair); /* It may now die */
726 }
727
728 /* Try to flush out this socket. Throw out buffers at least */
729
730 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
731 if (state == TCP_LISTEN)
732 unix_release_sock(skb->sk, 1);
733
734 /* passed fds are erased in the kfree_skb hook */
735 kfree_skb(skb);
736 }
737
738 if (path.dentry)
739 path_put(&path);
740
741 sock_put(sk);
742
743 /* ---- Socket is dead now and most probably destroyed ---- */
744
745 /*
746 * Fixme: BSD difference: In BSD all sockets connected to us get
747 * ECONNRESET and we die on the spot. In Linux we behave
748 * like files and pipes do and wait for the last
749 * dereference.
750 *
751 * Can't we simply set sock->err?
752 *
753 * What the above comment does talk about? --ANK(980817)
754 */
755
756 if (READ_ONCE(unix_tot_inflight))
757 unix_gc(); /* Garbage collect fds */
758 }
759
760 static void init_peercred(struct sock *sk)
761 {
762 sk->sk_peer_pid = get_pid(task_tgid(current));
763 sk->sk_peer_cred = get_current_cred();
764 }
765
766 static void update_peercred(struct sock *sk)
767 {
768 const struct cred *old_cred;
769 struct pid *old_pid;
770
771 spin_lock(&sk->sk_peer_lock);
772 old_pid = sk->sk_peer_pid;
773 old_cred = sk->sk_peer_cred;
774 init_peercred(sk);
775 spin_unlock(&sk->sk_peer_lock);
776
777 put_pid(old_pid);
778 put_cred(old_cred);
779 }
780
781 static void copy_peercred(struct sock *sk, struct sock *peersk)
782 {
783 lockdep_assert_held(&unix_sk(peersk)->lock);
784
785 spin_lock(&sk->sk_peer_lock);
786 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
787 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
788 spin_unlock(&sk->sk_peer_lock);
789 }
790
791 static int unix_listen(struct socket *sock, int backlog)
792 {
793 int err;
794 struct sock *sk = sock->sk;
795 struct unix_sock *u = unix_sk(sk);
796
797 err = -EOPNOTSUPP;
798 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
799 goto out; /* Only stream/seqpacket sockets accept */
800 err = -EINVAL;
801 if (!READ_ONCE(u->addr))
802 goto out; /* No listens on an unbound socket */
803 unix_state_lock(sk);
804 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
805 goto out_unlock;
806 if (backlog > sk->sk_max_ack_backlog)
807 wake_up_interruptible_all(&u->peer_wait);
808 sk->sk_max_ack_backlog = backlog;
809 WRITE_ONCE(sk->sk_state, TCP_LISTEN);
810
811 /* set credentials so connect can copy them */
812 update_peercred(sk);
813 err = 0;
814
815 out_unlock:
816 unix_state_unlock(sk);
817 out:
818 return err;
819 }
820
821 static int unix_release(struct socket *);
822 static int unix_bind(struct socket *, struct sockaddr *, int);
823 static int unix_stream_connect(struct socket *, struct sockaddr *,
824 int addr_len, int flags);
825 static int unix_socketpair(struct socket *, struct socket *);
826 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
827 static int unix_getname(struct socket *, struct sockaddr *, int);
828 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
829 static __poll_t unix_dgram_poll(struct file *, struct socket *,
830 poll_table *);
831 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
832 #ifdef CONFIG_COMPAT
833 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
834 #endif
835 static int unix_shutdown(struct socket *, int);
836 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
837 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
838 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
839 struct pipe_inode_info *, size_t size,
840 unsigned int flags);
841 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
842 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
843 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
844 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
845 static int unix_dgram_connect(struct socket *, struct sockaddr *,
846 int, int);
847 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
848 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
849 int);
850
851 #ifdef CONFIG_PROC_FS
852 static int unix_count_nr_fds(struct sock *sk)
853 {
854 struct sk_buff *skb;
855 struct unix_sock *u;
856 int nr_fds = 0;
857
858 spin_lock(&sk->sk_receive_queue.lock);
859 skb = skb_peek(&sk->sk_receive_queue);
860 while (skb) {
861 u = unix_sk(skb->sk);
862 nr_fds += atomic_read(&u->scm_stat.nr_fds);
863 skb = skb_peek_next(skb, &sk->sk_receive_queue);
864 }
865 spin_unlock(&sk->sk_receive_queue.lock);
866
867 return nr_fds;
868 }
869
870 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
871 {
872 struct sock *sk = sock->sk;
873 unsigned char s_state;
874 struct unix_sock *u;
875 int nr_fds = 0;
876
877 if (sk) {
878 s_state = READ_ONCE(sk->sk_state);
879 u = unix_sk(sk);
880
881 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
882 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
883 * SOCK_DGRAM is ordinary. So, no lock is needed.
884 */
885 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
886 nr_fds = atomic_read(&u->scm_stat.nr_fds);
887 else if (s_state == TCP_LISTEN)
888 nr_fds = unix_count_nr_fds(sk);
889
890 seq_printf(m, "scm_fds: %u\n", nr_fds);
891 }
892 }
893 #else
894 #define unix_show_fdinfo NULL
895 #endif
896
897 static const struct proto_ops unix_stream_ops = {
898 .family = PF_UNIX,
899 .owner = THIS_MODULE,
900 .release = unix_release,
901 .bind = unix_bind,
902 .connect = unix_stream_connect,
903 .socketpair = unix_socketpair,
904 .accept = unix_accept,
905 .getname = unix_getname,
906 .poll = unix_poll,
907 .ioctl = unix_ioctl,
908 #ifdef CONFIG_COMPAT
909 .compat_ioctl = unix_compat_ioctl,
910 #endif
911 .listen = unix_listen,
912 .shutdown = unix_shutdown,
913 .sendmsg = unix_stream_sendmsg,
914 .recvmsg = unix_stream_recvmsg,
915 .read_skb = unix_stream_read_skb,
916 .mmap = sock_no_mmap,
917 .splice_read = unix_stream_splice_read,
918 .set_peek_off = sk_set_peek_off,
919 .show_fdinfo = unix_show_fdinfo,
920 };
921
922 static const struct proto_ops unix_dgram_ops = {
923 .family = PF_UNIX,
924 .owner = THIS_MODULE,
925 .release = unix_release,
926 .bind = unix_bind,
927 .connect = unix_dgram_connect,
928 .socketpair = unix_socketpair,
929 .accept = sock_no_accept,
930 .getname = unix_getname,
931 .poll = unix_dgram_poll,
932 .ioctl = unix_ioctl,
933 #ifdef CONFIG_COMPAT
934 .compat_ioctl = unix_compat_ioctl,
935 #endif
936 .listen = sock_no_listen,
937 .shutdown = unix_shutdown,
938 .sendmsg = unix_dgram_sendmsg,
939 .read_skb = unix_read_skb,
940 .recvmsg = unix_dgram_recvmsg,
941 .mmap = sock_no_mmap,
942 .set_peek_off = sk_set_peek_off,
943 .show_fdinfo = unix_show_fdinfo,
944 };
945
946 static const struct proto_ops unix_seqpacket_ops = {
947 .family = PF_UNIX,
948 .owner = THIS_MODULE,
949 .release = unix_release,
950 .bind = unix_bind,
951 .connect = unix_stream_connect,
952 .socketpair = unix_socketpair,
953 .accept = unix_accept,
954 .getname = unix_getname,
955 .poll = unix_dgram_poll,
956 .ioctl = unix_ioctl,
957 #ifdef CONFIG_COMPAT
958 .compat_ioctl = unix_compat_ioctl,
959 #endif
960 .listen = unix_listen,
961 .shutdown = unix_shutdown,
962 .sendmsg = unix_seqpacket_sendmsg,
963 .recvmsg = unix_seqpacket_recvmsg,
964 .mmap = sock_no_mmap,
965 .set_peek_off = sk_set_peek_off,
966 .show_fdinfo = unix_show_fdinfo,
967 };
968
969 static void unix_close(struct sock *sk, long timeout)
970 {
971 /* Nothing to do here, unix socket does not need a ->close().
972 * This is merely for sockmap.
973 */
974 }
975
976 static void unix_unhash(struct sock *sk)
977 {
978 /* Nothing to do here, unix socket does not need a ->unhash().
979 * This is merely for sockmap.
980 */
981 }
982
983 static bool unix_bpf_bypass_getsockopt(int level, int optname)
984 {
985 if (level == SOL_SOCKET) {
986 switch (optname) {
987 case SO_PEERPIDFD:
988 return true;
989 default:
990 return false;
991 }
992 }
993
994 return false;
995 }
996
997 struct proto unix_dgram_proto = {
998 .name = "UNIX",
999 .owner = THIS_MODULE,
1000 .obj_size = sizeof(struct unix_sock),
1001 .close = unix_close,
1002 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
1003 #ifdef CONFIG_BPF_SYSCALL
1004 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
1005 #endif
1006 };
1007
1008 struct proto unix_stream_proto = {
1009 .name = "UNIX-STREAM",
1010 .owner = THIS_MODULE,
1011 .obj_size = sizeof(struct unix_sock),
1012 .close = unix_close,
1013 .unhash = unix_unhash,
1014 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
1015 #ifdef CONFIG_BPF_SYSCALL
1016 .psock_update_sk_prot = unix_stream_bpf_update_proto,
1017 #endif
1018 };
1019
1020 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1021 {
1022 struct unix_sock *u;
1023 struct sock *sk;
1024 int err;
1025
1026 atomic_long_inc(&unix_nr_socks);
1027 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1028 err = -ENFILE;
1029 goto err;
1030 }
1031
1032 if (type == SOCK_STREAM)
1033 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1034 else /*dgram and seqpacket */
1035 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1036
1037 if (!sk) {
1038 err = -ENOMEM;
1039 goto err;
1040 }
1041
1042 sock_init_data(sock, sk);
1043
1044 sk->sk_hash = unix_unbound_hash(sk);
1045 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
1046 sk->sk_write_space = unix_write_space;
1047 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1048 sk->sk_destruct = unix_sock_destructor;
1049 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1050
1051 u = unix_sk(sk);
1052 u->listener = NULL;
1053 u->vertex = NULL;
1054 u->path.dentry = NULL;
1055 u->path.mnt = NULL;
1056 spin_lock_init(&u->lock);
1057 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1058 mutex_init(&u->iolock); /* single task reading lock */
1059 mutex_init(&u->bindlock); /* single task binding lock */
1060 init_waitqueue_head(&u->peer_wait);
1061 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1062 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1063 unix_insert_unbound_socket(net, sk);
1064
1065 sock_prot_inuse_add(net, sk->sk_prot, 1);
1066
1067 return sk;
1068
1069 err:
1070 atomic_long_dec(&unix_nr_socks);
1071 return ERR_PTR(err);
1072 }
1073
1074 static int unix_create(struct net *net, struct socket *sock, int protocol,
1075 int kern)
1076 {
1077 struct sock *sk;
1078
1079 if (protocol && protocol != PF_UNIX)
1080 return -EPROTONOSUPPORT;
1081
1082 sock->state = SS_UNCONNECTED;
1083
1084 switch (sock->type) {
1085 case SOCK_STREAM:
1086 sock->ops = &unix_stream_ops;
1087 break;
1088 /*
1089 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1090 * nothing uses it.
1091 */
1092 case SOCK_RAW:
1093 sock->type = SOCK_DGRAM;
1094 fallthrough;
1095 case SOCK_DGRAM:
1096 sock->ops = &unix_dgram_ops;
1097 break;
1098 case SOCK_SEQPACKET:
1099 sock->ops = &unix_seqpacket_ops;
1100 break;
1101 default:
1102 return -ESOCKTNOSUPPORT;
1103 }
1104
1105 sk = unix_create1(net, sock, kern, sock->type);
1106 if (IS_ERR(sk))
1107 return PTR_ERR(sk);
1108
1109 return 0;
1110 }
1111
1112 static int unix_release(struct socket *sock)
1113 {
1114 struct sock *sk = sock->sk;
1115
1116 if (!sk)
1117 return 0;
1118
1119 sk->sk_prot->close(sk, 0);
1120 unix_release_sock(sk, 0);
1121 sock->sk = NULL;
1122
1123 return 0;
1124 }
1125
1126 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1127 int type)
1128 {
1129 struct inode *inode;
1130 struct path path;
1131 struct sock *sk;
1132 int err;
1133
1134 unix_mkname_bsd(sunaddr, addr_len);
1135 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1136 if (err)
1137 goto fail;
1138
1139 err = path_permission(&path, MAY_WRITE);
1140 if (err)
1141 goto path_put;
1142
1143 err = -ECONNREFUSED;
1144 inode = d_backing_inode(path.dentry);
1145 if (!S_ISSOCK(inode->i_mode))
1146 goto path_put;
1147
1148 sk = unix_find_socket_byinode(inode);
1149 if (!sk)
1150 goto path_put;
1151
1152 err = -EPROTOTYPE;
1153 if (sk->sk_type == type)
1154 touch_atime(&path);
1155 else
1156 goto sock_put;
1157
1158 path_put(&path);
1159
1160 return sk;
1161
1162 sock_put:
1163 sock_put(sk);
1164 path_put:
1165 path_put(&path);
1166 fail:
1167 return ERR_PTR(err);
1168 }
1169
1170 static struct sock *unix_find_abstract(struct net *net,
1171 struct sockaddr_un *sunaddr,
1172 int addr_len, int type)
1173 {
1174 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1175 struct dentry *dentry;
1176 struct sock *sk;
1177
1178 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1179 if (!sk)
1180 return ERR_PTR(-ECONNREFUSED);
1181
1182 dentry = unix_sk(sk)->path.dentry;
1183 if (dentry)
1184 touch_atime(&unix_sk(sk)->path);
1185
1186 return sk;
1187 }
1188
1189 static struct sock *unix_find_other(struct net *net,
1190 struct sockaddr_un *sunaddr,
1191 int addr_len, int type)
1192 {
1193 struct sock *sk;
1194
1195 if (sunaddr->sun_path[0])
1196 sk = unix_find_bsd(sunaddr, addr_len, type);
1197 else
1198 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1199
1200 return sk;
1201 }
1202
1203 static int unix_autobind(struct sock *sk)
1204 {
1205 struct unix_sock *u = unix_sk(sk);
1206 unsigned int new_hash, old_hash;
1207 struct net *net = sock_net(sk);
1208 struct unix_address *addr;
1209 u32 lastnum, ordernum;
1210 int err;
1211
1212 err = mutex_lock_interruptible(&u->bindlock);
1213 if (err)
1214 return err;
1215
1216 if (u->addr)
1217 goto out;
1218
1219 err = -ENOMEM;
1220 addr = kzalloc(sizeof(*addr) +
1221 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1222 if (!addr)
1223 goto out;
1224
1225 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1226 addr->name->sun_family = AF_UNIX;
1227 refcount_set(&addr->refcnt, 1);
1228
1229 old_hash = sk->sk_hash;
1230 ordernum = get_random_u32();
1231 lastnum = ordernum & 0xFFFFF;
1232 retry:
1233 ordernum = (ordernum + 1) & 0xFFFFF;
1234 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1235
1236 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1237 unix_table_double_lock(net, old_hash, new_hash);
1238
1239 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1240 unix_table_double_unlock(net, old_hash, new_hash);
1241
1242 /* __unix_find_socket_byname() may take a long time if many names
1243 * are already in use.
1244 */
1245 cond_resched();
1246
1247 if (ordernum == lastnum) {
1248 /* Give up if all names seem to be in use. */
1249 err = -ENOSPC;
1250 unix_release_addr(addr);
1251 goto out;
1252 }
1253
1254 goto retry;
1255 }
1256
1257 __unix_set_addr_hash(net, sk, addr, new_hash);
1258 unix_table_double_unlock(net, old_hash, new_hash);
1259 err = 0;
1260
1261 out: mutex_unlock(&u->bindlock);
1262 return err;
1263 }
1264
1265 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1266 int addr_len)
1267 {
1268 umode_t mode = S_IFSOCK |
1269 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1270 struct unix_sock *u = unix_sk(sk);
1271 unsigned int new_hash, old_hash;
1272 struct net *net = sock_net(sk);
1273 struct mnt_idmap *idmap;
1274 struct unix_address *addr;
1275 struct dentry *dentry;
1276 struct path parent;
1277 int err;
1278
1279 addr_len = unix_mkname_bsd(sunaddr, addr_len);
1280 addr = unix_create_addr(sunaddr, addr_len);
1281 if (!addr)
1282 return -ENOMEM;
1283
1284 /*
1285 * Get the parent directory, calculate the hash for last
1286 * component.
1287 */
1288 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1289 if (IS_ERR(dentry)) {
1290 err = PTR_ERR(dentry);
1291 goto out;
1292 }
1293
1294 /*
1295 * All right, let's create it.
1296 */
1297 idmap = mnt_idmap(parent.mnt);
1298 err = security_path_mknod(&parent, dentry, mode, 0);
1299 if (!err)
1300 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1301 if (err)
1302 goto out_path;
1303 err = mutex_lock_interruptible(&u->bindlock);
1304 if (err)
1305 goto out_unlink;
1306 if (u->addr)
1307 goto out_unlock;
1308
1309 old_hash = sk->sk_hash;
1310 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1311 unix_table_double_lock(net, old_hash, new_hash);
1312 u->path.mnt = mntget(parent.mnt);
1313 u->path.dentry = dget(dentry);
1314 __unix_set_addr_hash(net, sk, addr, new_hash);
1315 unix_table_double_unlock(net, old_hash, new_hash);
1316 unix_insert_bsd_socket(sk);
1317 mutex_unlock(&u->bindlock);
1318 done_path_create(&parent, dentry);
1319 return 0;
1320
1321 out_unlock:
1322 mutex_unlock(&u->bindlock);
1323 err = -EINVAL;
1324 out_unlink:
1325 /* failed after successful mknod? unlink what we'd created... */
1326 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1327 out_path:
1328 done_path_create(&parent, dentry);
1329 out:
1330 unix_release_addr(addr);
1331 return err == -EEXIST ? -EADDRINUSE : err;
1332 }
1333
1334 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1335 int addr_len)
1336 {
1337 struct unix_sock *u = unix_sk(sk);
1338 unsigned int new_hash, old_hash;
1339 struct net *net = sock_net(sk);
1340 struct unix_address *addr;
1341 int err;
1342
1343 addr = unix_create_addr(sunaddr, addr_len);
1344 if (!addr)
1345 return -ENOMEM;
1346
1347 err = mutex_lock_interruptible(&u->bindlock);
1348 if (err)
1349 goto out;
1350
1351 if (u->addr) {
1352 err = -EINVAL;
1353 goto out_mutex;
1354 }
1355
1356 old_hash = sk->sk_hash;
1357 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1358 unix_table_double_lock(net, old_hash, new_hash);
1359
1360 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1361 goto out_spin;
1362
1363 __unix_set_addr_hash(net, sk, addr, new_hash);
1364 unix_table_double_unlock(net, old_hash, new_hash);
1365 mutex_unlock(&u->bindlock);
1366 return 0;
1367
1368 out_spin:
1369 unix_table_double_unlock(net, old_hash, new_hash);
1370 err = -EADDRINUSE;
1371 out_mutex:
1372 mutex_unlock(&u->bindlock);
1373 out:
1374 unix_release_addr(addr);
1375 return err;
1376 }
1377
1378 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1379 {
1380 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1381 struct sock *sk = sock->sk;
1382 int err;
1383
1384 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1385 sunaddr->sun_family == AF_UNIX)
1386 return unix_autobind(sk);
1387
1388 err = unix_validate_addr(sunaddr, addr_len);
1389 if (err)
1390 return err;
1391
1392 if (sunaddr->sun_path[0])
1393 err = unix_bind_bsd(sk, sunaddr, addr_len);
1394 else
1395 err = unix_bind_abstract(sk, sunaddr, addr_len);
1396
1397 return err;
1398 }
1399
1400 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1401 {
1402 if (unlikely(sk1 == sk2) || !sk2) {
1403 unix_state_lock(sk1);
1404 return;
1405 }
1406
1407 if (sk1 > sk2)
1408 swap(sk1, sk2);
1409
1410 unix_state_lock(sk1);
1411 unix_state_lock(sk2);
1412 }
1413
1414 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1415 {
1416 if (unlikely(sk1 == sk2) || !sk2) {
1417 unix_state_unlock(sk1);
1418 return;
1419 }
1420 unix_state_unlock(sk1);
1421 unix_state_unlock(sk2);
1422 }
1423
1424 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1425 int alen, int flags)
1426 {
1427 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1428 struct sock *sk = sock->sk;
1429 struct sock *other;
1430 int err;
1431
1432 err = -EINVAL;
1433 if (alen < offsetofend(struct sockaddr, sa_family))
1434 goto out;
1435
1436 if (addr->sa_family != AF_UNSPEC) {
1437 err = unix_validate_addr(sunaddr, alen);
1438 if (err)
1439 goto out;
1440
1441 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1442 if (err)
1443 goto out;
1444
1445 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1446 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1447 !READ_ONCE(unix_sk(sk)->addr)) {
1448 err = unix_autobind(sk);
1449 if (err)
1450 goto out;
1451 }
1452
1453 restart:
1454 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1455 if (IS_ERR(other)) {
1456 err = PTR_ERR(other);
1457 goto out;
1458 }
1459
1460 unix_state_double_lock(sk, other);
1461
1462 /* Apparently VFS overslept socket death. Retry. */
1463 if (sock_flag(other, SOCK_DEAD)) {
1464 unix_state_double_unlock(sk, other);
1465 sock_put(other);
1466 goto restart;
1467 }
1468
1469 err = -EPERM;
1470 if (!unix_may_send(sk, other))
1471 goto out_unlock;
1472
1473 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1474 if (err)
1475 goto out_unlock;
1476
1477 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1478 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1479 } else {
1480 /*
1481 * 1003.1g breaking connected state with AF_UNSPEC
1482 */
1483 other = NULL;
1484 unix_state_double_lock(sk, other);
1485 }
1486
1487 /*
1488 * If it was connected, reconnect.
1489 */
1490 if (unix_peer(sk)) {
1491 struct sock *old_peer = unix_peer(sk);
1492
1493 unix_peer(sk) = other;
1494 if (!other)
1495 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1496 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1497
1498 unix_state_double_unlock(sk, other);
1499
1500 if (other != old_peer) {
1501 unix_dgram_disconnected(sk, old_peer);
1502
1503 unix_state_lock(old_peer);
1504 if (!unix_peer(old_peer))
1505 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1506 unix_state_unlock(old_peer);
1507 }
1508
1509 sock_put(old_peer);
1510 } else {
1511 unix_peer(sk) = other;
1512 unix_state_double_unlock(sk, other);
1513 }
1514
1515 return 0;
1516
1517 out_unlock:
1518 unix_state_double_unlock(sk, other);
1519 sock_put(other);
1520 out:
1521 return err;
1522 }
1523
1524 static long unix_wait_for_peer(struct sock *other, long timeo)
1525 __releases(&unix_sk(other)->lock)
1526 {
1527 struct unix_sock *u = unix_sk(other);
1528 int sched;
1529 DEFINE_WAIT(wait);
1530
1531 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1532
1533 sched = !sock_flag(other, SOCK_DEAD) &&
1534 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1535 unix_recvq_full_lockless(other);
1536
1537 unix_state_unlock(other);
1538
1539 if (sched)
1540 timeo = schedule_timeout(timeo);
1541
1542 finish_wait(&u->peer_wait, &wait);
1543 return timeo;
1544 }
1545
1546 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1547 int addr_len, int flags)
1548 {
1549 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1550 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1551 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1552 struct net *net = sock_net(sk);
1553 struct sk_buff *skb = NULL;
1554 unsigned char state;
1555 long timeo;
1556 int err;
1557
1558 err = unix_validate_addr(sunaddr, addr_len);
1559 if (err)
1560 goto out;
1561
1562 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1563 if (err)
1564 goto out;
1565
1566 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1567 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1568 !READ_ONCE(u->addr)) {
1569 err = unix_autobind(sk);
1570 if (err)
1571 goto out;
1572 }
1573
1574 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1575
1576 /* First of all allocate resources.
1577 If we do it after the state is locked,
1578 we will have to recheck everything again in any case.
1579 */
1580
1581 /* create new sock for complete connection */
1582 newsk = unix_create1(net, NULL, 0, sock->type);
1583 if (IS_ERR(newsk)) {
1584 err = PTR_ERR(newsk);
1585 newsk = NULL;
1586 goto out;
1587 }
1588
1589 err = -ENOMEM;
1590
1591 /* Allocate skb for sending to listening sock */
1592 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1593 if (skb == NULL)
1594 goto out;
1595
1596 restart:
1597 /* Find listening sock. */
1598 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1599 if (IS_ERR(other)) {
1600 err = PTR_ERR(other);
1601 other = NULL;
1602 goto out;
1603 }
1604
1605 unix_state_lock(other);
1606
1607 /* Apparently VFS overslept socket death. Retry. */
1608 if (sock_flag(other, SOCK_DEAD)) {
1609 unix_state_unlock(other);
1610 sock_put(other);
1611 goto restart;
1612 }
1613
1614 err = -ECONNREFUSED;
1615 if (other->sk_state != TCP_LISTEN)
1616 goto out_unlock;
1617 if (other->sk_shutdown & RCV_SHUTDOWN)
1618 goto out_unlock;
1619
1620 if (unix_recvq_full_lockless(other)) {
1621 err = -EAGAIN;
1622 if (!timeo)
1623 goto out_unlock;
1624
1625 timeo = unix_wait_for_peer(other, timeo);
1626
1627 err = sock_intr_errno(timeo);
1628 if (signal_pending(current))
1629 goto out;
1630 sock_put(other);
1631 goto restart;
1632 }
1633
1634 /* self connect and simultaneous connect are eliminated
1635 * by rejecting TCP_LISTEN socket to avoid deadlock.
1636 */
1637 state = READ_ONCE(sk->sk_state);
1638 if (unlikely(state != TCP_CLOSE)) {
1639 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1640 goto out_unlock;
1641 }
1642
1643 unix_state_lock(sk);
1644
1645 if (unlikely(sk->sk_state != TCP_CLOSE)) {
1646 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1647 unix_state_unlock(sk);
1648 goto out_unlock;
1649 }
1650
1651 err = security_unix_stream_connect(sk, other, newsk);
1652 if (err) {
1653 unix_state_unlock(sk);
1654 goto out_unlock;
1655 }
1656
1657 /* The way is open! Quickly set all the necessary fields... */
1658
1659 sock_hold(sk);
1660 unix_peer(newsk) = sk;
1661 newsk->sk_state = TCP_ESTABLISHED;
1662 newsk->sk_type = sk->sk_type;
1663 init_peercred(newsk);
1664 newu = unix_sk(newsk);
1665 newu->listener = other;
1666 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1667 otheru = unix_sk(other);
1668
1669 /* copy address information from listening to new sock
1670 *
1671 * The contents of *(otheru->addr) and otheru->path
1672 * are seen fully set up here, since we have found
1673 * otheru in hash under its lock. Insertion into the
1674 * hash chain we'd found it in had been done in an
1675 * earlier critical area protected by the chain's lock,
1676 * the same one where we'd set *(otheru->addr) contents,
1677 * as well as otheru->path and otheru->addr itself.
1678 *
1679 * Using smp_store_release() here to set newu->addr
1680 * is enough to make those stores, as well as stores
1681 * to newu->path visible to anyone who gets newu->addr
1682 * by smp_load_acquire(). IOW, the same warranties
1683 * as for unix_sock instances bound in unix_bind() or
1684 * in unix_autobind().
1685 */
1686 if (otheru->path.dentry) {
1687 path_get(&otheru->path);
1688 newu->path = otheru->path;
1689 }
1690 refcount_inc(&otheru->addr->refcnt);
1691 smp_store_release(&newu->addr, otheru->addr);
1692
1693 /* Set credentials */
1694 copy_peercred(sk, other);
1695
1696 sock->state = SS_CONNECTED;
1697 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1698 sock_hold(newsk);
1699
1700 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1701 unix_peer(sk) = newsk;
1702
1703 unix_state_unlock(sk);
1704
1705 /* take ten and send info to listening sock */
1706 spin_lock(&other->sk_receive_queue.lock);
1707 __skb_queue_tail(&other->sk_receive_queue, skb);
1708 spin_unlock(&other->sk_receive_queue.lock);
1709 unix_state_unlock(other);
1710 other->sk_data_ready(other);
1711 sock_put(other);
1712 return 0;
1713
1714 out_unlock:
1715 if (other)
1716 unix_state_unlock(other);
1717
1718 out:
1719 kfree_skb(skb);
1720 if (newsk)
1721 unix_release_sock(newsk, 0);
1722 if (other)
1723 sock_put(other);
1724 return err;
1725 }
1726
1727 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1728 {
1729 struct sock *ska = socka->sk, *skb = sockb->sk;
1730
1731 /* Join our sockets back to back */
1732 sock_hold(ska);
1733 sock_hold(skb);
1734 unix_peer(ska) = skb;
1735 unix_peer(skb) = ska;
1736 init_peercred(ska);
1737 init_peercred(skb);
1738
1739 ska->sk_state = TCP_ESTABLISHED;
1740 skb->sk_state = TCP_ESTABLISHED;
1741 socka->state = SS_CONNECTED;
1742 sockb->state = SS_CONNECTED;
1743 return 0;
1744 }
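/* Illustrative userspace sketch (not kernel code): because init_peercred()
 * is called on both ends above, either end of a socketpair() can query the
 * peer's credentials with SO_PEERCRED.  The helper name is made up.
 *
 *	#define _GNU_SOURCE
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static int print_peercred_example(void)
 *	{
 *		struct ucred cred;
 *		socklen_t len = sizeof(cred);
 *		int sv[2];
 *
 *		if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv) < 0)
 *			return -1;
 *		if (getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &cred, &len) < 0)
 *			return -1;
 *
 *		printf("peer pid=%d uid=%d gid=%d\n",
 *		       (int)cred.pid, (int)cred.uid, (int)cred.gid);
 *		return 0;
 *	}
 */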
1745
1746 static void unix_sock_inherit_flags(const struct socket *old,
1747 struct socket *new)
1748 {
1749 if (test_bit(SOCK_PASSCRED, &old->flags))
1750 set_bit(SOCK_PASSCRED, &new->flags);
1751 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1752 set_bit(SOCK_PASSPIDFD, &new->flags);
1753 if (test_bit(SOCK_PASSSEC, &old->flags))
1754 set_bit(SOCK_PASSSEC, &new->flags);
1755 }
1756
1757 static int unix_accept(struct socket *sock, struct socket *newsock,
1758 struct proto_accept_arg *arg)
1759 {
1760 struct sock *sk = sock->sk;
1761 struct sk_buff *skb;
1762 struct sock *tsk;
1763
1764 arg->err = -EOPNOTSUPP;
1765 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1766 goto out;
1767
1768 arg->err = -EINVAL;
1769 if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1770 goto out;
1771
1772 /* If socket state is TCP_LISTEN it cannot change (for now...),
1773 * so that no locks are necessary.
1774 */
1775
1776 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1777 &arg->err);
1778 if (!skb) {
1779 /* This means receive shutdown. */
1780 if (arg->err == 0)
1781 arg->err = -EINVAL;
1782 goto out;
1783 }
1784
1785 tsk = skb->sk;
1786 skb_free_datagram(sk, skb);
1787 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1788
1789 /* attach accepted sock to socket */
1790 unix_state_lock(tsk);
1791 unix_update_edges(unix_sk(tsk));
1792 newsock->state = SS_CONNECTED;
1793 unix_sock_inherit_flags(sock, newsock);
1794 sock_graft(tsk, newsock);
1795 unix_state_unlock(tsk);
1796 return 0;
1797
1798 out:
1799 return arg->err;
1800 }
1801
1802
1803 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1804 {
1805 struct sock *sk = sock->sk;
1806 struct unix_address *addr;
1807 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1808 int err = 0;
1809
1810 if (peer) {
1811 sk = unix_peer_get(sk);
1812
1813 err = -ENOTCONN;
1814 if (!sk)
1815 goto out;
1816 err = 0;
1817 } else {
1818 sock_hold(sk);
1819 }
1820
1821 addr = smp_load_acquire(&unix_sk(sk)->addr);
1822 if (!addr) {
1823 sunaddr->sun_family = AF_UNIX;
1824 sunaddr->sun_path[0] = 0;
1825 err = offsetof(struct sockaddr_un, sun_path);
1826 } else {
1827 err = addr->len;
1828 memcpy(sunaddr, addr->name, addr->len);
1829
1830 if (peer)
1831 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1832 CGROUP_UNIX_GETPEERNAME);
1833 else
1834 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1835 CGROUP_UNIX_GETSOCKNAME);
1836 }
1837 sock_put(sk);
1838 out:
1839 return err;
1840 }
1841
1842 /* The "user->unix_inflight" variable is protected by the garbage
1843 * collection lock, and we just read it locklessly here. If you go
1844 * over the limit, there might be a tiny race in actually noticing
1845 * it across threads. Tough.
1846 */
1847 static inline bool too_many_unix_fds(struct task_struct *p)
1848 {
1849 struct user_struct *user = current_user();
1850
1851 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1852 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1853 return false;
1854 }
1855
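/* Move ownership of the file descriptors carried in @scm into the skb
 * control block and register them with the SCM_RIGHTS garbage collector.
 * Fails with -ETOOMANYREFS when the sending user already has too many
 * fds in flight.
 */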
1856 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1857 {
1858 if (too_many_unix_fds(current))
1859 return -ETOOMANYREFS;
1860
1861 UNIXCB(skb).fp = scm->fp;
1862 scm->fp = NULL;
1863
1864 if (unix_prepare_fpl(UNIXCB(skb).fp))
1865 return -ENOMEM;
1866
1867 return 0;
1868 }
1869
1870 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1871 {
1872 scm->fp = UNIXCB(skb).fp;
1873 UNIXCB(skb).fp = NULL;
1874
1875 unix_destroy_fpl(scm->fp);
1876 }
1877
1878 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1879 {
1880 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1881 }
1882
1883 static void unix_destruct_scm(struct sk_buff *skb)
1884 {
1885 struct scm_cookie scm;
1886
1887 memset(&scm, 0, sizeof(scm));
1888 scm.pid = UNIXCB(skb).pid;
1889 if (UNIXCB(skb).fp)
1890 unix_detach_fds(&scm, skb);
1891
1892 /* Alas, it calls VFS */
1893 /* So fscking what? fput() has been SMP-safe since last summer */
1894 scm_destroy(&scm);
1895 sock_wfree(skb);
1896 }
1897
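/* Stash the sender's pid, uid/gid, security data and (optionally) the
 * passed file descriptors in the skb control block; unix_destruct_scm()
 * drops these references when the skb is freed.
 */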
1898 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1899 {
1900 int err = 0;
1901
1902 UNIXCB(skb).pid = get_pid(scm->pid);
1903 UNIXCB(skb).uid = scm->creds.uid;
1904 UNIXCB(skb).gid = scm->creds.gid;
1905 UNIXCB(skb).fp = NULL;
1906 unix_get_secdata(scm, skb);
1907 if (scm->fp && send_fds)
1908 err = unix_attach_fds(scm, skb);
1909
1910 skb->destructor = unix_destruct_scm;
1911 return err;
1912 }
1913
1914 static bool unix_passcred_enabled(const struct socket *sock,
1915 const struct sock *other)
1916 {
1917 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1918 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1919 !other->sk_socket ||
1920 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1921 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1922 }
1923
1924 /*
1925  * Some apps rely on write() giving SCM_CREDENTIALS.
1926  * We include credentials if the source or destination socket
1927  * asserted SOCK_PASSCRED.
1928  */
1929 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1930 const struct sock *other)
1931 {
1932 if (UNIXCB(skb).pid)
1933 return;
1934 if (unix_passcred_enabled(sock, other)) {
1935 UNIXCB(skb).pid = get_pid(task_tgid(current));
1936 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1937 }
1938 }
1939
1940 static bool unix_skb_scm_eq(struct sk_buff *skb,
1941 struct scm_cookie *scm)
1942 {
1943 return UNIXCB(skb).pid == scm->pid &&
1944 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1945 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1946 unix_secdata_eq(scm, skb);
1947 }
1948
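/* Track how many SCM_RIGHTS fds are queued on the receiver and keep the
 * garbage collector's graph of in-flight sockets up to date.
 */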
1949 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1950 {
1951 struct scm_fp_list *fp = UNIXCB(skb).fp;
1952 struct unix_sock *u = unix_sk(sk);
1953
1954 if (unlikely(fp && fp->count)) {
1955 atomic_add(fp->count, &u->scm_stat.nr_fds);
1956 unix_add_edges(fp, u);
1957 }
1958 }
1959
1960 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1961 {
1962 struct scm_fp_list *fp = UNIXCB(skb).fp;
1963 struct unix_sock *u = unix_sk(sk);
1964
1965 if (unlikely(fp && fp->count)) {
1966 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1967 unix_del_edges(fp);
1968 }
1969 }
1970
1971 /*
1972 * Send AF_UNIX data.
1973 */
1974
1975 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1976 size_t len)
1977 {
1978 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1979 struct sock *sk = sock->sk, *other = NULL;
1980 struct unix_sock *u = unix_sk(sk);
1981 struct scm_cookie scm;
1982 struct sk_buff *skb;
1983 int data_len = 0;
1984 int sk_locked;
1985 long timeo;
1986 int err;
1987
1988 err = scm_send(sock, msg, &scm, false);
1989 if (err < 0)
1990 return err;
1991
1992 wait_for_unix_gc(scm.fp);
1993
1994 err = -EOPNOTSUPP;
1995 if (msg->msg_flags&MSG_OOB)
1996 goto out;
1997
1998 if (msg->msg_namelen) {
1999 err = unix_validate_addr(sunaddr, msg->msg_namelen);
2000 if (err)
2001 goto out;
2002
2003 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
2004 msg->msg_name,
2005 &msg->msg_namelen,
2006 NULL);
2007 if (err)
2008 goto out;
2009 } else {
2010 sunaddr = NULL;
2011 err = -ENOTCONN;
2012 other = unix_peer_get(sk);
2013 if (!other)
2014 goto out;
2015 }
2016
2017 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2018 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2019 !READ_ONCE(u->addr)) {
2020 err = unix_autobind(sk);
2021 if (err)
2022 goto out;
2023 }
2024
2025 err = -EMSGSIZE;
2026 if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2027 goto out;
2028
2029 if (len > SKB_MAX_ALLOC) {
2030 data_len = min_t(size_t,
2031 len - SKB_MAX_ALLOC,
2032 MAX_SKB_FRAGS * PAGE_SIZE);
2033 data_len = PAGE_ALIGN(data_len);
2034
2035 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2036 }
2037
2038 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2039 msg->msg_flags & MSG_DONTWAIT, &err,
2040 PAGE_ALLOC_COSTLY_ORDER);
2041 if (skb == NULL)
2042 goto out;
2043
2044 err = unix_scm_to_skb(&scm, skb, true);
2045 if (err < 0)
2046 goto out_free;
2047
2048 skb_put(skb, len - data_len);
2049 skb->data_len = data_len;
2050 skb->len = len;
2051 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2052 if (err)
2053 goto out_free;
2054
2055 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2056
2057 restart:
2058 if (!other) {
2059 err = -ECONNRESET;
2060 if (sunaddr == NULL)
2061 goto out_free;
2062
2063 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2064 sk->sk_type);
2065 if (IS_ERR(other)) {
2066 err = PTR_ERR(other);
2067 other = NULL;
2068 goto out_free;
2069 }
2070 }
2071
2072 if (sk_filter(other, skb) < 0) {
2073 /* Toss the packet but do not return any error to the sender */
2074 err = len;
2075 goto out_free;
2076 }
2077
2078 sk_locked = 0;
2079 unix_state_lock(other);
2080 restart_locked:
2081 err = -EPERM;
2082 if (!unix_may_send(sk, other))
2083 goto out_unlock;
2084
2085 if (unlikely(sock_flag(other, SOCK_DEAD))) {
2086 /*
2087  * Check with 1003.1g - what should a
2088  * datagram error return here?
2089  */
2090 unix_state_unlock(other);
2091 sock_put(other);
2092
2093 if (!sk_locked)
2094 unix_state_lock(sk);
2095
2096 err = 0;
2097 if (sk->sk_type == SOCK_SEQPACKET) {
2098 /* We are here only when racing with unix_release_sock(),
2099  * which is clearing @other. Never change the state to TCP_CLOSE,
2100  * unlike what SOCK_DGRAM wants.
2101  */
2102 unix_state_unlock(sk);
2103 err = -EPIPE;
2104 } else if (unix_peer(sk) == other) {
2105 unix_peer(sk) = NULL;
2106 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2107
2108 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2109 unix_state_unlock(sk);
2110
2111 unix_dgram_disconnected(sk, other);
2112 sock_put(other);
2113 err = -ECONNREFUSED;
2114 } else {
2115 unix_state_unlock(sk);
2116 }
2117
2118 other = NULL;
2119 if (err)
2120 goto out_free;
2121 goto restart;
2122 }
2123
2124 err = -EPIPE;
2125 if (other->sk_shutdown & RCV_SHUTDOWN)
2126 goto out_unlock;
2127
2128 if (sk->sk_type != SOCK_SEQPACKET) {
2129 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2130 if (err)
2131 goto out_unlock;
2132 }
2133
2134 /* other == sk && unix_peer(other) != sk if
2135 * - unix_peer(sk) == NULL, destination address bound to sk
2136 * - unix_peer(sk) == sk by time of get but disconnected before lock
2137 */
2138 if (other != sk &&
2139 unlikely(unix_peer(other) != sk &&
2140 unix_recvq_full_lockless(other))) {
2141 if (timeo) {
2142 timeo = unix_wait_for_peer(other, timeo);
2143
2144 err = sock_intr_errno(timeo);
2145 if (signal_pending(current))
2146 goto out_free;
2147
2148 goto restart;
2149 }
2150
2151 if (!sk_locked) {
2152 unix_state_unlock(other);
2153 unix_state_double_lock(sk, other);
2154 }
2155
2156 if (unix_peer(sk) != other ||
2157 unix_dgram_peer_wake_me(sk, other)) {
2158 err = -EAGAIN;
2159 sk_locked = 1;
2160 goto out_unlock;
2161 }
2162
2163 if (!sk_locked) {
2164 sk_locked = 1;
2165 goto restart_locked;
2166 }
2167 }
2168
2169 if (unlikely(sk_locked))
2170 unix_state_unlock(sk);
2171
2172 if (sock_flag(other, SOCK_RCVTSTAMP))
2173 __net_timestamp(skb);
2174 maybe_add_creds(skb, sock, other);
2175 scm_stat_add(other, skb);
2176 skb_queue_tail(&other->sk_receive_queue, skb);
2177 unix_state_unlock(other);
2178 other->sk_data_ready(other);
2179 sock_put(other);
2180 scm_destroy(&scm);
2181 return len;
2182
2183 out_unlock:
2184 if (sk_locked)
2185 unix_state_unlock(sk);
2186 unix_state_unlock(other);
2187 out_free:
2188 kfree_skb(skb);
2189 out:
2190 if (other)
2191 sock_put(other);
2192 scm_destroy(&scm);
2193 return err;
2194 }
2195
2196 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2197  * bytes, with a minimum of a full page.
2198  */
2199 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2200
2201 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
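/* MSG_OOB on a stream socket: the single out-of-band byte is sent as its
 * own one-byte skb, queued at the tail of the receiver's queue and
 * remembered in oob_skb so the receive path can report or skip it.
 */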
2202 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2203 struct scm_cookie *scm, bool fds_sent)
2204 {
2205 struct unix_sock *ousk = unix_sk(other);
2206 struct sk_buff *skb;
2207 int err = 0;
2208
2209 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2210
2211 if (!skb)
2212 return err;
2213
2214 err = unix_scm_to_skb(scm, skb, !fds_sent);
2215 if (err < 0) {
2216 kfree_skb(skb);
2217 return err;
2218 }
2219 skb_put(skb, 1);
2220 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2221
2222 if (err) {
2223 kfree_skb(skb);
2224 return err;
2225 }
2226
2227 unix_state_lock(other);
2228
2229 if (sock_flag(other, SOCK_DEAD) ||
2230 (other->sk_shutdown & RCV_SHUTDOWN)) {
2231 unix_state_unlock(other);
2232 kfree_skb(skb);
2233 return -EPIPE;
2234 }
2235
2236 maybe_add_creds(skb, sock, other);
2237 scm_stat_add(other, skb);
2238
2239 spin_lock(&other->sk_receive_queue.lock);
2240 WRITE_ONCE(ousk->oob_skb, skb);
2241 __skb_queue_tail(&other->sk_receive_queue, skb);
2242 spin_unlock(&other->sk_receive_queue.lock);
2243
2244 sk_send_sigurg(other);
2245 unix_state_unlock(other);
2246 other->sk_data_ready(other);
2247
2248 return err;
2249 }
2250 #endif
2251
2252 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2253 size_t len)
2254 {
2255 struct sock *sk = sock->sk;
2256 struct sock *other = NULL;
2257 int err, size;
2258 struct sk_buff *skb;
2259 int sent = 0;
2260 struct scm_cookie scm;
2261 bool fds_sent = false;
2262 int data_len;
2263
2264 err = scm_send(sock, msg, &scm, false);
2265 if (err < 0)
2266 return err;
2267
2268 wait_for_unix_gc(scm.fp);
2269
2270 err = -EOPNOTSUPP;
2271 if (msg->msg_flags & MSG_OOB) {
2272 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2273 if (len)
2274 len--;
2275 else
2276 #endif
2277 goto out_err;
2278 }
2279
2280 if (msg->msg_namelen) {
2281 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2282 goto out_err;
2283 } else {
2284 err = -ENOTCONN;
2285 other = unix_peer(sk);
2286 if (!other)
2287 goto out_err;
2288 }
2289
2290 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2291 goto pipe_err;
2292
2293 while (sent < len) {
2294 size = len - sent;
2295
2296 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2297 skb = sock_alloc_send_pskb(sk, 0, 0,
2298 msg->msg_flags & MSG_DONTWAIT,
2299 &err, 0);
2300 } else {
2301 /* Keep two messages in the pipe so it schedules better */
2302 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2303
2304 /* allow fallback to order-0 allocations */
2305 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2306
2307 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2308
2309 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2310
2311 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2312 msg->msg_flags & MSG_DONTWAIT, &err,
2313 get_order(UNIX_SKB_FRAGS_SZ));
2314 }
2315 if (!skb)
2316 goto out_err;
2317
2318 /* Only send the fds in the first buffer */
2319 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2320 if (err < 0) {
2321 kfree_skb(skb);
2322 goto out_err;
2323 }
2324 fds_sent = true;
2325
2326 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2327 skb->ip_summed = CHECKSUM_UNNECESSARY;
2328 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2329 sk->sk_allocation);
2330 if (err < 0) {
2331 kfree_skb(skb);
2332 goto out_err;
2333 }
2334 size = err;
2335 refcount_add(size, &sk->sk_wmem_alloc);
2336 } else {
2337 skb_put(skb, size - data_len);
2338 skb->data_len = data_len;
2339 skb->len = size;
2340 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2341 if (err) {
2342 kfree_skb(skb);
2343 goto out_err;
2344 }
2345 }
2346
2347 unix_state_lock(other);
2348
2349 if (sock_flag(other, SOCK_DEAD) ||
2350 (other->sk_shutdown & RCV_SHUTDOWN))
2351 goto pipe_err_free;
2352
2353 maybe_add_creds(skb, sock, other);
2354 scm_stat_add(other, skb);
2355 skb_queue_tail(&other->sk_receive_queue, skb);
2356 unix_state_unlock(other);
2357 other->sk_data_ready(other);
2358 sent += size;
2359 }
2360
2361 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2362 if (msg->msg_flags & MSG_OOB) {
2363 err = queue_oob(sock, msg, other, &scm, fds_sent);
2364 if (err)
2365 goto out_err;
2366 sent++;
2367 }
2368 #endif
2369
2370 scm_destroy(&scm);
2371
2372 return sent;
2373
2374 pipe_err_free:
2375 unix_state_unlock(other);
2376 kfree_skb(skb);
2377 pipe_err:
2378 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2379 send_sig(SIGPIPE, current, 0);
2380 err = -EPIPE;
2381 out_err:
2382 scm_destroy(&scm);
2383 return sent ? : err;
2384 }
2385
2386 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2387 size_t len)
2388 {
2389 int err;
2390 struct sock *sk = sock->sk;
2391
2392 err = sock_error(sk);
2393 if (err)
2394 return err;
2395
2396 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2397 return -ENOTCONN;
2398
2399 if (msg->msg_namelen)
2400 msg->msg_namelen = 0;
2401
2402 return unix_dgram_sendmsg(sock, msg, len);
2403 }
2404
2405 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2406 size_t size, int flags)
2407 {
2408 struct sock *sk = sock->sk;
2409
2410 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2411 return -ENOTCONN;
2412
2413 return unix_dgram_recvmsg(sock, msg, size, flags);
2414 }
2415
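/* Copy the sender's bound address (if any) into msg->msg_name. */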
2416 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2417 {
2418 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2419
2420 if (addr) {
2421 msg->msg_namelen = addr->len;
2422 memcpy(msg->msg_name, addr->name, addr->len);
2423 }
2424 }
2425
2426 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2427 int flags)
2428 {
2429 struct scm_cookie scm;
2430 struct socket *sock = sk->sk_socket;
2431 struct unix_sock *u = unix_sk(sk);
2432 struct sk_buff *skb, *last;
2433 long timeo;
2434 int skip;
2435 int err;
2436
2437 err = -EOPNOTSUPP;
2438 if (flags&MSG_OOB)
2439 goto out;
2440
2441 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2442
2443 do {
2444 mutex_lock(&u->iolock);
2445
2446 skip = sk_peek_offset(sk, flags);
2447 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2448 &skip, &err, &last);
2449 if (skb) {
2450 if (!(flags & MSG_PEEK))
2451 scm_stat_del(sk, skb);
2452 break;
2453 }
2454
2455 mutex_unlock(&u->iolock);
2456
2457 if (err != -EAGAIN)
2458 break;
2459 } while (timeo &&
2460 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2461 &err, &timeo, last));
2462
2463 if (!skb) { /* implies iolock unlocked */
2464 unix_state_lock(sk);
2465 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2466 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2467 (sk->sk_shutdown & RCV_SHUTDOWN))
2468 err = 0;
2469 unix_state_unlock(sk);
2470 goto out;
2471 }
2472
2473 if (wq_has_sleeper(&u->peer_wait))
2474 wake_up_interruptible_sync_poll(&u->peer_wait,
2475 EPOLLOUT | EPOLLWRNORM |
2476 EPOLLWRBAND);
2477
2478 if (msg->msg_name) {
2479 unix_copy_addr(msg, skb->sk);
2480
2481 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2482 msg->msg_name,
2483 &msg->msg_namelen);
2484 }
2485
2486 if (size > skb->len - skip)
2487 size = skb->len - skip;
2488 else if (size < skb->len - skip)
2489 msg->msg_flags |= MSG_TRUNC;
2490
2491 err = skb_copy_datagram_msg(skb, skip, msg, size);
2492 if (err)
2493 goto out_free;
2494
2495 if (sock_flag(sk, SOCK_RCVTSTAMP))
2496 __sock_recv_timestamp(msg, sk, skb);
2497
2498 memset(&scm, 0, sizeof(scm));
2499
2500 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2501 unix_set_secdata(&scm, skb);
2502
2503 if (!(flags & MSG_PEEK)) {
2504 if (UNIXCB(skb).fp)
2505 unix_detach_fds(&scm, skb);
2506
2507 sk_peek_offset_bwd(sk, skb->len);
2508 } else {
2509 /* It is questionable: on PEEK we could:
2510    - not return fds - good, but too simple 8)
2511    - return fds, and not return them on read (old strategy,
2512      apparently wrong)
2513    - clone fds (chosen for now, as it is the most universal
2514      solution)
2515 
2516    POSIX 1003.1g does not actually define this clearly
2517    at all. Then again, POSIX 1003.1g doesn't define a lot of
2518    things clearly!
2519 
2520 */
2521
2522 sk_peek_offset_fwd(sk, size);
2523
2524 if (UNIXCB(skb).fp)
2525 unix_peek_fds(&scm, skb);
2526 }
2527 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2528
2529 scm_recv_unix(sock, msg, &scm, flags);
2530
2531 out_free:
2532 skb_free_datagram(sk, skb);
2533 mutex_unlock(&u->iolock);
2534 out:
2535 return err;
2536 }
2537
2538 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2539 int flags)
2540 {
2541 struct sock *sk = sock->sk;
2542
2543 #ifdef CONFIG_BPF_SYSCALL
2544 const struct proto *prot = READ_ONCE(sk->sk_prot);
2545
2546 if (prot != &unix_dgram_proto)
2547 return prot->recvmsg(sk, msg, size, flags, NULL);
2548 #endif
2549 return __unix_dgram_recvmsg(sk, msg, size, flags);
2550 }
2551
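/* Pull a single datagram off the receive queue without blocking and hand
 * it to the caller's recv_actor (used by BPF sockmap).
 */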
2552 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2553 {
2554 struct unix_sock *u = unix_sk(sk);
2555 struct sk_buff *skb;
2556 int err;
2557
2558 mutex_lock(&u->iolock);
2559 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2560 mutex_unlock(&u->iolock);
2561 if (!skb)
2562 return err;
2563
2564 return recv_actor(sk, skb);
2565 }
2566
2567 /*
2568  * Sleep until more data has arrived. But check for races.
2569 */
2570 static long unix_stream_data_wait(struct sock *sk, long timeo,
2571 struct sk_buff *last, unsigned int last_len,
2572 bool freezable)
2573 {
2574 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2575 struct sk_buff *tail;
2576 DEFINE_WAIT(wait);
2577
2578 unix_state_lock(sk);
2579
2580 for (;;) {
2581 prepare_to_wait(sk_sleep(sk), &wait, state);
2582
2583 tail = skb_peek_tail(&sk->sk_receive_queue);
2584 if (tail != last ||
2585 (tail && tail->len != last_len) ||
2586 sk->sk_err ||
2587 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2588 signal_pending(current) ||
2589 !timeo)
2590 break;
2591
2592 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2593 unix_state_unlock(sk);
2594 timeo = schedule_timeout(timeo);
2595 unix_state_lock(sk);
2596
2597 if (sock_flag(sk, SOCK_DEAD))
2598 break;
2599
2600 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2601 }
2602
2603 finish_wait(sk_sleep(sk), &wait);
2604 unix_state_unlock(sk);
2605 return timeo;
2606 }
2607
2608 struct unix_stream_read_state {
2609 int (*recv_actor)(struct sk_buff *, int, int,
2610 struct unix_stream_read_state *);
2611 struct socket *socket;
2612 struct msghdr *msg;
2613 struct pipe_inode_info *pipe;
2614 size_t size;
2615 int flags;
2616 unsigned int splice_flags;
2617 };
2618
2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2621 {
2622 struct sk_buff *oob_skb, *read_skb = NULL;
2623 struct socket *sock = state->socket;
2624 struct sock *sk = sock->sk;
2625 struct unix_sock *u = unix_sk(sk);
2626 int chunk = 1;
2627
2628 mutex_lock(&u->iolock);
2629 unix_state_lock(sk);
2630 spin_lock(&sk->sk_receive_queue.lock);
2631
2632 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2633 spin_unlock(&sk->sk_receive_queue.lock);
2634 unix_state_unlock(sk);
2635 mutex_unlock(&u->iolock);
2636 return -EINVAL;
2637 }
2638
2639 oob_skb = u->oob_skb;
2640
2641 if (!(state->flags & MSG_PEEK)) {
2642 WRITE_ONCE(u->oob_skb, NULL);
2643
2644 if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
2645 !unix_skb_len(oob_skb->prev)) {
2646 read_skb = oob_skb->prev;
2647 __skb_unlink(read_skb, &sk->sk_receive_queue);
2648 }
2649 }
2650
2651 spin_unlock(&sk->sk_receive_queue.lock);
2652 unix_state_unlock(sk);
2653
2654 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2655
2656 if (!(state->flags & MSG_PEEK))
2657 UNIXCB(oob_skb).consumed += 1;
2658
2659 mutex_unlock(&u->iolock);
2660
2661 consume_skb(read_skb);
2662
2663 if (chunk < 0)
2664 return -EFAULT;
2665
2666 state->msg->msg_flags |= MSG_OOB;
2667 return 1;
2668 }
2669
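/* Decide how the OOB byte interacts with a normal stream read: fully
 * consumed skbs in front of it are unlinked, and the OOB skb itself is
 * skipped or dropped depending on SOCK_URGINLINE and MSG_PEEK.
 */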
2670 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2671 int flags, int copied)
2672 {
2673 struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2674 struct unix_sock *u = unix_sk(sk);
2675
2676 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2677 return skb;
2678
2679 spin_lock(&sk->sk_receive_queue.lock);
2680
2681 if (!unix_skb_len(skb)) {
2682 if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2683 skb = NULL;
2684 } else if (flags & MSG_PEEK) {
2685 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2686 } else {
2687 read_skb = skb;
2688 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2689 __skb_unlink(read_skb, &sk->sk_receive_queue);
2690 }
2691
2692 if (!skb)
2693 goto unlock;
2694 }
2695
2696 if (skb != u->oob_skb)
2697 goto unlock;
2698
2699 if (copied) {
2700 skb = NULL;
2701 } else if (!(flags & MSG_PEEK)) {
2702 WRITE_ONCE(u->oob_skb, NULL);
2703
2704 if (!sock_flag(sk, SOCK_URGINLINE)) {
2705 __skb_unlink(skb, &sk->sk_receive_queue);
2706 unread_skb = skb;
2707 skb = skb_peek(&sk->sk_receive_queue);
2708 }
2709 } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2710 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2711 }
2712
2713 unlock:
2714 spin_unlock(&sk->sk_receive_queue.lock);
2715
2716 consume_skb(read_skb);
2717 kfree_skb(unread_skb);
2718
2719 return skb;
2720 }
2721 #endif
2722
2723 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2724 {
2725 struct unix_sock *u = unix_sk(sk);
2726 struct sk_buff *skb;
2727 int err;
2728
2729 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2730 return -ENOTCONN;
2731
2732 mutex_lock(&u->iolock);
2733 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2734 mutex_unlock(&u->iolock);
2735 if (!skb)
2736 return err;
2737
2738 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2739 if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2740 bool drop = false;
2741
2742 unix_state_lock(sk);
2743
2744 if (sock_flag(sk, SOCK_DEAD)) {
2745 unix_state_unlock(sk);
2746 kfree_skb(skb);
2747 return -ECONNRESET;
2748 }
2749
2750 spin_lock(&sk->sk_receive_queue.lock);
2751 if (likely(skb == u->oob_skb)) {
2752 WRITE_ONCE(u->oob_skb, NULL);
2753 drop = true;
2754 }
2755 spin_unlock(&sk->sk_receive_queue.lock);
2756
2757 unix_state_unlock(sk);
2758
2759 if (drop) {
2760 kfree_skb(skb);
2761 return -EAGAIN;
2762 }
2763 }
2764 #endif
2765
2766 return recv_actor(sk, skb);
2767 }
2768
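/* Common receive loop shared by recvmsg() and splice_read() for stream
 * and seqpacket sockets: walk the receive queue, hand data to
 * state->recv_actor() and take care of credentials, fd passing and
 * MSG_PEEK semantics.
 */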
2769 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2770 bool freezable)
2771 {
2772 struct scm_cookie scm;
2773 struct socket *sock = state->socket;
2774 struct sock *sk = sock->sk;
2775 struct unix_sock *u = unix_sk(sk);
2776 int copied = 0;
2777 int flags = state->flags;
2778 int noblock = flags & MSG_DONTWAIT;
2779 bool check_creds = false;
2780 int target;
2781 int err = 0;
2782 long timeo;
2783 int skip;
2784 size_t size = state->size;
2785 unsigned int last_len;
2786
2787 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2788 err = -EINVAL;
2789 goto out;
2790 }
2791
2792 if (unlikely(flags & MSG_OOB)) {
2793 err = -EOPNOTSUPP;
2794 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2795 err = unix_stream_recv_urg(state);
2796 #endif
2797 goto out;
2798 }
2799
2800 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2801 timeo = sock_rcvtimeo(sk, noblock);
2802
2803 memset(&scm, 0, sizeof(scm));
2804
2805 /* Lock the socket to prevent queue disordering
2806  * while we sleep in memcpy_to_msg().
2807  */
2808 mutex_lock(&u->iolock);
2809
2810 skip = max(sk_peek_offset(sk, flags), 0);
2811
2812 do {
2813 struct sk_buff *skb, *last;
2814 int chunk;
2815
2816 redo:
2817 unix_state_lock(sk);
2818 if (sock_flag(sk, SOCK_DEAD)) {
2819 err = -ECONNRESET;
2820 goto unlock;
2821 }
2822 last = skb = skb_peek(&sk->sk_receive_queue);
2823 last_len = last ? last->len : 0;
2824
2825 again:
2826 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2827 if (skb) {
2828 skb = manage_oob(skb, sk, flags, copied);
2829 if (!skb && copied) {
2830 unix_state_unlock(sk);
2831 break;
2832 }
2833 }
2834 #endif
2835 if (skb == NULL) {
2836 if (copied >= target)
2837 goto unlock;
2838
2839 /*
2840 * POSIX 1003.1g mandates this order.
2841 */
2842
2843 err = sock_error(sk);
2844 if (err)
2845 goto unlock;
2846 if (sk->sk_shutdown & RCV_SHUTDOWN)
2847 goto unlock;
2848
2849 unix_state_unlock(sk);
2850 if (!timeo) {
2851 err = -EAGAIN;
2852 break;
2853 }
2854
2855 mutex_unlock(&u->iolock);
2856
2857 timeo = unix_stream_data_wait(sk, timeo, last,
2858 last_len, freezable);
2859
2860 if (signal_pending(current)) {
2861 err = sock_intr_errno(timeo);
2862 scm_destroy(&scm);
2863 goto out;
2864 }
2865
2866 mutex_lock(&u->iolock);
2867 goto redo;
2868 unlock:
2869 unix_state_unlock(sk);
2870 break;
2871 }
2872
2873 while (skip >= unix_skb_len(skb)) {
2874 skip -= unix_skb_len(skb);
2875 last = skb;
2876 last_len = skb->len;
2877 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2878 if (!skb)
2879 goto again;
2880 }
2881
2882 unix_state_unlock(sk);
2883
2884 if (check_creds) {
2885 /* Never glue messages from different writers */
2886 if (!unix_skb_scm_eq(skb, &scm))
2887 break;
2888 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2889 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2890 /* Copy credentials */
2891 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2892 unix_set_secdata(&scm, skb);
2893 check_creds = true;
2894 }
2895
2896 /* Copy address just once */
2897 if (state->msg && state->msg->msg_name) {
2898 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2899 state->msg->msg_name);
2900 unix_copy_addr(state->msg, skb->sk);
2901
2902 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2903 state->msg->msg_name,
2904 &state->msg->msg_namelen);
2905
2906 sunaddr = NULL;
2907 }
2908
2909 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2910 chunk = state->recv_actor(skb, skip, chunk, state);
2911 if (chunk < 0) {
2912 if (copied == 0)
2913 copied = -EFAULT;
2914 break;
2915 }
2916 copied += chunk;
2917 size -= chunk;
2918
2919 /* Mark read part of skb as used */
2920 if (!(flags & MSG_PEEK)) {
2921 UNIXCB(skb).consumed += chunk;
2922
2923 sk_peek_offset_bwd(sk, chunk);
2924
2925 if (UNIXCB(skb).fp) {
2926 scm_stat_del(sk, skb);
2927 unix_detach_fds(&scm, skb);
2928 }
2929
2930 if (unix_skb_len(skb))
2931 break;
2932
2933 skb_unlink(skb, &sk->sk_receive_queue);
2934 consume_skb(skb);
2935
2936 if (scm.fp)
2937 break;
2938 } else {
2939 /* It is questionable, see note in unix_dgram_recvmsg.
2940 */
2941 if (UNIXCB(skb).fp)
2942 unix_peek_fds(&scm, skb);
2943
2944 sk_peek_offset_fwd(sk, chunk);
2945
2946 if (UNIXCB(skb).fp)
2947 break;
2948
2949 skip = 0;
2950 last = skb;
2951 last_len = skb->len;
2952 unix_state_lock(sk);
2953 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2954 if (skb)
2955 goto again;
2956 unix_state_unlock(sk);
2957 break;
2958 }
2959 } while (size);
2960
2961 mutex_unlock(&u->iolock);
2962 if (state->msg)
2963 scm_recv_unix(sock, state->msg, &scm, flags);
2964 else
2965 scm_destroy(&scm);
2966 out:
2967 return copied ? : err;
2968 }
2969
2970 static int unix_stream_read_actor(struct sk_buff *skb,
2971 int skip, int chunk,
2972 struct unix_stream_read_state *state)
2973 {
2974 int ret;
2975
2976 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2977 state->msg, chunk);
2978 return ret ?: chunk;
2979 }
2980
2981 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2982 size_t size, int flags)
2983 {
2984 struct unix_stream_read_state state = {
2985 .recv_actor = unix_stream_read_actor,
2986 .socket = sk->sk_socket,
2987 .msg = msg,
2988 .size = size,
2989 .flags = flags
2990 };
2991
2992 return unix_stream_read_generic(&state, true);
2993 }
2994
2995 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2996 size_t size, int flags)
2997 {
2998 struct unix_stream_read_state state = {
2999 .recv_actor = unix_stream_read_actor,
3000 .socket = sock,
3001 .msg = msg,
3002 .size = size,
3003 .flags = flags
3004 };
3005
3006 #ifdef CONFIG_BPF_SYSCALL
3007 struct sock *sk = sock->sk;
3008 const struct proto *prot = READ_ONCE(sk->sk_prot);
3009
3010 if (prot != &unix_stream_proto)
3011 return prot->recvmsg(sk, msg, size, flags, NULL);
3012 #endif
3013 return unix_stream_read_generic(&state, true);
3014 }
3015
3016 static int unix_stream_splice_actor(struct sk_buff *skb,
3017 int skip, int chunk,
3018 struct unix_stream_read_state *state)
3019 {
3020 return skb_splice_bits(skb, state->socket->sk,
3021 UNIXCB(skb).consumed + skip,
3022 state->pipe, chunk, state->splice_flags);
3023 }
3024
3025 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
3026 struct pipe_inode_info *pipe,
3027 size_t size, unsigned int flags)
3028 {
3029 struct unix_stream_read_state state = {
3030 .recv_actor = unix_stream_splice_actor,
3031 .socket = sock,
3032 .pipe = pipe,
3033 .size = size,
3034 .splice_flags = flags,
3035 };
3036
3037 if (unlikely(*ppos))
3038 return -ESPIPE;
3039
3040 if (sock->file->f_flags & O_NONBLOCK ||
3041 flags & SPLICE_F_NONBLOCK)
3042 state.flags = MSG_DONTWAIT;
3043
3044 return unix_stream_read_generic(&state, false);
3045 }
3046
3047 static int unix_shutdown(struct socket *sock, int mode)
3048 {
3049 struct sock *sk = sock->sk;
3050 struct sock *other;
3051
3052 if (mode < SHUT_RD || mode > SHUT_RDWR)
3053 return -EINVAL;
3054 /* This maps:
3055 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
3056 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
3057 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3058 */
3059 ++mode;
3060
3061 unix_state_lock(sk);
3062 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3063 other = unix_peer(sk);
3064 if (other)
3065 sock_hold(other);
3066 unix_state_unlock(sk);
3067 sk->sk_state_change(sk);
3068
3069 if (other &&
3070 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3071
3072 int peer_mode = 0;
3073 const struct proto *prot = READ_ONCE(other->sk_prot);
3074
3075 if (prot->unhash)
3076 prot->unhash(other);
3077 if (mode&RCV_SHUTDOWN)
3078 peer_mode |= SEND_SHUTDOWN;
3079 if (mode&SEND_SHUTDOWN)
3080 peer_mode |= RCV_SHUTDOWN;
3081 unix_state_lock(other);
3082 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3083 unix_state_unlock(other);
3084 other->sk_state_change(other);
3085 if (peer_mode == SHUTDOWN_MASK)
3086 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3087 else if (peer_mode & RCV_SHUTDOWN)
3088 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3089 }
3090 if (other)
3091 sock_put(other);
3092
3093 return 0;
3094 }
3095
3096 long unix_inq_len(struct sock *sk)
3097 {
3098 struct sk_buff *skb;
3099 long amount = 0;
3100
3101 if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3102 return -EINVAL;
3103
3104 spin_lock(&sk->sk_receive_queue.lock);
3105 if (sk->sk_type == SOCK_STREAM ||
3106 sk->sk_type == SOCK_SEQPACKET) {
3107 skb_queue_walk(&sk->sk_receive_queue, skb)
3108 amount += unix_skb_len(skb);
3109 } else {
3110 skb = skb_peek(&sk->sk_receive_queue);
3111 if (skb)
3112 amount = skb->len;
3113 }
3114 spin_unlock(&sk->sk_receive_queue.lock);
3115
3116 return amount;
3117 }
3118 EXPORT_SYMBOL_GPL(unix_inq_len);
3119
3120 long unix_outq_len(struct sock *sk)
3121 {
3122 return sk_wmem_alloc_get(sk);
3123 }
3124 EXPORT_SYMBOL_GPL(unix_outq_len);
3125
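/* SIOCUNIXFILE: open an O_PATH file descriptor referring to the
 * filesystem object this socket is bound to.  Requires CAP_NET_ADMIN in
 * the socket's network namespace.
 */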
3126 static int unix_open_file(struct sock *sk)
3127 {
3128 struct path path;
3129 struct file *f;
3130 int fd;
3131
3132 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3133 return -EPERM;
3134
3135 if (!smp_load_acquire(&unix_sk(sk)->addr))
3136 return -ENOENT;
3137
3138 path = unix_sk(sk)->path;
3139 if (!path.dentry)
3140 return -ENOENT;
3141
3142 path_get(&path);
3143
3144 fd = get_unused_fd_flags(O_CLOEXEC);
3145 if (fd < 0)
3146 goto out;
3147
3148 f = dentry_open(&path, O_PATH, current_cred());
3149 if (IS_ERR(f)) {
3150 put_unused_fd(fd);
3151 fd = PTR_ERR(f);
3152 goto out;
3153 }
3154
3155 fd_install(fd, f);
3156 out:
3157 path_put(&path);
3158
3159 return fd;
3160 }
3161
3162 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3163 {
3164 struct sock *sk = sock->sk;
3165 long amount = 0;
3166 int err;
3167
3168 switch (cmd) {
3169 case SIOCOUTQ:
3170 amount = unix_outq_len(sk);
3171 err = put_user(amount, (int __user *)arg);
3172 break;
3173 case SIOCINQ:
3174 amount = unix_inq_len(sk);
3175 if (amount < 0)
3176 err = amount;
3177 else
3178 err = put_user(amount, (int __user *)arg);
3179 break;
3180 case SIOCUNIXFILE:
3181 err = unix_open_file(sk);
3182 break;
3183 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3184 case SIOCATMARK:
3185 {
3186 struct unix_sock *u = unix_sk(sk);
3187 struct sk_buff *skb;
3188 int answ = 0;
3189
3190 mutex_lock(&u->iolock);
3191
3192 skb = skb_peek(&sk->sk_receive_queue);
3193 if (skb) {
3194 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3195 struct sk_buff *next_skb;
3196
3197 next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3198
3199 if (skb == oob_skb ||
3200 (!unix_skb_len(skb) &&
3201 (!oob_skb || next_skb == oob_skb)))
3202 answ = 1;
3203 }
3204
3205 mutex_unlock(&u->iolock);
3206
3207 err = put_user(answ, (int __user *)arg);
3208 }
3209 break;
3210 #endif
3211 default:
3212 err = -ENOIOCTLCMD;
3213 break;
3214 }
3215 return err;
3216 }
3217
3218 #ifdef CONFIG_COMPAT
3219 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3220 {
3221 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3222 }
3223 #endif
3224
3225 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3226 {
3227 struct sock *sk = sock->sk;
3228 unsigned char state;
3229 __poll_t mask;
3230 u8 shutdown;
3231
3232 sock_poll_wait(file, sock, wait);
3233 mask = 0;
3234 shutdown = READ_ONCE(sk->sk_shutdown);
3235 state = READ_ONCE(sk->sk_state);
3236
3237 /* exceptional events? */
3238 if (READ_ONCE(sk->sk_err))
3239 mask |= EPOLLERR;
3240 if (shutdown == SHUTDOWN_MASK)
3241 mask |= EPOLLHUP;
3242 if (shutdown & RCV_SHUTDOWN)
3243 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3244
3245 /* readable? */
3246 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3247 mask |= EPOLLIN | EPOLLRDNORM;
3248 if (sk_is_readable(sk))
3249 mask |= EPOLLIN | EPOLLRDNORM;
3250 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3251 if (READ_ONCE(unix_sk(sk)->oob_skb))
3252 mask |= EPOLLPRI;
3253 #endif
3254
3255 /* Connection-based sockets need to check for termination and startup */
3256 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3257 state == TCP_CLOSE)
3258 mask |= EPOLLHUP;
3259
3260 /*
3261  * We also set writable when the other side has shut down the
3262  * connection. This prevents stuck sockets.
3263  */
3264 if (unix_writable(sk, state))
3265 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3266
3267 return mask;
3268 }
3269
3270 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3271 poll_table *wait)
3272 {
3273 struct sock *sk = sock->sk, *other;
3274 unsigned int writable;
3275 unsigned char state;
3276 __poll_t mask;
3277 u8 shutdown;
3278
3279 sock_poll_wait(file, sock, wait);
3280 mask = 0;
3281 shutdown = READ_ONCE(sk->sk_shutdown);
3282 state = READ_ONCE(sk->sk_state);
3283
3284 /* exceptional events? */
3285 if (READ_ONCE(sk->sk_err) ||
3286 !skb_queue_empty_lockless(&sk->sk_error_queue))
3287 mask |= EPOLLERR |
3288 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3289
3290 if (shutdown & RCV_SHUTDOWN)
3291 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3292 if (shutdown == SHUTDOWN_MASK)
3293 mask |= EPOLLHUP;
3294
3295 /* readable? */
3296 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3297 mask |= EPOLLIN | EPOLLRDNORM;
3298 if (sk_is_readable(sk))
3299 mask |= EPOLLIN | EPOLLRDNORM;
3300
3301 /* Connection-based sockets need to check for termination and startup */
3302 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3303 mask |= EPOLLHUP;
3304
3305 /* No write status requested, avoid expensive OUT tests. */
3306 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3307 return mask;
3308
3309 writable = unix_writable(sk, state);
3310 if (writable) {
3311 unix_state_lock(sk);
3312
3313 other = unix_peer(sk);
3314 if (other && unix_peer(other) != sk &&
3315 unix_recvq_full_lockless(other) &&
3316 unix_dgram_peer_wake_me(sk, other))
3317 writable = 0;
3318
3319 unix_state_unlock(sk);
3320 }
3321
3322 if (writable)
3323 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3324 else
3325 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3326
3327 return mask;
3328 }
3329
3330 #ifdef CONFIG_PROC_FS
3331
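/* The /proc/net/unix iterator encodes a (hash bucket, offset within
 * bucket) pair in the seq_file position so that iteration can resume
 * where it left off.
 */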
3332 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3333
3334 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3335 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3336 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3337
3338 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3339 {
3340 unsigned long offset = get_offset(*pos);
3341 unsigned long bucket = get_bucket(*pos);
3342 unsigned long count = 0;
3343 struct sock *sk;
3344
3345 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3346 sk; sk = sk_next(sk)) {
3347 if (++count == offset)
3348 break;
3349 }
3350
3351 return sk;
3352 }
3353
3354 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3355 {
3356 unsigned long bucket = get_bucket(*pos);
3357 struct net *net = seq_file_net(seq);
3358 struct sock *sk;
3359
3360 while (bucket < UNIX_HASH_SIZE) {
3361 spin_lock(&net->unx.table.locks[bucket]);
3362
3363 sk = unix_from_bucket(seq, pos);
3364 if (sk)
3365 return sk;
3366
3367 spin_unlock(&net->unx.table.locks[bucket]);
3368
3369 *pos = set_bucket_offset(++bucket, 1);
3370 }
3371
3372 return NULL;
3373 }
3374
3375 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3376 loff_t *pos)
3377 {
3378 unsigned long bucket = get_bucket(*pos);
3379
3380 sk = sk_next(sk);
3381 if (sk)
3382 return sk;
3383
3384
3385 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3386
3387 *pos = set_bucket_offset(++bucket, 1);
3388
3389 return unix_get_first(seq, pos);
3390 }
3391
3392 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3393 {
3394 if (!*pos)
3395 return SEQ_START_TOKEN;
3396
3397 return unix_get_first(seq, pos);
3398 }
3399
3400 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3401 {
3402 ++*pos;
3403
3404 if (v == SEQ_START_TOKEN)
3405 return unix_get_first(seq, pos);
3406
3407 return unix_get_next(seq, v, pos);
3408 }
3409
3410 static void unix_seq_stop(struct seq_file *seq, void *v)
3411 {
3412 struct sock *sk = v;
3413
3414 if (sk)
3415 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3416 }
3417
3418 static int unix_seq_show(struct seq_file *seq, void *v)
3419 {
3420
3421 if (v == SEQ_START_TOKEN)
3422 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3423 "Inode Path\n");
3424 else {
3425 struct sock *s = v;
3426 struct unix_sock *u = unix_sk(s);
3427 unix_state_lock(s);
3428
3429 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3430 s,
3431 refcount_read(&s->sk_refcnt),
3432 0,
3433 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3434 s->sk_type,
3435 s->sk_socket ?
3436 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3437 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3438 sock_i_ino(s));
3439
3440 if (u->addr) { // under a hash table lock here
3441 int i, len;
3442 seq_putc(seq, ' ');
3443
3444 i = 0;
3445 len = u->addr->len -
3446 offsetof(struct sockaddr_un, sun_path);
3447 if (u->addr->name->sun_path[0]) {
3448 len--;
3449 } else {
3450 seq_putc(seq, '@');
3451 i++;
3452 }
3453 for ( ; i < len; i++)
3454 seq_putc(seq, u->addr->name->sun_path[i] ?:
3455 '@');
3456 }
3457 unix_state_unlock(s);
3458 seq_putc(seq, '\n');
3459 }
3460
3461 return 0;
3462 }
3463
3464 static const struct seq_operations unix_seq_ops = {
3465 .start = unix_seq_start,
3466 .next = unix_seq_next,
3467 .stop = unix_seq_stop,
3468 .show = unix_seq_show,
3469 };
3470
3471 #ifdef CONFIG_BPF_SYSCALL
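/* State for the BPF unix socket iterator: sockets are batched per hash
 * bucket with references held, so the BPF program can run without the
 * bucket lock being held.
 */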
3472 struct bpf_unix_iter_state {
3473 struct seq_net_private p;
3474 unsigned int cur_sk;
3475 unsigned int end_sk;
3476 unsigned int max_sk;
3477 struct sock **batch;
3478 bool st_bucket_done;
3479 };
3480
3481 struct bpf_iter__unix {
3482 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3483 __bpf_md_ptr(struct unix_sock *, unix_sk);
3484 uid_t uid __aligned(8);
3485 };
3486
3487 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3488 struct unix_sock *unix_sk, uid_t uid)
3489 {
3490 struct bpf_iter__unix ctx;
3491
3492 meta->seq_num--; /* skip SEQ_START_TOKEN */
3493 ctx.meta = meta;
3494 ctx.unix_sk = unix_sk;
3495 ctx.uid = uid;
3496 return bpf_iter_run_prog(prog, &ctx);
3497 }
3498
3499 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3500
3501 {
3502 struct bpf_unix_iter_state *iter = seq->private;
3503 unsigned int expected = 1;
3504 struct sock *sk;
3505
3506 sock_hold(start_sk);
3507 iter->batch[iter->end_sk++] = start_sk;
3508
3509 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3510 if (iter->end_sk < iter->max_sk) {
3511 sock_hold(sk);
3512 iter->batch[iter->end_sk++] = sk;
3513 }
3514
3515 expected++;
3516 }
3517
3518 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3519
3520 return expected;
3521 }
3522
3523 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3524 {
3525 while (iter->cur_sk < iter->end_sk)
3526 sock_put(iter->batch[iter->cur_sk++]);
3527 }
3528
3529 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3530 unsigned int new_batch_sz)
3531 {
3532 struct sock **new_batch;
3533
3534 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3535 GFP_USER | __GFP_NOWARN);
3536 if (!new_batch)
3537 return -ENOMEM;
3538
3539 bpf_iter_unix_put_batch(iter);
3540 kvfree(iter->batch);
3541 iter->batch = new_batch;
3542 iter->max_sk = new_batch_sz;
3543
3544 return 0;
3545 }
3546
3547 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3548 loff_t *pos)
3549 {
3550 struct bpf_unix_iter_state *iter = seq->private;
3551 unsigned int expected;
3552 bool resized = false;
3553 struct sock *sk;
3554
3555 if (iter->st_bucket_done)
3556 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3557
3558 again:
3559 /* Get a new batch */
3560 iter->cur_sk = 0;
3561 iter->end_sk = 0;
3562
3563 sk = unix_get_first(seq, pos);
3564 if (!sk)
3565 return NULL; /* Done */
3566
3567 expected = bpf_iter_unix_hold_batch(seq, sk);
3568
3569 if (iter->end_sk == expected) {
3570 iter->st_bucket_done = true;
3571 return sk;
3572 }
3573
3574 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3575 resized = true;
3576 goto again;
3577 }
3578
3579 return sk;
3580 }
3581
3582 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3583 {
3584 if (!*pos)
3585 return SEQ_START_TOKEN;
3586
3587 /* bpf iter does not support lseek, so it always
3588  * continues from where it was stop()-ped.
3589  */
3590 return bpf_iter_unix_batch(seq, pos);
3591 }
3592
3593 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3594 {
3595 struct bpf_unix_iter_state *iter = seq->private;
3596 struct sock *sk;
3597
3598 /* Whenever seq_next() is called, the iter->cur_sk is
3599 * done with seq_show(), so advance to the next sk in
3600 * the batch.
3601 */
3602 if (iter->cur_sk < iter->end_sk)
3603 sock_put(iter->batch[iter->cur_sk++]);
3604
3605 ++*pos;
3606
3607 if (iter->cur_sk < iter->end_sk)
3608 sk = iter->batch[iter->cur_sk];
3609 else
3610 sk = bpf_iter_unix_batch(seq, pos);
3611
3612 return sk;
3613 }
3614
3615 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3616 {
3617 struct bpf_iter_meta meta;
3618 struct bpf_prog *prog;
3619 struct sock *sk = v;
3620 uid_t uid;
3621 bool slow;
3622 int ret;
3623
3624 if (v == SEQ_START_TOKEN)
3625 return 0;
3626
3627 slow = lock_sock_fast(sk);
3628
3629 if (unlikely(sk_unhashed(sk))) {
3630 ret = SEQ_SKIP;
3631 goto unlock;
3632 }
3633
3634 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3635 meta.seq = seq;
3636 prog = bpf_iter_get_info(&meta, false);
3637 ret = unix_prog_seq_show(prog, &meta, v, uid);
3638 unlock:
3639 unlock_sock_fast(sk, slow);
3640 return ret;
3641 }
3642
3643 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3644 {
3645 struct bpf_unix_iter_state *iter = seq->private;
3646 struct bpf_iter_meta meta;
3647 struct bpf_prog *prog;
3648
3649 if (!v) {
3650 meta.seq = seq;
3651 prog = bpf_iter_get_info(&meta, true);
3652 if (prog)
3653 (void)unix_prog_seq_show(prog, &meta, v, 0);
3654 }
3655
3656 if (iter->cur_sk < iter->end_sk)
3657 bpf_iter_unix_put_batch(iter);
3658 }
3659
3660 static const struct seq_operations bpf_iter_unix_seq_ops = {
3661 .start = bpf_iter_unix_seq_start,
3662 .next = bpf_iter_unix_seq_next,
3663 .stop = bpf_iter_unix_seq_stop,
3664 .show = bpf_iter_unix_seq_show,
3665 };
3666 #endif
3667 #endif
3668
3669 static const struct net_proto_family unix_family_ops = {
3670 .family = PF_UNIX,
3671 .create = unix_create,
3672 .owner = THIS_MODULE,
3673 };
3674
3675
3676 static int __net_init unix_net_init(struct net *net)
3677 {
3678 int i;
3679
3680 net->unx.sysctl_max_dgram_qlen = 10;
3681 if (unix_sysctl_register(net))
3682 goto out;
3683
3684 #ifdef CONFIG_PROC_FS
3685 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3686 sizeof(struct seq_net_private)))
3687 goto err_sysctl;
3688 #endif
3689
3690 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3691 sizeof(spinlock_t), GFP_KERNEL);
3692 if (!net->unx.table.locks)
3693 goto err_proc;
3694
3695 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3696 sizeof(struct hlist_head),
3697 GFP_KERNEL);
3698 if (!net->unx.table.buckets)
3699 goto free_locks;
3700
3701 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3702 spin_lock_init(&net->unx.table.locks[i]);
3703 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3704 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3705 }
3706
3707 return 0;
3708
3709 free_locks:
3710 kvfree(net->unx.table.locks);
3711 err_proc:
3712 #ifdef CONFIG_PROC_FS
3713 remove_proc_entry("unix", net->proc_net);
3714 err_sysctl:
3715 #endif
3716 unix_sysctl_unregister(net);
3717 out:
3718 return -ENOMEM;
3719 }
3720
3721 static void __net_exit unix_net_exit(struct net *net)
3722 {
3723 kvfree(net->unx.table.buckets);
3724 kvfree(net->unx.table.locks);
3725 unix_sysctl_unregister(net);
3726 remove_proc_entry("unix", net->proc_net);
3727 }
3728
3729 static struct pernet_operations unix_net_ops = {
3730 .init = unix_net_init,
3731 .exit = unix_net_exit,
3732 };
3733
3734 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3735 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3736 struct unix_sock *unix_sk, uid_t uid)
3737
3738 #define INIT_BATCH_SZ 16
3739
3740 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3741 {
3742 struct bpf_unix_iter_state *iter = priv_data;
3743 int err;
3744
3745 err = bpf_iter_init_seq_net(priv_data, aux);
3746 if (err)
3747 return err;
3748
3749 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3750 if (err) {
3751 bpf_iter_fini_seq_net(priv_data);
3752 return err;
3753 }
3754
3755 return 0;
3756 }
3757
3758 static void bpf_iter_fini_unix(void *priv_data)
3759 {
3760 struct bpf_unix_iter_state *iter = priv_data;
3761
3762 bpf_iter_fini_seq_net(priv_data);
3763 kvfree(iter->batch);
3764 }
3765
3766 static const struct bpf_iter_seq_info unix_seq_info = {
3767 .seq_ops = &bpf_iter_unix_seq_ops,
3768 .init_seq_private = bpf_iter_init_unix,
3769 .fini_seq_private = bpf_iter_fini_unix,
3770 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3771 };
3772
3773 static const struct bpf_func_proto *
3774 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3775 const struct bpf_prog *prog)
3776 {
3777 switch (func_id) {
3778 case BPF_FUNC_setsockopt:
3779 return &bpf_sk_setsockopt_proto;
3780 case BPF_FUNC_getsockopt:
3781 return &bpf_sk_getsockopt_proto;
3782 default:
3783 return NULL;
3784 }
3785 }
3786
3787 static struct bpf_iter_reg unix_reg_info = {
3788 .target = "unix",
3789 .ctx_arg_info_size = 1,
3790 .ctx_arg_info = {
3791 { offsetof(struct bpf_iter__unix, unix_sk),
3792 PTR_TO_BTF_ID_OR_NULL },
3793 },
3794 .get_func_proto = bpf_iter_unix_get_func_proto,
3795 .seq_info = &unix_seq_info,
3796 };
3797
3798 static void __init bpf_iter_register(void)
3799 {
3800 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3801 if (bpf_iter_reg_target(&unix_reg_info))
3802 pr_warn("Warning: could not register bpf iterator unix\n");
3803 }
3804 #endif
3805
3806 static int __init af_unix_init(void)
3807 {
3808 int i, rc = -1;
3809
3810 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3811
3812 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3813 spin_lock_init(&bsd_socket_locks[i]);
3814 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3815 }
3816
3817 rc = proto_register(&unix_dgram_proto, 1);
3818 if (rc != 0) {
3819 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3820 goto out;
3821 }
3822
3823 rc = proto_register(&unix_stream_proto, 1);
3824 if (rc != 0) {
3825 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3826 proto_unregister(&unix_dgram_proto);
3827 goto out;
3828 }
3829
3830 sock_register(&unix_family_ops);
3831 register_pernet_subsys(&unix_net_ops);
3832 unix_bpf_build_proto();
3833
3834 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3835 bpf_iter_register();
3836 #endif
3837
3838 out:
3839 return rc;
3840 }
3841
3842 /* Later than subsys_initcall() because we depend on stuff initialised there */
3843 fs_initcall(af_unix_init);
3844