1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko Eißfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by the above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid a huge amount
34 * of socks being hashed (this is for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
75 * with BSD names.
76 */
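/* Illustrative sketch (editorial addition, not part of the kernel build):
 * how a userspace program would bind a filesystem name versus an abstract
 * name as described above. The path "/tmp/example.sock" and the abstract
 * name "\0example" are made-up examples; the key point is that an abstract
 * name begins with a NUL byte and its length is passed explicitly rather
 * than relying on NUL termination.
 */
#if 0
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static int bind_example(int fd, bool abstract)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };
	socklen_t len;

	if (abstract) {
		/* "\0example": the leading NUL selects the abstract namespace;
		 * the name is not NUL-terminated, its length is explicit.
		 */
		memcpy(sun.sun_path, "\0example", 8);
		len = offsetof(struct sockaddr_un, sun_path) + 8;
	} else {
		/* Filesystem name: a NUL-terminated path, creates a socket inode. */
		strcpy(sun.sun_path, "/tmp/example.sock");
		len = offsetof(struct sockaddr_un, sun_path) +
		      strlen(sun.sun_path) + 1;
	}

	return bind(fd, (struct sockaddr *)&sun, len);
}
#endif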
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119
120 #include "scm.h"
121
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125
126 /* SMP locking strategy:
127 * hash table is protected with spinlock.
128 * each socket state is protected by separate spinlock.
129 */
130
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 unsigned long hash = (unsigned long)sk;
134
135 hash ^= hash >> 16;
136 hash ^= hash >> 8;
137 hash ^= sk->sk_type;
138
139 return hash & UNIX_HASH_MOD;
140 }
141
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 return i->i_ino & UNIX_HASH_MOD;
145 }
146
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 int addr_len, int type)
149 {
150 __wsum csum = csum_partial(sunaddr, addr_len, 0);
151 unsigned int hash;
152
153 hash = (__force unsigned int)csum_fold(csum);
154 hash ^= hash >> 8;
155 hash ^= type;
156
157 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
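/* Editorial note: the hash helpers above partition the lookup space.
 * Unbound sockets and bound filesystem (BSD) sockets hash into
 * [0, UNIX_HASH_MOD] of the per-netns table, while abstract names hash
 * into the upper half starting at UNIX_HASH_MOD + 1, so abstract and
 * non-abstract entries never share a bucket. BSD sockets are additionally
 * linked into bsd_socket_buckets, keyed by inode, so they can be found by
 * on-disk inode in unix_find_socket_byinode().
 */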
159
160 static void unix_table_double_lock(struct net *net,
161 unsigned int hash1, unsigned int hash2)
162 {
163 if (hash1 == hash2) {
164 spin_lock(&net->unx.table.locks[hash1]);
165 return;
166 }
167
168 if (hash1 > hash2)
169 swap(hash1, hash2);
170
171 spin_lock(&net->unx.table.locks[hash1]);
172 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174
175 static void unix_table_double_unlock(struct net *net,
176 unsigned int hash1, unsigned int hash2)
177 {
178 if (hash1 == hash2) {
179 spin_unlock(&net->unx.table.locks[hash1]);
180 return;
181 }
182
183 spin_unlock(&net->unx.table.locks[hash1]);
184 spin_unlock(&net->unx.table.locks[hash2]);
185 }
186
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 UNIXCB(skb).secid = scm->secid;
191 }
192
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 scm->secid = UNIXCB(skb).secid;
196 }
197
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214
215 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
216 {
217 return unix_peer(osk) == sk;
218 }
219
220 static inline int unix_may_send(struct sock *sk, struct sock *osk)
221 {
222 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
223 }
224
225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229
230 struct sock *unix_peer_get(struct sock *s)
231 {
232 struct sock *peer;
233
234 unix_state_lock(s);
235 peer = unix_peer(s);
236 if (peer)
237 sock_hold(peer);
238 unix_state_unlock(s);
239 return peer;
240 }
241 EXPORT_SYMBOL_GPL(unix_peer_get);
242
243 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
244 int addr_len)
245 {
246 struct unix_address *addr;
247
248 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
249 if (!addr)
250 return NULL;
251
252 refcount_set(&addr->refcnt, 1);
253 addr->len = addr_len;
254 memcpy(addr->name, sunaddr, addr_len);
255
256 return addr;
257 }
258
259 static inline void unix_release_addr(struct unix_address *addr)
260 {
261 if (refcount_dec_and_test(&addr->refcnt))
262 kfree(addr);
263 }
264
265 /*
266 * Check unix socket name:
267 * - should not be zero length.
268 * - if it does not start with a zero byte, it should be NUL-terminated (FS object)
269 * - if it starts with a zero byte, it is an abstract name.
270 */
271
272 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
273 {
274 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
275 addr_len > sizeof(*sunaddr))
276 return -EINVAL;
277
278 if (sunaddr->sun_family != AF_UNIX)
279 return -EINVAL;
280
281 return 0;
282 }
283
284 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
285 {
286 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
287 short offset = offsetof(struct sockaddr_storage, __data);
288
289 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
290
291 /* This may look like an off by one error but it is a bit more
292 * subtle. 108 is the longest valid AF_UNIX path for a binding.
293 * sun_path[108] doesn't as such exist. However in kernel space
294 * we are guaranteed that it is a valid memory location in our
295 * kernel address buffer because syscall functions always pass
296 * a pointer of struct sockaddr_storage which has a bigger buffer
297 * than 108. Also, we must terminate sun_path for strlen() in
298 * getname_kernel().
299 */
300 addr->__data[addr_len - offset] = 0;
301
302 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
303 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
304 * know the actual buffer.
305 */
306 return strlen(addr->__data) + offset + 1;
307 }
308
309 static void __unix_remove_socket(struct sock *sk)
310 {
311 sk_del_node_init(sk);
312 }
313
314 static void __unix_insert_socket(struct net *net, struct sock *sk)
315 {
316 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
317 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
318 }
319
320 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
321 struct unix_address *addr, unsigned int hash)
322 {
323 __unix_remove_socket(sk);
324 smp_store_release(&unix_sk(sk)->addr, addr);
325
326 sk->sk_hash = hash;
327 __unix_insert_socket(net, sk);
328 }
329
330 static void unix_remove_socket(struct net *net, struct sock *sk)
331 {
332 spin_lock(&net->unx.table.locks[sk->sk_hash]);
333 __unix_remove_socket(sk);
334 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
335 }
336
337 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
338 {
339 spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 __unix_insert_socket(net, sk);
341 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343
344 static void unix_insert_bsd_socket(struct sock *sk)
345 {
346 spin_lock(&bsd_socket_locks[sk->sk_hash]);
347 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
348 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
349 }
350
351 static void unix_remove_bsd_socket(struct sock *sk)
352 {
353 if (!hlist_unhashed(&sk->sk_bind_node)) {
354 spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 __sk_del_bind_node(sk);
356 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357
358 sk_node_init(&sk->sk_bind_node);
359 }
360 }
361
362 static struct sock *__unix_find_socket_byname(struct net *net,
363 struct sockaddr_un *sunname,
364 int len, unsigned int hash)
365 {
366 struct sock *s;
367
368 sk_for_each(s, &net->unx.table.buckets[hash]) {
369 struct unix_sock *u = unix_sk(s);
370
371 if (u->addr->len == len &&
372 !memcmp(u->addr->name, sunname, len))
373 return s;
374 }
375 return NULL;
376 }
377
378 static inline struct sock *unix_find_socket_byname(struct net *net,
379 struct sockaddr_un *sunname,
380 int len, unsigned int hash)
381 {
382 struct sock *s;
383
384 spin_lock(&net->unx.table.locks[hash]);
385 s = __unix_find_socket_byname(net, sunname, len, hash);
386 if (s)
387 sock_hold(s);
388 spin_unlock(&net->unx.table.locks[hash]);
389 return s;
390 }
391
392 static struct sock *unix_find_socket_byinode(struct inode *i)
393 {
394 unsigned int hash = unix_bsd_hash(i);
395 struct sock *s;
396
397 spin_lock(&bsd_socket_locks[hash]);
398 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
399 struct dentry *dentry = unix_sk(s)->path.dentry;
400
401 if (dentry && d_backing_inode(dentry) == i) {
402 sock_hold(s);
403 spin_unlock(&bsd_socket_locks[hash]);
404 return s;
405 }
406 }
407 spin_unlock(&bsd_socket_locks[hash]);
408 return NULL;
409 }
410
411 /* Support code for asymmetrically connected dgram sockets
412 *
413 * If a datagram socket is connected to a socket not itself connected
414 * to the first socket (eg, /dev/log), clients may only enqueue more
415 * messages if the present receive queue of the server socket is not
416 * "too large". This means there's a second writeability condition
417 * poll and sendmsg need to test. The dgram recv code will do a wake
418 * up on the peer_wait wait queue of a socket upon reception of a
419 * datagram which needs to be propagated to sleeping would-be writers
420 * since these might not have sent anything so far. This can't be
421 * accomplished via poll_wait because the lifetime of the server
422 * socket might be less than that of its clients if these break their
423 * association with it or if the server socket is closed while clients
424 * are still connected to it and there's no way to inform "a polling
425 * implementation" that it should let go of a certain wait queue
426 *
427 * In order to propagate a wake up, a wait_queue_entry_t of the client
428 * socket is enqueued on the peer_wait queue of the server socket
429 * whose wake function does a wake_up on the ordinary client socket
430 * wait queue. This connection is established whenever a write (or
431 * poll for write) hits the flow control condition and broken when the
432 * association to the server socket is dissolved or after a wake up
433 * was relayed.
434 */
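/* Illustrative userspace sketch (editorial addition, not part of the build)
 * of the asymmetric case described above: a client connect(2)ed to a
 * datagram server that never connects back (a /dev/log-style socket), whose
 * ability to send depends on the server's receive queue draining.
 */
#if 0
#include <poll.h>
#include <stddef.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t send_when_writable(int fd, const void *msg, size_t len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };

	/* POLLOUT here depends on the *peer's* receive queue; the wake-up
	 * when that queue drains is relayed to this socket's wait queue by
	 * the peer_wait machinery implemented below.
	 */
	if (poll(&pfd, 1, -1) < 0)
		return -1;

	return send(fd, msg, len, 0);
}
#endif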
435
436 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
437 void *key)
438 {
439 struct unix_sock *u;
440 wait_queue_head_t *u_sleep;
441
442 u = container_of(q, struct unix_sock, peer_wake);
443
444 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
445 q);
446 u->peer_wake.private = NULL;
447
448 /* relaying can only happen while the wq still exists */
449 u_sleep = sk_sleep(&u->sk);
450 if (u_sleep)
451 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452
453 return 0;
454 }
455
456 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
457 {
458 struct unix_sock *u, *u_other;
459 int rc;
460
461 u = unix_sk(sk);
462 u_other = unix_sk(other);
463 rc = 0;
464 spin_lock(&u_other->peer_wait.lock);
465
466 if (!u->peer_wake.private) {
467 u->peer_wake.private = other;
468 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469
470 rc = 1;
471 }
472
473 spin_unlock(&u_other->peer_wait.lock);
474 return rc;
475 }
476
477 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
478 struct sock *other)
479 {
480 struct unix_sock *u, *u_other;
481
482 u = unix_sk(sk);
483 u_other = unix_sk(other);
484 spin_lock(&u_other->peer_wait.lock);
485
486 if (u->peer_wake.private == other) {
487 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
488 u->peer_wake.private = NULL;
489 }
490
491 spin_unlock(&u_other->peer_wait.lock);
492 }
493
494 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
495 struct sock *other)
496 {
497 unix_dgram_peer_wake_disconnect(sk, other);
498 wake_up_interruptible_poll(sk_sleep(sk),
499 EPOLLOUT |
500 EPOLLWRNORM |
501 EPOLLWRBAND);
502 }
503
504 /* preconditions:
505 * - unix_peer(sk) == other
506 * - association is stable
507 */
508 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
509 {
510 int connected;
511
512 connected = unix_dgram_peer_wake_connect(sk, other);
513
514 /* If other is SOCK_DEAD, we want to make sure we signal
515 * POLLOUT, such that a subsequent write() can get a
516 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
517 * to other and it's full, we will hang waiting for POLLOUT.
518 */
519 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
520 return 1;
521
522 if (connected)
523 unix_dgram_peer_wake_disconnect(sk, other);
524
525 return 0;
526 }
527
528 static int unix_writable(const struct sock *sk, unsigned char state)
529 {
530 return state != TCP_LISTEN &&
531 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
532 }
533
534 static void unix_write_space(struct sock *sk)
535 {
536 struct socket_wq *wq;
537
538 rcu_read_lock();
539 if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
540 wq = rcu_dereference(sk->sk_wq);
541 if (skwq_has_sleeper(wq))
542 wake_up_interruptible_sync_poll(&wq->wait,
543 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
544 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 }
546 rcu_read_unlock();
547 }
548
549 /* When dgram socket disconnects (or changes its peer), we clear its receive
550 * queue of packets that arrived from the previous peer. First, this allows us
551 * to do flow control based only on wmem_alloc; second, an sk connected to a
552 * peer may receive messages only from that peer. */
553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
554 {
555 if (!skb_queue_empty(&sk->sk_receive_queue)) {
556 skb_queue_purge(&sk->sk_receive_queue);
557 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
558
559 /* If one link of bidirectional dgram pipe is disconnected,
560 * we signal an error. Messages are lost. Do not do this
561 * when the peer was not connected to us.
562 */
563 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
564 WRITE_ONCE(other->sk_err, ECONNRESET);
565 sk_error_report(other);
566 }
567 }
568 }
569
570 static void unix_sock_destructor(struct sock *sk)
571 {
572 struct unix_sock *u = unix_sk(sk);
573
574 skb_queue_purge(&sk->sk_receive_queue);
575
576 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
577 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
578 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
579 if (!sock_flag(sk, SOCK_DEAD)) {
580 pr_info("Attempt to release alive unix socket: %p\n", sk);
581 return;
582 }
583
584 if (u->addr)
585 unix_release_addr(u->addr);
586
587 atomic_long_dec(&unix_nr_socks);
588 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
589 #ifdef UNIX_REFCNT_DEBUG
590 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
591 atomic_long_read(&unix_nr_socks));
592 #endif
593 }
594
595 static void unix_release_sock(struct sock *sk, int embrion)
596 {
597 struct unix_sock *u = unix_sk(sk);
598 struct sock *skpair;
599 struct sk_buff *skb;
600 struct path path;
601 int state;
602
603 unix_remove_socket(sock_net(sk), sk);
604 unix_remove_bsd_socket(sk);
605
606 /* Clear state */
607 unix_state_lock(sk);
608 sock_orphan(sk);
609 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
610 path = u->path;
611 u->path.dentry = NULL;
612 u->path.mnt = NULL;
613 state = sk->sk_state;
614 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
615
616 skpair = unix_peer(sk);
617 unix_peer(sk) = NULL;
618
619 unix_state_unlock(sk);
620
621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
622 if (u->oob_skb) {
623 kfree_skb(u->oob_skb);
624 u->oob_skb = NULL;
625 }
626 #endif
627
628 wake_up_interruptible_all(&u->peer_wait);
629
630 if (skpair != NULL) {
631 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
632 unix_state_lock(skpair);
633 /* No more writes */
634 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
635 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
636 WRITE_ONCE(skpair->sk_err, ECONNRESET);
637 unix_state_unlock(skpair);
638 skpair->sk_state_change(skpair);
639 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
640 }
641
642 unix_dgram_peer_wake_disconnect(sk, skpair);
643 sock_put(skpair); /* It may now die */
644 }
645
646 /* Try to flush out this socket. Throw out buffers at least */
647
648 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
649 if (state == TCP_LISTEN)
650 unix_release_sock(skb->sk, 1);
651 /* passed fds are erased in the kfree_skb hook */
652 UNIXCB(skb).consumed = skb->len;
653 kfree_skb(skb);
654 }
655
656 if (path.dentry)
657 path_put(&path);
658
659 sock_put(sk);
660
661 /* ---- Socket is dead now and most probably destroyed ---- */
662
663 /*
664 * Fixme: BSD difference: In BSD all sockets connected to us get
665 * ECONNRESET and we die on the spot. In Linux we behave
666 * like files and pipes do and wait for the last
667 * dereference.
668 *
669 * Can't we simply set sock->err?
670 *
671 * What the above comment does talk about? --ANK(980817)
672 */
673
674 if (READ_ONCE(unix_tot_inflight))
675 unix_gc(); /* Garbage collect fds */
676 }
677
678 static void init_peercred(struct sock *sk)
679 {
680 const struct cred *old_cred;
681 struct pid *old_pid;
682
683 spin_lock(&sk->sk_peer_lock);
684 old_pid = sk->sk_peer_pid;
685 old_cred = sk->sk_peer_cred;
686 sk->sk_peer_pid = get_pid(task_tgid(current));
687 sk->sk_peer_cred = get_current_cred();
688 spin_unlock(&sk->sk_peer_lock);
689
690 put_pid(old_pid);
691 put_cred(old_cred);
692 }
693
694 static void copy_peercred(struct sock *sk, struct sock *peersk)
695 {
696 const struct cred *old_cred;
697 struct pid *old_pid;
698
699 if (sk < peersk) {
700 spin_lock(&sk->sk_peer_lock);
701 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 } else {
703 spin_lock(&peersk->sk_peer_lock);
704 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
705 }
706 old_pid = sk->sk_peer_pid;
707 old_cred = sk->sk_peer_cred;
708 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
709 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
710
711 spin_unlock(&sk->sk_peer_lock);
712 spin_unlock(&peersk->sk_peer_lock);
713
714 put_pid(old_pid);
715 put_cred(old_cred);
716 }
717
718 static int unix_listen(struct socket *sock, int backlog)
719 {
720 int err;
721 struct sock *sk = sock->sk;
722 struct unix_sock *u = unix_sk(sk);
723
724 err = -EOPNOTSUPP;
725 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
726 goto out; /* Only stream/seqpacket sockets accept */
727 err = -EINVAL;
728 if (!READ_ONCE(u->addr))
729 goto out; /* No listens on an unbound socket */
730 unix_state_lock(sk);
731 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
732 goto out_unlock;
733 if (backlog > sk->sk_max_ack_backlog)
734 wake_up_interruptible_all(&u->peer_wait);
735 sk->sk_max_ack_backlog = backlog;
736 WRITE_ONCE(sk->sk_state, TCP_LISTEN);
737
738 /* set credentials so connect can copy them */
739 init_peercred(sk);
740 err = 0;
741
742 out_unlock:
743 unix_state_unlock(sk);
744 out:
745 return err;
746 }
747
748 static int unix_release(struct socket *);
749 static int unix_bind(struct socket *, struct sockaddr *, int);
750 static int unix_stream_connect(struct socket *, struct sockaddr *,
751 int addr_len, int flags);
752 static int unix_socketpair(struct socket *, struct socket *);
753 static int unix_accept(struct socket *, struct socket *, int, bool);
754 static int unix_getname(struct socket *, struct sockaddr *, int);
755 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
756 static __poll_t unix_dgram_poll(struct file *, struct socket *,
757 poll_table *);
758 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
759 #ifdef CONFIG_COMPAT
760 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
761 #endif
762 static int unix_shutdown(struct socket *, int);
763 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
764 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
765 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
766 struct pipe_inode_info *, size_t size,
767 unsigned int flags);
768 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
772 static int unix_dgram_connect(struct socket *, struct sockaddr *,
773 int, int);
774 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
776 int);
777
778 static int unix_set_peek_off(struct sock *sk, int val)
779 {
780 struct unix_sock *u = unix_sk(sk);
781
782 if (mutex_lock_interruptible(&u->iolock))
783 return -EINTR;
784
785 WRITE_ONCE(sk->sk_peek_off, val);
786 mutex_unlock(&u->iolock);
787
788 return 0;
789 }
790
791 #ifdef CONFIG_PROC_FS
792 static int unix_count_nr_fds(struct sock *sk)
793 {
794 struct sk_buff *skb;
795 struct unix_sock *u;
796 int nr_fds = 0;
797
798 spin_lock(&sk->sk_receive_queue.lock);
799 skb = skb_peek(&sk->sk_receive_queue);
800 while (skb) {
801 u = unix_sk(skb->sk);
802 nr_fds += atomic_read(&u->scm_stat.nr_fds);
803 skb = skb_peek_next(skb, &sk->sk_receive_queue);
804 }
805 spin_unlock(&sk->sk_receive_queue.lock);
806
807 return nr_fds;
808 }
809
810 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
811 {
812 struct sock *sk = sock->sk;
813 unsigned char s_state;
814 struct unix_sock *u;
815 int nr_fds = 0;
816
817 if (sk) {
818 s_state = READ_ONCE(sk->sk_state);
819 u = unix_sk(sk);
820
821 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
822 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
823 * SOCK_DGRAM is ordinary. So, no lock is needed.
824 */
825 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
826 nr_fds = atomic_read(&u->scm_stat.nr_fds);
827 else if (s_state == TCP_LISTEN)
828 nr_fds = unix_count_nr_fds(sk);
829
830 seq_printf(m, "scm_fds: %u\n", nr_fds);
831 }
832 }
833 #else
834 #define unix_show_fdinfo NULL
835 #endif
836
837 static const struct proto_ops unix_stream_ops = {
838 .family = PF_UNIX,
839 .owner = THIS_MODULE,
840 .release = unix_release,
841 .bind = unix_bind,
842 .connect = unix_stream_connect,
843 .socketpair = unix_socketpair,
844 .accept = unix_accept,
845 .getname = unix_getname,
846 .poll = unix_poll,
847 .ioctl = unix_ioctl,
848 #ifdef CONFIG_COMPAT
849 .compat_ioctl = unix_compat_ioctl,
850 #endif
851 .listen = unix_listen,
852 .shutdown = unix_shutdown,
853 .sendmsg = unix_stream_sendmsg,
854 .recvmsg = unix_stream_recvmsg,
855 .read_skb = unix_stream_read_skb,
856 .mmap = sock_no_mmap,
857 .splice_read = unix_stream_splice_read,
858 .set_peek_off = unix_set_peek_off,
859 .show_fdinfo = unix_show_fdinfo,
860 };
861
862 static const struct proto_ops unix_dgram_ops = {
863 .family = PF_UNIX,
864 .owner = THIS_MODULE,
865 .release = unix_release,
866 .bind = unix_bind,
867 .connect = unix_dgram_connect,
868 .socketpair = unix_socketpair,
869 .accept = sock_no_accept,
870 .getname = unix_getname,
871 .poll = unix_dgram_poll,
872 .ioctl = unix_ioctl,
873 #ifdef CONFIG_COMPAT
874 .compat_ioctl = unix_compat_ioctl,
875 #endif
876 .listen = sock_no_listen,
877 .shutdown = unix_shutdown,
878 .sendmsg = unix_dgram_sendmsg,
879 .read_skb = unix_read_skb,
880 .recvmsg = unix_dgram_recvmsg,
881 .mmap = sock_no_mmap,
882 .set_peek_off = unix_set_peek_off,
883 .show_fdinfo = unix_show_fdinfo,
884 };
885
886 static const struct proto_ops unix_seqpacket_ops = {
887 .family = PF_UNIX,
888 .owner = THIS_MODULE,
889 .release = unix_release,
890 .bind = unix_bind,
891 .connect = unix_stream_connect,
892 .socketpair = unix_socketpair,
893 .accept = unix_accept,
894 .getname = unix_getname,
895 .poll = unix_dgram_poll,
896 .ioctl = unix_ioctl,
897 #ifdef CONFIG_COMPAT
898 .compat_ioctl = unix_compat_ioctl,
899 #endif
900 .listen = unix_listen,
901 .shutdown = unix_shutdown,
902 .sendmsg = unix_seqpacket_sendmsg,
903 .recvmsg = unix_seqpacket_recvmsg,
904 .mmap = sock_no_mmap,
905 .set_peek_off = unix_set_peek_off,
906 .show_fdinfo = unix_show_fdinfo,
907 };
908
909 static void unix_close(struct sock *sk, long timeout)
910 {
911 /* Nothing to do here, unix socket does not need a ->close().
912 * This is merely for sockmap.
913 */
914 }
915
916 static void unix_unhash(struct sock *sk)
917 {
918 /* Nothing to do here, unix socket does not need a ->unhash().
919 * This is merely for sockmap.
920 */
921 }
922
923 static bool unix_bpf_bypass_getsockopt(int level, int optname)
924 {
925 if (level == SOL_SOCKET) {
926 switch (optname) {
927 case SO_PEERPIDFD:
928 return true;
929 default:
930 return false;
931 }
932 }
933
934 return false;
935 }
936
937 struct proto unix_dgram_proto = {
938 .name = "UNIX",
939 .owner = THIS_MODULE,
940 .obj_size = sizeof(struct unix_sock),
941 .close = unix_close,
942 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
943 #ifdef CONFIG_BPF_SYSCALL
944 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
945 #endif
946 };
947
948 struct proto unix_stream_proto = {
949 .name = "UNIX-STREAM",
950 .owner = THIS_MODULE,
951 .obj_size = sizeof(struct unix_sock),
952 .close = unix_close,
953 .unhash = unix_unhash,
954 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
955 #ifdef CONFIG_BPF_SYSCALL
956 .psock_update_sk_prot = unix_stream_bpf_update_proto,
957 #endif
958 };
959
960 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
961 {
962 struct unix_sock *u;
963 struct sock *sk;
964 int err;
965
966 atomic_long_inc(&unix_nr_socks);
967 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
968 err = -ENFILE;
969 goto err;
970 }
971
972 if (type == SOCK_STREAM)
973 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
974 else /*dgram and seqpacket */
975 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
976
977 if (!sk) {
978 err = -ENOMEM;
979 goto err;
980 }
981
982 sock_init_data(sock, sk);
983
984 sk->sk_hash = unix_unbound_hash(sk);
985 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
986 sk->sk_write_space = unix_write_space;
987 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
988 sk->sk_destruct = unix_sock_destructor;
989 u = unix_sk(sk);
990 u->inflight = 0;
991 u->path.dentry = NULL;
992 u->path.mnt = NULL;
993 spin_lock_init(&u->lock);
994 INIT_LIST_HEAD(&u->link);
995 mutex_init(&u->iolock); /* single task reading lock */
996 mutex_init(&u->bindlock); /* single task binding lock */
997 init_waitqueue_head(&u->peer_wait);
998 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
999 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1000 unix_insert_unbound_socket(net, sk);
1001
1002 sock_prot_inuse_add(net, sk->sk_prot, 1);
1003
1004 return sk;
1005
1006 err:
1007 atomic_long_dec(&unix_nr_socks);
1008 return ERR_PTR(err);
1009 }
1010
1011 static int unix_create(struct net *net, struct socket *sock, int protocol,
1012 int kern)
1013 {
1014 struct sock *sk;
1015
1016 if (protocol && protocol != PF_UNIX)
1017 return -EPROTONOSUPPORT;
1018
1019 sock->state = SS_UNCONNECTED;
1020
1021 switch (sock->type) {
1022 case SOCK_STREAM:
1023 sock->ops = &unix_stream_ops;
1024 break;
1025 /*
1026 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1027 * nothing uses it.
1028 */
1029 case SOCK_RAW:
1030 sock->type = SOCK_DGRAM;
1031 fallthrough;
1032 case SOCK_DGRAM:
1033 sock->ops = &unix_dgram_ops;
1034 break;
1035 case SOCK_SEQPACKET:
1036 sock->ops = &unix_seqpacket_ops;
1037 break;
1038 default:
1039 return -ESOCKTNOSUPPORT;
1040 }
1041
1042 sk = unix_create1(net, sock, kern, sock->type);
1043 if (IS_ERR(sk))
1044 return PTR_ERR(sk);
1045
1046 return 0;
1047 }
1048
1049 static int unix_release(struct socket *sock)
1050 {
1051 struct sock *sk = sock->sk;
1052
1053 if (!sk)
1054 return 0;
1055
1056 sk->sk_prot->close(sk, 0);
1057 unix_release_sock(sk, 0);
1058 sock->sk = NULL;
1059
1060 return 0;
1061 }
1062
1063 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1064 int type)
1065 {
1066 struct inode *inode;
1067 struct path path;
1068 struct sock *sk;
1069 int err;
1070
1071 unix_mkname_bsd(sunaddr, addr_len);
1072 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1073 if (err)
1074 goto fail;
1075
1076 err = path_permission(&path, MAY_WRITE);
1077 if (err)
1078 goto path_put;
1079
1080 err = -ECONNREFUSED;
1081 inode = d_backing_inode(path.dentry);
1082 if (!S_ISSOCK(inode->i_mode))
1083 goto path_put;
1084
1085 sk = unix_find_socket_byinode(inode);
1086 if (!sk)
1087 goto path_put;
1088
1089 err = -EPROTOTYPE;
1090 if (sk->sk_type == type)
1091 touch_atime(&path);
1092 else
1093 goto sock_put;
1094
1095 path_put(&path);
1096
1097 return sk;
1098
1099 sock_put:
1100 sock_put(sk);
1101 path_put:
1102 path_put(&path);
1103 fail:
1104 return ERR_PTR(err);
1105 }
1106
1107 static struct sock *unix_find_abstract(struct net *net,
1108 struct sockaddr_un *sunaddr,
1109 int addr_len, int type)
1110 {
1111 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1112 struct dentry *dentry;
1113 struct sock *sk;
1114
1115 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1116 if (!sk)
1117 return ERR_PTR(-ECONNREFUSED);
1118
1119 dentry = unix_sk(sk)->path.dentry;
1120 if (dentry)
1121 touch_atime(&unix_sk(sk)->path);
1122
1123 return sk;
1124 }
1125
1126 static struct sock *unix_find_other(struct net *net,
1127 struct sockaddr_un *sunaddr,
1128 int addr_len, int type)
1129 {
1130 struct sock *sk;
1131
1132 if (sunaddr->sun_path[0])
1133 sk = unix_find_bsd(sunaddr, addr_len, type);
1134 else
1135 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1136
1137 return sk;
1138 }
1139
1140 static int unix_autobind(struct sock *sk)
1141 {
1142 struct unix_sock *u = unix_sk(sk);
1143 unsigned int new_hash, old_hash;
1144 struct net *net = sock_net(sk);
1145 struct unix_address *addr;
1146 u32 lastnum, ordernum;
1147 int err;
1148
1149 err = mutex_lock_interruptible(&u->bindlock);
1150 if (err)
1151 return err;
1152
1153 if (u->addr)
1154 goto out;
1155
1156 err = -ENOMEM;
1157 addr = kzalloc(sizeof(*addr) +
1158 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1159 if (!addr)
1160 goto out;
1161
1162 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1163 addr->name->sun_family = AF_UNIX;
1164 refcount_set(&addr->refcnt, 1);
1165
1166 old_hash = sk->sk_hash;
1167 ordernum = get_random_u32();
1168 lastnum = ordernum & 0xFFFFF;
1169 retry:
1170 ordernum = (ordernum + 1) & 0xFFFFF;
1171 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1172
1173 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1174 unix_table_double_lock(net, old_hash, new_hash);
1175
1176 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1177 unix_table_double_unlock(net, old_hash, new_hash);
1178
1179 /* __unix_find_socket_byname() may take a long time if many names
1180 * are already in use.
1181 */
1182 cond_resched();
1183
1184 if (ordernum == lastnum) {
1185 /* Give up if all names seem to be in use. */
1186 err = -ENOSPC;
1187 unix_release_addr(addr);
1188 goto out;
1189 }
1190
1191 goto retry;
1192 }
1193
1194 __unix_set_addr_hash(net, sk, addr, new_hash);
1195 unix_table_double_unlock(net, old_hash, new_hash);
1196 err = 0;
1197
1198 out: mutex_unlock(&u->bindlock);
1199 return err;
1200 }
1201
1202 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1203 int addr_len)
1204 {
1205 umode_t mode = S_IFSOCK |
1206 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1207 struct unix_sock *u = unix_sk(sk);
1208 unsigned int new_hash, old_hash;
1209 struct net *net = sock_net(sk);
1210 struct mnt_idmap *idmap;
1211 struct unix_address *addr;
1212 struct dentry *dentry;
1213 struct path parent;
1214 int err;
1215
1216 addr_len = unix_mkname_bsd(sunaddr, addr_len);
1217 addr = unix_create_addr(sunaddr, addr_len);
1218 if (!addr)
1219 return -ENOMEM;
1220
1221 /*
1222 * Get the parent directory, calculate the hash for last
1223 * component.
1224 */
1225 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1226 if (IS_ERR(dentry)) {
1227 err = PTR_ERR(dentry);
1228 goto out;
1229 }
1230
1231 /*
1232 * All right, let's create it.
1233 */
1234 idmap = mnt_idmap(parent.mnt);
1235 err = security_path_mknod(&parent, dentry, mode, 0);
1236 if (!err)
1237 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1238 if (err)
1239 goto out_path;
1240 err = mutex_lock_interruptible(&u->bindlock);
1241 if (err)
1242 goto out_unlink;
1243 if (u->addr)
1244 goto out_unlock;
1245
1246 old_hash = sk->sk_hash;
1247 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1248 unix_table_double_lock(net, old_hash, new_hash);
1249 u->path.mnt = mntget(parent.mnt);
1250 u->path.dentry = dget(dentry);
1251 __unix_set_addr_hash(net, sk, addr, new_hash);
1252 unix_table_double_unlock(net, old_hash, new_hash);
1253 unix_insert_bsd_socket(sk);
1254 mutex_unlock(&u->bindlock);
1255 done_path_create(&parent, dentry);
1256 return 0;
1257
1258 out_unlock:
1259 mutex_unlock(&u->bindlock);
1260 err = -EINVAL;
1261 out_unlink:
1262 /* failed after successful mknod? unlink what we'd created... */
1263 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1264 out_path:
1265 done_path_create(&parent, dentry);
1266 out:
1267 unix_release_addr(addr);
1268 return err == -EEXIST ? -EADDRINUSE : err;
1269 }
1270
1271 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1272 int addr_len)
1273 {
1274 struct unix_sock *u = unix_sk(sk);
1275 unsigned int new_hash, old_hash;
1276 struct net *net = sock_net(sk);
1277 struct unix_address *addr;
1278 int err;
1279
1280 addr = unix_create_addr(sunaddr, addr_len);
1281 if (!addr)
1282 return -ENOMEM;
1283
1284 err = mutex_lock_interruptible(&u->bindlock);
1285 if (err)
1286 goto out;
1287
1288 if (u->addr) {
1289 err = -EINVAL;
1290 goto out_mutex;
1291 }
1292
1293 old_hash = sk->sk_hash;
1294 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1295 unix_table_double_lock(net, old_hash, new_hash);
1296
1297 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1298 goto out_spin;
1299
1300 __unix_set_addr_hash(net, sk, addr, new_hash);
1301 unix_table_double_unlock(net, old_hash, new_hash);
1302 mutex_unlock(&u->bindlock);
1303 return 0;
1304
1305 out_spin:
1306 unix_table_double_unlock(net, old_hash, new_hash);
1307 err = -EADDRINUSE;
1308 out_mutex:
1309 mutex_unlock(&u->bindlock);
1310 out:
1311 unix_release_addr(addr);
1312 return err;
1313 }
1314
1315 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1316 {
1317 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1318 struct sock *sk = sock->sk;
1319 int err;
1320
1321 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1322 sunaddr->sun_family == AF_UNIX)
1323 return unix_autobind(sk);
1324
1325 err = unix_validate_addr(sunaddr, addr_len);
1326 if (err)
1327 return err;
1328
1329 if (sunaddr->sun_path[0])
1330 err = unix_bind_bsd(sk, sunaddr, addr_len);
1331 else
1332 err = unix_bind_abstract(sk, sunaddr, addr_len);
1333
1334 return err;
1335 }
1336
1337 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1338 {
1339 if (unlikely(sk1 == sk2) || !sk2) {
1340 unix_state_lock(sk1);
1341 return;
1342 }
1343 if (sk1 > sk2)
1344 swap(sk1, sk2);
1345
1346 unix_state_lock(sk1);
1347 unix_state_lock_nested(sk2, U_LOCK_SECOND);
1348 }
1349
1350 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1351 {
1352 if (unlikely(sk1 == sk2) || !sk2) {
1353 unix_state_unlock(sk1);
1354 return;
1355 }
1356 unix_state_unlock(sk1);
1357 unix_state_unlock(sk2);
1358 }
1359
1360 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1361 int alen, int flags)
1362 {
1363 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1364 struct sock *sk = sock->sk;
1365 struct sock *other;
1366 int err;
1367
1368 err = -EINVAL;
1369 if (alen < offsetofend(struct sockaddr, sa_family))
1370 goto out;
1371
1372 if (addr->sa_family != AF_UNSPEC) {
1373 err = unix_validate_addr(sunaddr, alen);
1374 if (err)
1375 goto out;
1376
1377 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1378 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1379 !READ_ONCE(unix_sk(sk)->addr)) {
1380 err = unix_autobind(sk);
1381 if (err)
1382 goto out;
1383 }
1384
1385 restart:
1386 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1387 if (IS_ERR(other)) {
1388 err = PTR_ERR(other);
1389 goto out;
1390 }
1391
1392 unix_state_double_lock(sk, other);
1393
1394 /* Apparently VFS overslept socket death. Retry. */
1395 if (sock_flag(other, SOCK_DEAD)) {
1396 unix_state_double_unlock(sk, other);
1397 sock_put(other);
1398 goto restart;
1399 }
1400
1401 err = -EPERM;
1402 if (!unix_may_send(sk, other))
1403 goto out_unlock;
1404
1405 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1406 if (err)
1407 goto out_unlock;
1408
1409 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1410 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1411 } else {
1412 /*
1413 * 1003.1g breaking connected state with AF_UNSPEC
1414 */
1415 other = NULL;
1416 unix_state_double_lock(sk, other);
1417 }
1418
1419 /*
1420 * If it was connected, reconnect.
1421 */
1422 if (unix_peer(sk)) {
1423 struct sock *old_peer = unix_peer(sk);
1424
1425 unix_peer(sk) = other;
1426 if (!other)
1427 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1428 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1429
1430 unix_state_double_unlock(sk, other);
1431
1432 if (other != old_peer) {
1433 unix_dgram_disconnected(sk, old_peer);
1434
1435 unix_state_lock(old_peer);
1436 if (!unix_peer(old_peer))
1437 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1438 unix_state_unlock(old_peer);
1439 }
1440
1441 sock_put(old_peer);
1442 } else {
1443 unix_peer(sk) = other;
1444 unix_state_double_unlock(sk, other);
1445 }
1446
1447 return 0;
1448
1449 out_unlock:
1450 unix_state_double_unlock(sk, other);
1451 sock_put(other);
1452 out:
1453 return err;
1454 }
1455
1456 static long unix_wait_for_peer(struct sock *other, long timeo)
1457 __releases(&unix_sk(other)->lock)
1458 {
1459 struct unix_sock *u = unix_sk(other);
1460 int sched;
1461 DEFINE_WAIT(wait);
1462
1463 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1464
1465 sched = !sock_flag(other, SOCK_DEAD) &&
1466 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1467 unix_recvq_full_lockless(other);
1468
1469 unix_state_unlock(other);
1470
1471 if (sched)
1472 timeo = schedule_timeout(timeo);
1473
1474 finish_wait(&u->peer_wait, &wait);
1475 return timeo;
1476 }
1477
1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1479 int addr_len, int flags)
1480 {
1481 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1482 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1483 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1484 struct net *net = sock_net(sk);
1485 struct sk_buff *skb = NULL;
1486 unsigned char state;
1487 long timeo;
1488 int err;
1489
1490 err = unix_validate_addr(sunaddr, addr_len);
1491 if (err)
1492 goto out;
1493
1494 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1495 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1496 !READ_ONCE(u->addr)) {
1497 err = unix_autobind(sk);
1498 if (err)
1499 goto out;
1500 }
1501
1502 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1503
1504 /* First of all allocate resources.
1505 If we do this after the state is locked,
1506 we will have to recheck everything again in any case.
1507 */
1508
1509 /* create new sock for complete connection */
1510 newsk = unix_create1(net, NULL, 0, sock->type);
1511 if (IS_ERR(newsk)) {
1512 err = PTR_ERR(newsk);
1513 newsk = NULL;
1514 goto out;
1515 }
1516
1517 err = -ENOMEM;
1518
1519 /* Allocate skb for sending to listening sock */
1520 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1521 if (skb == NULL)
1522 goto out;
1523
1524 restart:
1525 /* Find listening sock. */
1526 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1527 if (IS_ERR(other)) {
1528 err = PTR_ERR(other);
1529 other = NULL;
1530 goto out;
1531 }
1532
1533 unix_state_lock(other);
1534
1535 /* Apparently VFS overslept socket death. Retry. */
1536 if (sock_flag(other, SOCK_DEAD)) {
1537 unix_state_unlock(other);
1538 sock_put(other);
1539 goto restart;
1540 }
1541
1542 err = -ECONNREFUSED;
1543 if (other->sk_state != TCP_LISTEN)
1544 goto out_unlock;
1545 if (other->sk_shutdown & RCV_SHUTDOWN)
1546 goto out_unlock;
1547
1548 if (unix_recvq_full_lockless(other)) {
1549 err = -EAGAIN;
1550 if (!timeo)
1551 goto out_unlock;
1552
1553 timeo = unix_wait_for_peer(other, timeo);
1554
1555 err = sock_intr_errno(timeo);
1556 if (signal_pending(current))
1557 goto out;
1558 sock_put(other);
1559 goto restart;
1560 }
1561
1562 /* self connect and simultaneous connect are eliminated
1563 * by rejecting TCP_LISTEN socket to avoid deadlock.
1564 */
1565 state = READ_ONCE(sk->sk_state);
1566 if (unlikely(state != TCP_CLOSE)) {
1567 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1568 goto out_unlock;
1569 }
1570
1571 unix_state_lock_nested(sk, U_LOCK_SECOND);
1572
1573 if (unlikely(sk->sk_state != TCP_CLOSE)) {
1574 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1575 unix_state_unlock(sk);
1576 goto out_unlock;
1577 }
1578
1579 err = security_unix_stream_connect(sk, other, newsk);
1580 if (err) {
1581 unix_state_unlock(sk);
1582 goto out_unlock;
1583 }
1584
1585 /* The way is open! Fastly set all the necessary fields... */
1586
1587 sock_hold(sk);
1588 unix_peer(newsk) = sk;
1589 newsk->sk_state = TCP_ESTABLISHED;
1590 newsk->sk_type = sk->sk_type;
1591 init_peercred(newsk);
1592 newu = unix_sk(newsk);
1593 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1594 otheru = unix_sk(other);
1595
1596 /* copy address information from listening to new sock
1597 *
1598 * The contents of *(otheru->addr) and otheru->path
1599 * are seen fully set up here, since we have found
1600 * otheru in hash under its lock. Insertion into the
1601 * hash chain we'd found it in had been done in an
1602 * earlier critical area protected by the chain's lock,
1603 * the same one where we'd set *(otheru->addr) contents,
1604 * as well as otheru->path and otheru->addr itself.
1605 *
1606 * Using smp_store_release() here to set newu->addr
1607 * is enough to make those stores, as well as stores
1608 * to newu->path visible to anyone who gets newu->addr
1609 * by smp_load_acquire(). IOW, the same warranties
1610 * as for unix_sock instances bound in unix_bind() or
1611 * in unix_autobind().
1612 */
1613 if (otheru->path.dentry) {
1614 path_get(&otheru->path);
1615 newu->path = otheru->path;
1616 }
1617 refcount_inc(&otheru->addr->refcnt);
1618 smp_store_release(&newu->addr, otheru->addr);
1619
1620 /* Set credentials */
1621 copy_peercred(sk, other);
1622
1623 sock->state = SS_CONNECTED;
1624 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1625 sock_hold(newsk);
1626
1627 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1628 unix_peer(sk) = newsk;
1629
1630 unix_state_unlock(sk);
1631
1632 /* take ten and send info to listening sock */
1633 spin_lock(&other->sk_receive_queue.lock);
1634 __skb_queue_tail(&other->sk_receive_queue, skb);
1635 spin_unlock(&other->sk_receive_queue.lock);
1636 unix_state_unlock(other);
1637 other->sk_data_ready(other);
1638 sock_put(other);
1639 return 0;
1640
1641 out_unlock:
1642 if (other)
1643 unix_state_unlock(other);
1644
1645 out:
1646 kfree_skb(skb);
1647 if (newsk)
1648 unix_release_sock(newsk, 0);
1649 if (other)
1650 sock_put(other);
1651 return err;
1652 }
1653
1654 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1655 {
1656 struct sock *ska = socka->sk, *skb = sockb->sk;
1657
1658 /* Join our sockets back to back */
1659 sock_hold(ska);
1660 sock_hold(skb);
1661 unix_peer(ska) = skb;
1662 unix_peer(skb) = ska;
1663 init_peercred(ska);
1664 init_peercred(skb);
1665
1666 ska->sk_state = TCP_ESTABLISHED;
1667 skb->sk_state = TCP_ESTABLISHED;
1668 socka->state = SS_CONNECTED;
1669 sockb->state = SS_CONNECTED;
1670 return 0;
1671 }
1672
1673 static void unix_sock_inherit_flags(const struct socket *old,
1674 struct socket *new)
1675 {
1676 if (test_bit(SOCK_PASSCRED, &old->flags))
1677 set_bit(SOCK_PASSCRED, &new->flags);
1678 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1679 set_bit(SOCK_PASSPIDFD, &new->flags);
1680 if (test_bit(SOCK_PASSSEC, &old->flags))
1681 set_bit(SOCK_PASSSEC, &new->flags);
1682 }
1683
1684 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1685 bool kern)
1686 {
1687 struct sock *sk = sock->sk;
1688 struct sock *tsk;
1689 struct sk_buff *skb;
1690 int err;
1691
1692 err = -EOPNOTSUPP;
1693 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1694 goto out;
1695
1696 err = -EINVAL;
1697 if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1698 goto out;
1699
1700 /* If socket state is TCP_LISTEN it cannot change (for now...),
1701 * so that no locks are necessary.
1702 */
1703
1704 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1705 &err);
1706 if (!skb) {
1707 /* This means receive shutdown. */
1708 if (err == 0)
1709 err = -EINVAL;
1710 goto out;
1711 }
1712
1713 tsk = skb->sk;
1714 skb_free_datagram(sk, skb);
1715 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1716
1717 /* attach accepted sock to socket */
1718 unix_state_lock(tsk);
1719 newsock->state = SS_CONNECTED;
1720 unix_sock_inherit_flags(sock, newsock);
1721 sock_graft(tsk, newsock);
1722 unix_state_unlock(tsk);
1723 return 0;
1724
1725 out:
1726 return err;
1727 }
1728
1729
1730 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1731 {
1732 struct sock *sk = sock->sk;
1733 struct unix_address *addr;
1734 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1735 int err = 0;
1736
1737 if (peer) {
1738 sk = unix_peer_get(sk);
1739
1740 err = -ENOTCONN;
1741 if (!sk)
1742 goto out;
1743 err = 0;
1744 } else {
1745 sock_hold(sk);
1746 }
1747
1748 addr = smp_load_acquire(&unix_sk(sk)->addr);
1749 if (!addr) {
1750 sunaddr->sun_family = AF_UNIX;
1751 sunaddr->sun_path[0] = 0;
1752 err = offsetof(struct sockaddr_un, sun_path);
1753 } else {
1754 err = addr->len;
1755 memcpy(sunaddr, addr->name, addr->len);
1756 }
1757 sock_put(sk);
1758 out:
1759 return err;
1760 }
1761
1762 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1763 {
1764 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1765
1766 /*
1767 * Garbage collection of unix sockets starts by selecting a set of
1768 * candidate sockets which have reference only from being in flight
1769 * (total_refs == inflight_refs). This condition is checked once during
1770 * the candidate collection phase, and candidates are marked as such, so
1771 * that non-candidates can later be ignored. While inflight_refs is
1772 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1773 * is an instantaneous decision.
1774 *
1775 * Once a candidate, however, the socket must not be reinstalled into a
1776 * file descriptor while the garbage collection is in progress.
1777 *
1778 * If the above conditions are met, then the directed graph of
1779 * candidates (*) does not change while unix_gc_lock is held.
1780 *
1781 * Any operations that changes the file count through file descriptors
1782 * (dup, close, sendmsg) does not change the graph since candidates are
1783 * not installed in fds.
1784 *
1785 * Dequeueing a candidate via recvmsg would install it into an fd, but
1786 * that takes unix_gc_lock to decrement the inflight count, so it's
1787 * serialized with garbage collection.
1788 *
1789 * MSG_PEEK is special in that it does not change the inflight count,
1790 * yet does install the socket into an fd. The following lock/unlock
1791 * pair is to ensure serialization with garbage collection. It must be
1792 * done between incrementing the file count and installing the file into
1793 * an fd.
1794 *
1795 * If garbage collection starts after the barrier provided by the
1796 * lock/unlock, then it will see the elevated refcount and not mark this
1797 * as a candidate. If a garbage collection is already in progress
1798 * before the file count was incremented, then the lock/unlock pair will
1799 * ensure that garbage collection is finished before progressing to
1800 * installing the fd.
1801 *
1802 * (*) A -> B where B is on the queue of A or B is on the queue of C
1803 * which is on the queue of listening socket A.
1804 */
1805 spin_lock(&unix_gc_lock);
1806 spin_unlock(&unix_gc_lock);
1807 }
1808
1809 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1810 {
1811 int err = 0;
1812
1813 UNIXCB(skb).pid = get_pid(scm->pid);
1814 UNIXCB(skb).uid = scm->creds.uid;
1815 UNIXCB(skb).gid = scm->creds.gid;
1816 UNIXCB(skb).fp = NULL;
1817 unix_get_secdata(scm, skb);
1818 if (scm->fp && send_fds)
1819 err = unix_attach_fds(scm, skb);
1820
1821 skb->destructor = unix_destruct_scm;
1822 return err;
1823 }
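
/*
 * Illustrative note, not part of the kernel build: the fd list attached by
 * unix_attach_fds() above originates from a userspace SCM_RIGHTS control
 * message.  A minimal, hedged sketch of the sending side (the helper name
 * pass_fd() is made up, error handling omitted):
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	int pass_fd(int sock, int fd_to_send)
 *	{
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		union {
 *			char buf[CMSG_SPACE(sizeof(int))];
 *			struct cmsghdr align;
 *		} u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));
 *		return sendmsg(sock, &msg, 0);
 *	}
 */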
1824
1825 static bool unix_passcred_enabled(const struct socket *sock,
1826 const struct sock *other)
1827 {
1828 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1829 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1830 !other->sk_socket ||
1831 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1832 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1833 }
1834
1835 /*
1836 * Some apps rely on write() giving SCM_CREDENTIALS.
1837 * We include credentials if the source or destination socket
1838 * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1839 */
1840 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1841 const struct sock *other)
1842 {
1843 if (UNIXCB(skb).pid)
1844 return;
1845 if (unix_passcred_enabled(sock, other)) {
1846 UNIXCB(skb).pid = get_pid(task_tgid(current));
1847 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1848 }
1849 }
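
/*
 * Userspace view, for illustration only: a receiver that wants the
 * credentials added by maybe_add_creds() typically sets SO_PASSCRED and
 * walks the control messages returned by recvmsg().  A hedged sketch
 * (declarations and error handling omitted):
 *
 *	int one = 1;
 *	struct cmsghdr *cmsg;
 *	struct ucred uc;
 *
 *	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	recvmsg(sock, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *			memcpy(&uc, CMSG_DATA(cmsg), sizeof(uc));
 *			break;
 *		}
 *	}
 *
 * After the loop uc.pid, uc.uid and uc.gid describe the sender
 * (struct ucred needs _GNU_SOURCE and <sys/socket.h>).
 */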
1850
1851 static bool unix_skb_scm_eq(struct sk_buff *skb,
1852 struct scm_cookie *scm)
1853 {
1854 return UNIXCB(skb).pid == scm->pid &&
1855 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1856 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1857 unix_secdata_eq(scm, skb);
1858 }
1859
1860 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1861 {
1862 struct scm_fp_list *fp = UNIXCB(skb).fp;
1863 struct unix_sock *u = unix_sk(sk);
1864
1865 if (unlikely(fp && fp->count))
1866 atomic_add(fp->count, &u->scm_stat.nr_fds);
1867 }
1868
1869 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1870 {
1871 struct scm_fp_list *fp = UNIXCB(skb).fp;
1872 struct unix_sock *u = unix_sk(sk);
1873
1874 if (unlikely(fp && fp->count))
1875 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1876 }
1877
1878 /*
1879 * Send AF_UNIX data.
1880 */
1881
1882 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1883 size_t len)
1884 {
1885 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1886 struct sock *sk = sock->sk, *other = NULL;
1887 struct unix_sock *u = unix_sk(sk);
1888 struct scm_cookie scm;
1889 struct sk_buff *skb;
1890 int data_len = 0;
1891 int sk_locked;
1892 long timeo;
1893 int err;
1894
1895 wait_for_unix_gc();
1896 err = scm_send(sock, msg, &scm, false);
1897 if (err < 0)
1898 return err;
1899
1900 err = -EOPNOTSUPP;
1901 if (msg->msg_flags&MSG_OOB)
1902 goto out;
1903
1904 if (msg->msg_namelen) {
1905 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1906 if (err)
1907 goto out;
1908 } else {
1909 sunaddr = NULL;
1910 err = -ENOTCONN;
1911 other = unix_peer_get(sk);
1912 if (!other)
1913 goto out;
1914 }
1915
1916 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1917 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1918 !READ_ONCE(u->addr)) {
1919 err = unix_autobind(sk);
1920 if (err)
1921 goto out;
1922 }
1923
1924 err = -EMSGSIZE;
1925 if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1926 goto out;
1927
1928 if (len > SKB_MAX_ALLOC) {
1929 data_len = min_t(size_t,
1930 len - SKB_MAX_ALLOC,
1931 MAX_SKB_FRAGS * PAGE_SIZE);
1932 data_len = PAGE_ALIGN(data_len);
1933
1934 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1935 }
1936
1937 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1938 msg->msg_flags & MSG_DONTWAIT, &err,
1939 PAGE_ALLOC_COSTLY_ORDER);
1940 if (skb == NULL)
1941 goto out;
1942
1943 err = unix_scm_to_skb(&scm, skb, true);
1944 if (err < 0)
1945 goto out_free;
1946
1947 skb_put(skb, len - data_len);
1948 skb->data_len = data_len;
1949 skb->len = len;
1950 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1951 if (err)
1952 goto out_free;
1953
1954 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1955
1956 restart:
1957 if (!other) {
1958 err = -ECONNRESET;
1959 if (sunaddr == NULL)
1960 goto out_free;
1961
1962 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1963 sk->sk_type);
1964 if (IS_ERR(other)) {
1965 err = PTR_ERR(other);
1966 other = NULL;
1967 goto out_free;
1968 }
1969 }
1970
1971 if (sk_filter(other, skb) < 0) {
1972 /* Toss the packet but do not return any error to the sender */
1973 err = len;
1974 goto out_free;
1975 }
1976
1977 sk_locked = 0;
1978 unix_state_lock(other);
1979 restart_locked:
1980 err = -EPERM;
1981 if (!unix_may_send(sk, other))
1982 goto out_unlock;
1983
1984 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1985 /*
1986			 * Check with 1003.1g - what should the
1987			 * datagram error behaviour be here?
1988 */
1989 unix_state_unlock(other);
1990 sock_put(other);
1991
1992 if (!sk_locked)
1993 unix_state_lock(sk);
1994
1995 err = 0;
1996 if (sk->sk_type == SOCK_SEQPACKET) {
1997			/* We are here only when racing with unix_release_sock(),
1998			 * which is clearing @other. Never change state to TCP_CLOSE
1999			 * here, unlike what SOCK_DGRAM wants.
2000 */
2001 unix_state_unlock(sk);
2002 err = -EPIPE;
2003 } else if (unix_peer(sk) == other) {
2004 unix_peer(sk) = NULL;
2005 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2006
2007 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2008 unix_state_unlock(sk);
2009
2010 unix_dgram_disconnected(sk, other);
2011 sock_put(other);
2012 err = -ECONNREFUSED;
2013 } else {
2014 unix_state_unlock(sk);
2015 }
2016
2017 other = NULL;
2018 if (err)
2019 goto out_free;
2020 goto restart;
2021 }
2022
2023 err = -EPIPE;
2024 if (other->sk_shutdown & RCV_SHUTDOWN)
2025 goto out_unlock;
2026
2027 if (sk->sk_type != SOCK_SEQPACKET) {
2028 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2029 if (err)
2030 goto out_unlock;
2031 }
2032
2033 /* other == sk && unix_peer(other) != sk if
2034 * - unix_peer(sk) == NULL, destination address bound to sk
2035 * - unix_peer(sk) == sk by time of get but disconnected before lock
2036 */
2037 if (other != sk &&
2038 unlikely(unix_peer(other) != sk &&
2039 unix_recvq_full_lockless(other))) {
2040 if (timeo) {
2041 timeo = unix_wait_for_peer(other, timeo);
2042
2043 err = sock_intr_errno(timeo);
2044 if (signal_pending(current))
2045 goto out_free;
2046
2047 goto restart;
2048 }
2049
2050 if (!sk_locked) {
2051 unix_state_unlock(other);
2052 unix_state_double_lock(sk, other);
2053 }
2054
2055 if (unix_peer(sk) != other ||
2056 unix_dgram_peer_wake_me(sk, other)) {
2057 err = -EAGAIN;
2058 sk_locked = 1;
2059 goto out_unlock;
2060 }
2061
2062 if (!sk_locked) {
2063 sk_locked = 1;
2064 goto restart_locked;
2065 }
2066 }
2067
2068 if (unlikely(sk_locked))
2069 unix_state_unlock(sk);
2070
2071 if (sock_flag(other, SOCK_RCVTSTAMP))
2072 __net_timestamp(skb);
2073 maybe_add_creds(skb, sock, other);
2074 scm_stat_add(other, skb);
2075 skb_queue_tail(&other->sk_receive_queue, skb);
2076 unix_state_unlock(other);
2077 other->sk_data_ready(other);
2078 sock_put(other);
2079 scm_destroy(&scm);
2080 return len;
2081
2082 out_unlock:
2083 if (sk_locked)
2084 unix_state_unlock(sk);
2085 unix_state_unlock(other);
2086 out_free:
2087 kfree_skb(skb);
2088 out:
2089 if (other)
2090 sock_put(other);
2091 scm_destroy(&scm);
2092 return err;
2093 }
2094
2095 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2096 * bytes, with a minimum of a full page.
2097 */
2098 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
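
/*
 * Worked example (informational): with 4 KiB pages get_order(32768) is 3,
 * so UNIX_SKB_FRAGS_SZ is 4096 << 3 == 32768 bytes; on a 64 KiB page
 * architecture get_order(32768) is 0 and the value rounds up to a single
 * page (65536 bytes), which is the "minimum of a full page" mentioned above.
 */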
2099
2100 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2101 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2102 struct scm_cookie *scm, bool fds_sent)
2103 {
2104 struct unix_sock *ousk = unix_sk(other);
2105 struct sk_buff *skb;
2106 int err = 0;
2107
2108 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2109
2110 if (!skb)
2111 return err;
2112
2113 err = unix_scm_to_skb(scm, skb, !fds_sent);
2114 if (err < 0) {
2115 kfree_skb(skb);
2116 return err;
2117 }
2118 skb_put(skb, 1);
2119 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2120
2121 if (err) {
2122 kfree_skb(skb);
2123 return err;
2124 }
2125
2126 unix_state_lock(other);
2127
2128 if (sock_flag(other, SOCK_DEAD) ||
2129 (other->sk_shutdown & RCV_SHUTDOWN)) {
2130 unix_state_unlock(other);
2131 kfree_skb(skb);
2132 return -EPIPE;
2133 }
2134
2135 maybe_add_creds(skb, sock, other);
2136 skb_get(skb);
2137
2138 scm_stat_add(other, skb);
2139
2140 spin_lock(&other->sk_receive_queue.lock);
2141 if (ousk->oob_skb)
2142 consume_skb(ousk->oob_skb);
2143 WRITE_ONCE(ousk->oob_skb, skb);
2144 __skb_queue_tail(&other->sk_receive_queue, skb);
2145 spin_unlock(&other->sk_receive_queue.lock);
2146
2147 sk_send_sigurg(other);
2148 unix_state_unlock(other);
2149 other->sk_data_ready(other);
2150
2151 return err;
2152 }
2153 #endif
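
/*
 * Userspace illustration (assumes CONFIG_AF_UNIX_OOB and a connected
 * SOCK_STREAM pair a/b); a hedged sketch, not normative:
 *
 *	send(a, "!", 1, MSG_OOB);
 *	...
 *	char c;
 *	recv(b, &c, 1, MSG_OOB);
 *
 * The first recv(MSG_OOB) returns the single out-of-band byte queued by
 * queue_oob(); repeating it fails with EINVAL once oob_skb has been
 * consumed, and the same error is returned when SO_OOBINLINE is set
 * (see unix_stream_recv_urg() below).
 */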
2154
2155 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2156 size_t len)
2157 {
2158 struct sock *sk = sock->sk;
2159 struct sock *other = NULL;
2160 int err, size;
2161 struct sk_buff *skb;
2162 int sent = 0;
2163 struct scm_cookie scm;
2164 bool fds_sent = false;
2165 int data_len;
2166
2167 wait_for_unix_gc();
2168 err = scm_send(sock, msg, &scm, false);
2169 if (err < 0)
2170 return err;
2171
2172 err = -EOPNOTSUPP;
2173 if (msg->msg_flags & MSG_OOB) {
2174 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2175 if (len)
2176 len--;
2177 else
2178 #endif
2179 goto out_err;
2180 }
2181
2182 if (msg->msg_namelen) {
2183 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2184 goto out_err;
2185 } else {
2186 err = -ENOTCONN;
2187 other = unix_peer(sk);
2188 if (!other)
2189 goto out_err;
2190 }
2191
2192 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2193 goto pipe_err;
2194
2195 while (sent < len) {
2196 size = len - sent;
2197
2198 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2199 skb = sock_alloc_send_pskb(sk, 0, 0,
2200 msg->msg_flags & MSG_DONTWAIT,
2201 &err, 0);
2202 } else {
2203 /* Keep two messages in the pipe so it schedules better */
2204 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2205
2206 /* allow fallback to order-0 allocations */
2207 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2208
2209 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2210
2211 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2212
2213 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2214 msg->msg_flags & MSG_DONTWAIT, &err,
2215 get_order(UNIX_SKB_FRAGS_SZ));
2216 }
2217 if (!skb)
2218 goto out_err;
2219
2220 /* Only send the fds in the first buffer */
2221 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2222 if (err < 0) {
2223 kfree_skb(skb);
2224 goto out_err;
2225 }
2226 fds_sent = true;
2227
2228 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2229 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2230 sk->sk_allocation);
2231 if (err < 0) {
2232 kfree_skb(skb);
2233 goto out_err;
2234 }
2235 size = err;
2236 refcount_add(size, &sk->sk_wmem_alloc);
2237 } else {
2238 skb_put(skb, size - data_len);
2239 skb->data_len = data_len;
2240 skb->len = size;
2241 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2242 if (err) {
2243 kfree_skb(skb);
2244 goto out_err;
2245 }
2246 }
2247
2248 unix_state_lock(other);
2249
2250 if (sock_flag(other, SOCK_DEAD) ||
2251 (other->sk_shutdown & RCV_SHUTDOWN))
2252 goto pipe_err_free;
2253
2254 maybe_add_creds(skb, sock, other);
2255 scm_stat_add(other, skb);
2256 skb_queue_tail(&other->sk_receive_queue, skb);
2257 unix_state_unlock(other);
2258 other->sk_data_ready(other);
2259 sent += size;
2260 }
2261
2262 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2263 if (msg->msg_flags & MSG_OOB) {
2264 err = queue_oob(sock, msg, other, &scm, fds_sent);
2265 if (err)
2266 goto out_err;
2267 sent++;
2268 }
2269 #endif
2270
2271 scm_destroy(&scm);
2272
2273 return sent;
2274
2275 pipe_err_free:
2276 unix_state_unlock(other);
2277 kfree_skb(skb);
2278 pipe_err:
2279 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2280 send_sig(SIGPIPE, current, 0);
2281 err = -EPIPE;
2282 out_err:
2283 scm_destroy(&scm);
2284 return sent ? : err;
2285 }
2286
2287 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2288 size_t len)
2289 {
2290 int err;
2291 struct sock *sk = sock->sk;
2292
2293 err = sock_error(sk);
2294 if (err)
2295 return err;
2296
2297 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2298 return -ENOTCONN;
2299
2300 if (msg->msg_namelen)
2301 msg->msg_namelen = 0;
2302
2303 return unix_dgram_sendmsg(sock, msg, len);
2304 }
2305
2306 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2307 size_t size, int flags)
2308 {
2309 struct sock *sk = sock->sk;
2310
2311 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2312 return -ENOTCONN;
2313
2314 return unix_dgram_recvmsg(sock, msg, size, flags);
2315 }
2316
2317 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2318 {
2319 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2320
2321 if (addr) {
2322 msg->msg_namelen = addr->len;
2323 memcpy(msg->msg_name, addr->name, addr->len);
2324 }
2325 }
2326
2327 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2328 int flags)
2329 {
2330 struct scm_cookie scm;
2331 struct socket *sock = sk->sk_socket;
2332 struct unix_sock *u = unix_sk(sk);
2333 struct sk_buff *skb, *last;
2334 long timeo;
2335 int skip;
2336 int err;
2337
2338 err = -EOPNOTSUPP;
2339 if (flags&MSG_OOB)
2340 goto out;
2341
2342 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2343
2344 do {
2345 mutex_lock(&u->iolock);
2346
2347 skip = sk_peek_offset(sk, flags);
2348 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2349 &skip, &err, &last);
2350 if (skb) {
2351 if (!(flags & MSG_PEEK))
2352 scm_stat_del(sk, skb);
2353 break;
2354 }
2355
2356 mutex_unlock(&u->iolock);
2357
2358 if (err != -EAGAIN)
2359 break;
2360 } while (timeo &&
2361 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2362 &err, &timeo, last));
2363
2364 if (!skb) { /* implies iolock unlocked */
2365 unix_state_lock(sk);
2366 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2367 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2368 (sk->sk_shutdown & RCV_SHUTDOWN))
2369 err = 0;
2370 unix_state_unlock(sk);
2371 goto out;
2372 }
2373
2374 if (wq_has_sleeper(&u->peer_wait))
2375 wake_up_interruptible_sync_poll(&u->peer_wait,
2376 EPOLLOUT | EPOLLWRNORM |
2377 EPOLLWRBAND);
2378
2379 if (msg->msg_name)
2380 unix_copy_addr(msg, skb->sk);
2381
2382 if (size > skb->len - skip)
2383 size = skb->len - skip;
2384 else if (size < skb->len - skip)
2385 msg->msg_flags |= MSG_TRUNC;
2386
2387 err = skb_copy_datagram_msg(skb, skip, msg, size);
2388 if (err)
2389 goto out_free;
2390
2391 if (sock_flag(sk, SOCK_RCVTSTAMP))
2392 __sock_recv_timestamp(msg, sk, skb);
2393
2394 memset(&scm, 0, sizeof(scm));
2395
2396 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2397 unix_set_secdata(&scm, skb);
2398
2399 if (!(flags & MSG_PEEK)) {
2400 if (UNIXCB(skb).fp)
2401 unix_detach_fds(&scm, skb);
2402
2403 sk_peek_offset_bwd(sk, skb->len);
2404 } else {
2405		/* It is questionable: on PEEK we could:
2406		   - not return fds - good, but too simple 8)
2407		   - return fds, and not return them on read (old strategy,
2408		     apparently wrong)
2409		   - clone fds (I chose this for now, it is the most universal
2410		     solution)
2411
2412 POSIX 1003.1g does not actually define this clearly
2413 at all. POSIX 1003.1g doesn't define a lot of things
2414 clearly however!
2415
2416 */
2417
2418 sk_peek_offset_fwd(sk, size);
2419
2420 if (UNIXCB(skb).fp)
2421 unix_peek_fds(&scm, skb);
2422 }
2423 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2424
2425 scm_recv_unix(sock, msg, &scm, flags);
2426
2427 out_free:
2428 skb_free_datagram(sk, skb);
2429 mutex_unlock(&u->iolock);
2430 out:
2431 return err;
2432 }
2433
2434 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2435 int flags)
2436 {
2437 struct sock *sk = sock->sk;
2438
2439 #ifdef CONFIG_BPF_SYSCALL
2440 const struct proto *prot = READ_ONCE(sk->sk_prot);
2441
2442 if (prot != &unix_dgram_proto)
2443 return prot->recvmsg(sk, msg, size, flags, NULL);
2444 #endif
2445 return __unix_dgram_recvmsg(sk, msg, size, flags);
2446 }
2447
2448 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2449 {
2450 struct unix_sock *u = unix_sk(sk);
2451 struct sk_buff *skb;
2452 int err;
2453
2454 mutex_lock(&u->iolock);
2455 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2456 mutex_unlock(&u->iolock);
2457 if (!skb)
2458 return err;
2459
2460 return recv_actor(sk, skb);
2461 }
2462
2463 /*
2464 *	Sleep until more data has arrived. But check for races.
2465 */
2466 static long unix_stream_data_wait(struct sock *sk, long timeo,
2467 struct sk_buff *last, unsigned int last_len,
2468 bool freezable)
2469 {
2470 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2471 struct sk_buff *tail;
2472 DEFINE_WAIT(wait);
2473
2474 unix_state_lock(sk);
2475
2476 for (;;) {
2477 prepare_to_wait(sk_sleep(sk), &wait, state);
2478
2479 tail = skb_peek_tail(&sk->sk_receive_queue);
2480 if (tail != last ||
2481 (tail && tail->len != last_len) ||
2482 sk->sk_err ||
2483 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2484 signal_pending(current) ||
2485 !timeo)
2486 break;
2487
2488 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2489 unix_state_unlock(sk);
2490 timeo = schedule_timeout(timeo);
2491 unix_state_lock(sk);
2492
2493 if (sock_flag(sk, SOCK_DEAD))
2494 break;
2495
2496 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2497 }
2498
2499 finish_wait(sk_sleep(sk), &wait);
2500 unix_state_unlock(sk);
2501 return timeo;
2502 }
2503
2504 static unsigned int unix_skb_len(const struct sk_buff *skb)
2505 {
2506 return skb->len - UNIXCB(skb).consumed;
2507 }
2508
2509 struct unix_stream_read_state {
2510 int (*recv_actor)(struct sk_buff *, int, int,
2511 struct unix_stream_read_state *);
2512 struct socket *socket;
2513 struct msghdr *msg;
2514 struct pipe_inode_info *pipe;
2515 size_t size;
2516 int flags;
2517 unsigned int splice_flags;
2518 };
2519
2520 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2521 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2522 {
2523 struct socket *sock = state->socket;
2524 struct sock *sk = sock->sk;
2525 struct unix_sock *u = unix_sk(sk);
2526 int chunk = 1;
2527 struct sk_buff *oob_skb;
2528
2529 mutex_lock(&u->iolock);
2530 unix_state_lock(sk);
2531 spin_lock(&sk->sk_receive_queue.lock);
2532
2533 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2534 spin_unlock(&sk->sk_receive_queue.lock);
2535 unix_state_unlock(sk);
2536 mutex_unlock(&u->iolock);
2537 return -EINVAL;
2538 }
2539
2540 oob_skb = u->oob_skb;
2541
2542 if (!(state->flags & MSG_PEEK))
2543 WRITE_ONCE(u->oob_skb, NULL);
2544 else
2545 skb_get(oob_skb);
2546
2547 spin_unlock(&sk->sk_receive_queue.lock);
2548 unix_state_unlock(sk);
2549
2550 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2551
2552 if (!(state->flags & MSG_PEEK))
2553 UNIXCB(oob_skb).consumed += 1;
2554
2555 consume_skb(oob_skb);
2556
2557 mutex_unlock(&u->iolock);
2558
2559 if (chunk < 0)
2560 return -EFAULT;
2561
2562 state->msg->msg_flags |= MSG_OOB;
2563 return 1;
2564 }
2565
2566 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2567 int flags, int copied)
2568 {
2569 struct unix_sock *u = unix_sk(sk);
2570
2571 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2572 skb_unlink(skb, &sk->sk_receive_queue);
2573 consume_skb(skb);
2574 skb = NULL;
2575 } else {
2576 struct sk_buff *unlinked_skb = NULL;
2577
2578 spin_lock(&sk->sk_receive_queue.lock);
2579
2580 if (skb == u->oob_skb) {
2581 if (copied) {
2582 skb = NULL;
2583 } else if (!(flags & MSG_PEEK)) {
2584 if (sock_flag(sk, SOCK_URGINLINE)) {
2585 WRITE_ONCE(u->oob_skb, NULL);
2586 consume_skb(skb);
2587 } else {
2588 __skb_unlink(skb, &sk->sk_receive_queue);
2589 WRITE_ONCE(u->oob_skb, NULL);
2590 unlinked_skb = skb;
2591 skb = skb_peek(&sk->sk_receive_queue);
2592 }
2593 } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2594 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2595 }
2596 }
2597
2598 spin_unlock(&sk->sk_receive_queue.lock);
2599
2600 if (unlinked_skb) {
2601 WARN_ON_ONCE(skb_unref(unlinked_skb));
2602 kfree_skb(unlinked_skb);
2603 }
2604 }
2605 return skb;
2606 }
2607 #endif
2608
2609 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2610 {
2611 struct unix_sock *u = unix_sk(sk);
2612 struct sk_buff *skb;
2613 int err;
2614
2615 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2616 return -ENOTCONN;
2617
2618 mutex_lock(&u->iolock);
2619 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2620 mutex_unlock(&u->iolock);
2621 if (!skb)
2622 return err;
2623
2624 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2625 if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2626 bool drop = false;
2627
2628 unix_state_lock(sk);
2629
2630 if (sock_flag(sk, SOCK_DEAD)) {
2631 unix_state_unlock(sk);
2632 kfree_skb(skb);
2633 return -ECONNRESET;
2634 }
2635
2636 spin_lock(&sk->sk_receive_queue.lock);
2637 if (likely(skb == u->oob_skb)) {
2638 WRITE_ONCE(u->oob_skb, NULL);
2639 drop = true;
2640 }
2641 spin_unlock(&sk->sk_receive_queue.lock);
2642
2643 unix_state_unlock(sk);
2644
2645 if (drop) {
2646 WARN_ON_ONCE(skb_unref(skb));
2647 kfree_skb(skb);
2648 return -EAGAIN;
2649 }
2650 }
2651 #endif
2652
2653 return recv_actor(sk, skb);
2654 }
2655
2656 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2657 bool freezable)
2658 {
2659 struct scm_cookie scm;
2660 struct socket *sock = state->socket;
2661 struct sock *sk = sock->sk;
2662 struct unix_sock *u = unix_sk(sk);
2663 int copied = 0;
2664 int flags = state->flags;
2665 int noblock = flags & MSG_DONTWAIT;
2666 bool check_creds = false;
2667 int target;
2668 int err = 0;
2669 long timeo;
2670 int skip;
2671 size_t size = state->size;
2672 unsigned int last_len;
2673
2674 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2675 err = -EINVAL;
2676 goto out;
2677 }
2678
2679 if (unlikely(flags & MSG_OOB)) {
2680 err = -EOPNOTSUPP;
2681 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2682 err = unix_stream_recv_urg(state);
2683 #endif
2684 goto out;
2685 }
2686
2687 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2688 timeo = sock_rcvtimeo(sk, noblock);
2689
2690 memset(&scm, 0, sizeof(scm));
2691
2692 /* Lock the socket to prevent queue disordering
2693	 * while we sleep copying data into the message
2694 */
2695 mutex_lock(&u->iolock);
2696
2697 skip = max(sk_peek_offset(sk, flags), 0);
2698
2699 do {
2700 int chunk;
2701 bool drop_skb;
2702 struct sk_buff *skb, *last;
2703
2704 redo:
2705 unix_state_lock(sk);
2706 if (sock_flag(sk, SOCK_DEAD)) {
2707 err = -ECONNRESET;
2708 goto unlock;
2709 }
2710 last = skb = skb_peek(&sk->sk_receive_queue);
2711 last_len = last ? last->len : 0;
2712
2713 again:
2714 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2715 if (skb) {
2716 skb = manage_oob(skb, sk, flags, copied);
2717 if (!skb && copied) {
2718 unix_state_unlock(sk);
2719 break;
2720 }
2721 }
2722 #endif
2723 if (skb == NULL) {
2724 if (copied >= target)
2725 goto unlock;
2726
2727 /*
2728 * POSIX 1003.1g mandates this order.
2729 */
2730
2731 err = sock_error(sk);
2732 if (err)
2733 goto unlock;
2734 if (sk->sk_shutdown & RCV_SHUTDOWN)
2735 goto unlock;
2736
2737 unix_state_unlock(sk);
2738 if (!timeo) {
2739 err = -EAGAIN;
2740 break;
2741 }
2742
2743 mutex_unlock(&u->iolock);
2744
2745 timeo = unix_stream_data_wait(sk, timeo, last,
2746 last_len, freezable);
2747
2748 if (signal_pending(current)) {
2749 err = sock_intr_errno(timeo);
2750 scm_destroy(&scm);
2751 goto out;
2752 }
2753
2754 mutex_lock(&u->iolock);
2755 goto redo;
2756 unlock:
2757 unix_state_unlock(sk);
2758 break;
2759 }
2760
2761 while (skip >= unix_skb_len(skb)) {
2762 skip -= unix_skb_len(skb);
2763 last = skb;
2764 last_len = skb->len;
2765 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2766 if (!skb)
2767 goto again;
2768 }
2769
2770 unix_state_unlock(sk);
2771
2772 if (check_creds) {
2773 /* Never glue messages from different writers */
2774 if (!unix_skb_scm_eq(skb, &scm))
2775 break;
2776 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2777 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2778 /* Copy credentials */
2779 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2780 unix_set_secdata(&scm, skb);
2781 check_creds = true;
2782 }
2783
2784 /* Copy address just once */
2785 if (state->msg && state->msg->msg_name) {
2786 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2787 state->msg->msg_name);
2788 unix_copy_addr(state->msg, skb->sk);
2789 sunaddr = NULL;
2790 }
2791
2792 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2793 skb_get(skb);
2794 chunk = state->recv_actor(skb, skip, chunk, state);
2795 drop_skb = !unix_skb_len(skb);
2796 /* skb is only safe to use if !drop_skb */
2797 consume_skb(skb);
2798 if (chunk < 0) {
2799 if (copied == 0)
2800 copied = -EFAULT;
2801 break;
2802 }
2803 copied += chunk;
2804 size -= chunk;
2805
2806 if (drop_skb) {
2807 /* the skb was touched by a concurrent reader;
2808 * we should not expect anything from this skb
2809 * anymore and assume it invalid - we can be
2810 * sure it was dropped from the socket queue
2811 *
2812 * let's report a short read
2813 */
2814 err = 0;
2815 break;
2816 }
2817
2818 /* Mark read part of skb as used */
2819 if (!(flags & MSG_PEEK)) {
2820 UNIXCB(skb).consumed += chunk;
2821
2822 sk_peek_offset_bwd(sk, chunk);
2823
2824 if (UNIXCB(skb).fp) {
2825 scm_stat_del(sk, skb);
2826 unix_detach_fds(&scm, skb);
2827 }
2828
2829 if (unix_skb_len(skb))
2830 break;
2831
2832 skb_unlink(skb, &sk->sk_receive_queue);
2833 consume_skb(skb);
2834
2835 if (scm.fp)
2836 break;
2837 } else {
2838 /* It is questionable, see note in unix_dgram_recvmsg.
2839 */
2840 if (UNIXCB(skb).fp)
2841 unix_peek_fds(&scm, skb);
2842
2843 sk_peek_offset_fwd(sk, chunk);
2844
2845 if (UNIXCB(skb).fp)
2846 break;
2847
2848 skip = 0;
2849 last = skb;
2850 last_len = skb->len;
2851 unix_state_lock(sk);
2852 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2853 if (skb)
2854 goto again;
2855 unix_state_unlock(sk);
2856 break;
2857 }
2858 } while (size);
2859
2860 mutex_unlock(&u->iolock);
2861 if (state->msg)
2862 scm_recv_unix(sock, state->msg, &scm, flags);
2863 else
2864 scm_destroy(&scm);
2865 out:
2866 return copied ? : err;
2867 }
2868
2869 static int unix_stream_read_actor(struct sk_buff *skb,
2870 int skip, int chunk,
2871 struct unix_stream_read_state *state)
2872 {
2873 int ret;
2874
2875 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2876 state->msg, chunk);
2877 return ret ?: chunk;
2878 }
2879
2880 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2881 size_t size, int flags)
2882 {
2883 struct unix_stream_read_state state = {
2884 .recv_actor = unix_stream_read_actor,
2885 .socket = sk->sk_socket,
2886 .msg = msg,
2887 .size = size,
2888 .flags = flags
2889 };
2890
2891 return unix_stream_read_generic(&state, true);
2892 }
2893
2894 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2895 size_t size, int flags)
2896 {
2897 struct unix_stream_read_state state = {
2898 .recv_actor = unix_stream_read_actor,
2899 .socket = sock,
2900 .msg = msg,
2901 .size = size,
2902 .flags = flags
2903 };
2904
2905 #ifdef CONFIG_BPF_SYSCALL
2906 struct sock *sk = sock->sk;
2907 const struct proto *prot = READ_ONCE(sk->sk_prot);
2908
2909 if (prot != &unix_stream_proto)
2910 return prot->recvmsg(sk, msg, size, flags, NULL);
2911 #endif
2912 return unix_stream_read_generic(&state, true);
2913 }
2914
2915 static int unix_stream_splice_actor(struct sk_buff *skb,
2916 int skip, int chunk,
2917 struct unix_stream_read_state *state)
2918 {
2919 return skb_splice_bits(skb, state->socket->sk,
2920 UNIXCB(skb).consumed + skip,
2921 state->pipe, chunk, state->splice_flags);
2922 }
2923
2924 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2925 struct pipe_inode_info *pipe,
2926 size_t size, unsigned int flags)
2927 {
2928 struct unix_stream_read_state state = {
2929 .recv_actor = unix_stream_splice_actor,
2930 .socket = sock,
2931 .pipe = pipe,
2932 .size = size,
2933 .splice_flags = flags,
2934 };
2935
2936 if (unlikely(*ppos))
2937 return -ESPIPE;
2938
2939 if (sock->file->f_flags & O_NONBLOCK ||
2940 flags & SPLICE_F_NONBLOCK)
2941 state.flags = MSG_DONTWAIT;
2942
2943 return unix_stream_read_generic(&state, false);
2944 }
2945
2946 static int unix_shutdown(struct socket *sock, int mode)
2947 {
2948 struct sock *sk = sock->sk;
2949 struct sock *other;
2950
2951 if (mode < SHUT_RD || mode > SHUT_RDWR)
2952 return -EINVAL;
2953 /* This maps:
2954 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2955 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2956 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2957 */
2958 ++mode;
2959
2960 unix_state_lock(sk);
2961 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2962 other = unix_peer(sk);
2963 if (other)
2964 sock_hold(other);
2965 unix_state_unlock(sk);
2966 sk->sk_state_change(sk);
2967
2968 if (other &&
2969 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2970
2971 int peer_mode = 0;
2972 const struct proto *prot = READ_ONCE(other->sk_prot);
2973
2974 if (prot->unhash)
2975 prot->unhash(other);
2976 if (mode&RCV_SHUTDOWN)
2977 peer_mode |= SEND_SHUTDOWN;
2978 if (mode&SEND_SHUTDOWN)
2979 peer_mode |= RCV_SHUTDOWN;
2980 unix_state_lock(other);
2981 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2982 unix_state_unlock(other);
2983 other->sk_state_change(other);
2984 if (peer_mode == SHUTDOWN_MASK)
2985 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2986 else if (peer_mode & RCV_SHUTDOWN)
2987 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2988 }
2989 if (other)
2990 sock_put(other);
2991
2992 return 0;
2993 }
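
/*
 * Seen from userspace (illustrative, not normative): after
 * shutdown(a, SHUT_WR) on one end of a connected pair, the peer is marked
 * RCV_SHUTDOWN, so its poll() reports EPOLLIN | EPOLLRDHUP and read()
 * returns 0 once the receive queue is drained; shutdown(a, SHUT_RDWR)
 * propagates SHUTDOWN_MASK, and the peer additionally sees EPOLLHUP,
 * matching the SOCK_WAKE_WAITD wakeups above.
 */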
2994
2995 long unix_inq_len(struct sock *sk)
2996 {
2997 struct sk_buff *skb;
2998 long amount = 0;
2999
3000 if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3001 return -EINVAL;
3002
3003 spin_lock(&sk->sk_receive_queue.lock);
3004 if (sk->sk_type == SOCK_STREAM ||
3005 sk->sk_type == SOCK_SEQPACKET) {
3006 skb_queue_walk(&sk->sk_receive_queue, skb)
3007 amount += unix_skb_len(skb);
3008 } else {
3009 skb = skb_peek(&sk->sk_receive_queue);
3010 if (skb)
3011 amount = skb->len;
3012 }
3013 spin_unlock(&sk->sk_receive_queue.lock);
3014
3015 return amount;
3016 }
3017 EXPORT_SYMBOL_GPL(unix_inq_len);
3018
3019 long unix_outq_len(struct sock *sk)
3020 {
3021 return sk_wmem_alloc_get(sk);
3022 }
3023 EXPORT_SYMBOL_GPL(unix_outq_len);
3024
3025 static int unix_open_file(struct sock *sk)
3026 {
3027 struct path path;
3028 struct file *f;
3029 int fd;
3030
3031 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3032 return -EPERM;
3033
3034 if (!smp_load_acquire(&unix_sk(sk)->addr))
3035 return -ENOENT;
3036
3037 path = unix_sk(sk)->path;
3038 if (!path.dentry)
3039 return -ENOENT;
3040
3041 path_get(&path);
3042
3043 fd = get_unused_fd_flags(O_CLOEXEC);
3044 if (fd < 0)
3045 goto out;
3046
3047 f = dentry_open(&path, O_PATH, current_cred());
3048 if (IS_ERR(f)) {
3049 put_unused_fd(fd);
3050 fd = PTR_ERR(f);
3051 goto out;
3052 }
3053
3054 fd_install(fd, f);
3055 out:
3056 path_put(&path);
3057
3058 return fd;
3059 }
3060
3061 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3062 {
3063 struct sock *sk = sock->sk;
3064 long amount = 0;
3065 int err;
3066
3067 switch (cmd) {
3068 case SIOCOUTQ:
3069 amount = unix_outq_len(sk);
3070 err = put_user(amount, (int __user *)arg);
3071 break;
3072 case SIOCINQ:
3073 amount = unix_inq_len(sk);
3074 if (amount < 0)
3075 err = amount;
3076 else
3077 err = put_user(amount, (int __user *)arg);
3078 break;
3079 case SIOCUNIXFILE:
3080 err = unix_open_file(sk);
3081 break;
3082 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3083 case SIOCATMARK:
3084 {
3085 struct sk_buff *skb;
3086 int answ = 0;
3087
3088 skb = skb_peek(&sk->sk_receive_queue);
3089 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3090 answ = 1;
3091 err = put_user(answ, (int __user *)arg);
3092 }
3093 break;
3094 #endif
3095 default:
3096 err = -ENOIOCTLCMD;
3097 break;
3098 }
3099 return err;
3100 }
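
/*
 * Userspace illustration of the ioctls handled above (hedged sketch;
 * SIOCINQ and SIOCOUTQ come from <linux/sockios.h>, error handling
 * omitted):
 *
 *	int queued, pending, atmark;
 *
 *	ioctl(fd, SIOCINQ, &queued);
 *	ioctl(fd, SIOCOUTQ, &pending);
 *	ioctl(fd, SIOCATMARK, &atmark);
 *
 * SIOCINQ reports bytes waiting in the receive queue, SIOCOUTQ reports
 * bytes written but not yet consumed by the peer, and SIOCATMARK
 * (CONFIG_AF_UNIX_OOB only) is nonzero when the next unread byte is the
 * out-of-band byte.
 */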
3101
3102 #ifdef CONFIG_COMPAT
3103 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3104 {
3105 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3106 }
3107 #endif
3108
3109 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3110 {
3111 struct sock *sk = sock->sk;
3112 unsigned char state;
3113 __poll_t mask;
3114 u8 shutdown;
3115
3116 sock_poll_wait(file, sock, wait);
3117 mask = 0;
3118 shutdown = READ_ONCE(sk->sk_shutdown);
3119 state = READ_ONCE(sk->sk_state);
3120
3121 /* exceptional events? */
3122 if (READ_ONCE(sk->sk_err))
3123 mask |= EPOLLERR;
3124 if (shutdown == SHUTDOWN_MASK)
3125 mask |= EPOLLHUP;
3126 if (shutdown & RCV_SHUTDOWN)
3127 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3128
3129 /* readable? */
3130 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3131 mask |= EPOLLIN | EPOLLRDNORM;
3132 if (sk_is_readable(sk))
3133 mask |= EPOLLIN | EPOLLRDNORM;
3134 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3135 if (READ_ONCE(unix_sk(sk)->oob_skb))
3136 mask |= EPOLLPRI;
3137 #endif
3138
3139	/* Connection-based sockets need to check for termination and startup */
3140 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3141 state == TCP_CLOSE)
3142 mask |= EPOLLHUP;
3143
3144 /*
3145 * we set writable also when the other side has shut down the
3146 * connection. This prevents stuck sockets.
3147 */
3148 if (unix_writable(sk, state))
3149 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3150
3151 return mask;
3152 }
3153
3154 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3155 poll_table *wait)
3156 {
3157 struct sock *sk = sock->sk, *other;
3158 unsigned int writable;
3159 unsigned char state;
3160 __poll_t mask;
3161 u8 shutdown;
3162
3163 sock_poll_wait(file, sock, wait);
3164 mask = 0;
3165 shutdown = READ_ONCE(sk->sk_shutdown);
3166 state = READ_ONCE(sk->sk_state);
3167
3168 /* exceptional events? */
3169 if (READ_ONCE(sk->sk_err) ||
3170 !skb_queue_empty_lockless(&sk->sk_error_queue))
3171 mask |= EPOLLERR |
3172 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3173
3174 if (shutdown & RCV_SHUTDOWN)
3175 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3176 if (shutdown == SHUTDOWN_MASK)
3177 mask |= EPOLLHUP;
3178
3179 /* readable? */
3180 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3181 mask |= EPOLLIN | EPOLLRDNORM;
3182 if (sk_is_readable(sk))
3183 mask |= EPOLLIN | EPOLLRDNORM;
3184
3185	/* Connection-based sockets need to check for termination and startup */
3186 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3187 mask |= EPOLLHUP;
3188
3189 /* No write status requested, avoid expensive OUT tests. */
3190 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3191 return mask;
3192
3193 writable = unix_writable(sk, state);
3194 if (writable) {
3195 unix_state_lock(sk);
3196
3197 other = unix_peer(sk);
3198 if (other && unix_peer(other) != sk &&
3199 unix_recvq_full_lockless(other) &&
3200 unix_dgram_peer_wake_me(sk, other))
3201 writable = 0;
3202
3203 unix_state_unlock(sk);
3204 }
3205
3206 if (writable)
3207 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3208 else
3209 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3210
3211 return mask;
3212 }
3213
3214 #ifdef CONFIG_PROC_FS
3215
3216 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3217
3218 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3219 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3220 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
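
/*
 * The seq iterator packs (bucket, offset) into a single loff_t: the low
 * BUCKET_SPACE bits carry a 1-based offset within the bucket and the
 * remaining high bits carry the bucket index.  Purely illustrative
 * example: set_bucket_offset(3, 5) == (3 << BUCKET_SPACE) | 5, from which
 * get_bucket() recovers 3 and get_offset() recovers 5.
 */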
3221
3222 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3223 {
3224 unsigned long offset = get_offset(*pos);
3225 unsigned long bucket = get_bucket(*pos);
3226 unsigned long count = 0;
3227 struct sock *sk;
3228
3229 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3230 sk; sk = sk_next(sk)) {
3231 if (++count == offset)
3232 break;
3233 }
3234
3235 return sk;
3236 }
3237
3238 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3239 {
3240 unsigned long bucket = get_bucket(*pos);
3241 struct net *net = seq_file_net(seq);
3242 struct sock *sk;
3243
3244 while (bucket < UNIX_HASH_SIZE) {
3245 spin_lock(&net->unx.table.locks[bucket]);
3246
3247 sk = unix_from_bucket(seq, pos);
3248 if (sk)
3249 return sk;
3250
3251 spin_unlock(&net->unx.table.locks[bucket]);
3252
3253 *pos = set_bucket_offset(++bucket, 1);
3254 }
3255
3256 return NULL;
3257 }
3258
3259 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3260 loff_t *pos)
3261 {
3262 unsigned long bucket = get_bucket(*pos);
3263
3264 sk = sk_next(sk);
3265 if (sk)
3266 return sk;
3267
3268
3269 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3270
3271 *pos = set_bucket_offset(++bucket, 1);
3272
3273 return unix_get_first(seq, pos);
3274 }
3275
3276 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3277 {
3278 if (!*pos)
3279 return SEQ_START_TOKEN;
3280
3281 return unix_get_first(seq, pos);
3282 }
3283
3284 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3285 {
3286 ++*pos;
3287
3288 if (v == SEQ_START_TOKEN)
3289 return unix_get_first(seq, pos);
3290
3291 return unix_get_next(seq, v, pos);
3292 }
3293
3294 static void unix_seq_stop(struct seq_file *seq, void *v)
3295 {
3296 struct sock *sk = v;
3297
3298 if (sk)
3299 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3300 }
3301
3302 static int unix_seq_show(struct seq_file *seq, void *v)
3303 {
3304
3305 if (v == SEQ_START_TOKEN)
3306 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3307 "Inode Path\n");
3308 else {
3309 struct sock *s = v;
3310 struct unix_sock *u = unix_sk(s);
3311 unix_state_lock(s);
3312
3313 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3314 s,
3315 refcount_read(&s->sk_refcnt),
3316 0,
3317 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3318 s->sk_type,
3319 s->sk_socket ?
3320 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3321 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3322 sock_i_ino(s));
3323
3324		if (u->addr) {	/* under a hash table lock here */
3325 int i, len;
3326 seq_putc(seq, ' ');
3327
3328 i = 0;
3329 len = u->addr->len -
3330 offsetof(struct sockaddr_un, sun_path);
3331 if (u->addr->name->sun_path[0]) {
3332 len--;
3333 } else {
3334 seq_putc(seq, '@');
3335 i++;
3336 }
3337 for ( ; i < len; i++)
3338 seq_putc(seq, u->addr->name->sun_path[i] ?:
3339 '@');
3340 }
3341 unix_state_unlock(s);
3342 seq_putc(seq, '\n');
3343 }
3344
3345 return 0;
3346 }
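
/*
 * For reference, the resulting /proc/net/unix lines look roughly like the
 * following made-up example (columns: Num RefCount Protocol Flags Type St
 * Inode Path; the values are illustrative only):
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01 23456 /run/example.sock
 *
 * Flags shows __SO_ACCEPTCON (0x00010000) for listening sockets, Type is
 * the SOCK_* value and St the SS_* socket state.
 */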
3347
3348 static const struct seq_operations unix_seq_ops = {
3349 .start = unix_seq_start,
3350 .next = unix_seq_next,
3351 .stop = unix_seq_stop,
3352 .show = unix_seq_show,
3353 };
3354
3355 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3356 struct bpf_unix_iter_state {
3357 struct seq_net_private p;
3358 unsigned int cur_sk;
3359 unsigned int end_sk;
3360 unsigned int max_sk;
3361 struct sock **batch;
3362 bool st_bucket_done;
3363 };
3364
3365 struct bpf_iter__unix {
3366 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3367 __bpf_md_ptr(struct unix_sock *, unix_sk);
3368 uid_t uid __aligned(8);
3369 };
3370
3371 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3372 struct unix_sock *unix_sk, uid_t uid)
3373 {
3374 struct bpf_iter__unix ctx;
3375
3376 meta->seq_num--; /* skip SEQ_START_TOKEN */
3377 ctx.meta = meta;
3378 ctx.unix_sk = unix_sk;
3379 ctx.uid = uid;
3380 return bpf_iter_run_prog(prog, &ctx);
3381 }
3382
3383 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3384
3385 {
3386 struct bpf_unix_iter_state *iter = seq->private;
3387 unsigned int expected = 1;
3388 struct sock *sk;
3389
3390 sock_hold(start_sk);
3391 iter->batch[iter->end_sk++] = start_sk;
3392
3393 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3394 if (iter->end_sk < iter->max_sk) {
3395 sock_hold(sk);
3396 iter->batch[iter->end_sk++] = sk;
3397 }
3398
3399 expected++;
3400 }
3401
3402 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3403
3404 return expected;
3405 }
3406
3407 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3408 {
3409 while (iter->cur_sk < iter->end_sk)
3410 sock_put(iter->batch[iter->cur_sk++]);
3411 }
3412
3413 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3414 unsigned int new_batch_sz)
3415 {
3416 struct sock **new_batch;
3417
3418 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3419 GFP_USER | __GFP_NOWARN);
3420 if (!new_batch)
3421 return -ENOMEM;
3422
3423 bpf_iter_unix_put_batch(iter);
3424 kvfree(iter->batch);
3425 iter->batch = new_batch;
3426 iter->max_sk = new_batch_sz;
3427
3428 return 0;
3429 }
3430
3431 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3432 loff_t *pos)
3433 {
3434 struct bpf_unix_iter_state *iter = seq->private;
3435 unsigned int expected;
3436 bool resized = false;
3437 struct sock *sk;
3438
3439 if (iter->st_bucket_done)
3440 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3441
3442 again:
3443 /* Get a new batch */
3444 iter->cur_sk = 0;
3445 iter->end_sk = 0;
3446
3447 sk = unix_get_first(seq, pos);
3448 if (!sk)
3449 return NULL; /* Done */
3450
3451 expected = bpf_iter_unix_hold_batch(seq, sk);
3452
3453 if (iter->end_sk == expected) {
3454 iter->st_bucket_done = true;
3455 return sk;
3456 }
3457
3458 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3459 resized = true;
3460 goto again;
3461 }
3462
3463 return sk;
3464 }
3465
3466 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3467 {
3468 if (!*pos)
3469 return SEQ_START_TOKEN;
3470
3471 /* bpf iter does not support lseek, so it always
3472	 * continues from where it was stop()-ped.
3473 */
3474 return bpf_iter_unix_batch(seq, pos);
3475 }
3476
3477 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3478 {
3479 struct bpf_unix_iter_state *iter = seq->private;
3480 struct sock *sk;
3481
3482 /* Whenever seq_next() is called, the iter->cur_sk is
3483 * done with seq_show(), so advance to the next sk in
3484 * the batch.
3485 */
3486 if (iter->cur_sk < iter->end_sk)
3487 sock_put(iter->batch[iter->cur_sk++]);
3488
3489 ++*pos;
3490
3491 if (iter->cur_sk < iter->end_sk)
3492 sk = iter->batch[iter->cur_sk];
3493 else
3494 sk = bpf_iter_unix_batch(seq, pos);
3495
3496 return sk;
3497 }
3498
3499 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3500 {
3501 struct bpf_iter_meta meta;
3502 struct bpf_prog *prog;
3503 struct sock *sk = v;
3504 uid_t uid;
3505 bool slow;
3506 int ret;
3507
3508 if (v == SEQ_START_TOKEN)
3509 return 0;
3510
3511 slow = lock_sock_fast(sk);
3512
3513 if (unlikely(sk_unhashed(sk))) {
3514 ret = SEQ_SKIP;
3515 goto unlock;
3516 }
3517
3518 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3519 meta.seq = seq;
3520 prog = bpf_iter_get_info(&meta, false);
3521 ret = unix_prog_seq_show(prog, &meta, v, uid);
3522 unlock:
3523 unlock_sock_fast(sk, slow);
3524 return ret;
3525 }
3526
3527 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3528 {
3529 struct bpf_unix_iter_state *iter = seq->private;
3530 struct bpf_iter_meta meta;
3531 struct bpf_prog *prog;
3532
3533 if (!v) {
3534 meta.seq = seq;
3535 prog = bpf_iter_get_info(&meta, true);
3536 if (prog)
3537 (void)unix_prog_seq_show(prog, &meta, v, 0);
3538 }
3539
3540 if (iter->cur_sk < iter->end_sk)
3541 bpf_iter_unix_put_batch(iter);
3542 }
3543
3544 static const struct seq_operations bpf_iter_unix_seq_ops = {
3545 .start = bpf_iter_unix_seq_start,
3546 .next = bpf_iter_unix_seq_next,
3547 .stop = bpf_iter_unix_seq_stop,
3548 .show = bpf_iter_unix_seq_show,
3549 };
3550 #endif
3551 #endif
3552
3553 static const struct net_proto_family unix_family_ops = {
3554 .family = PF_UNIX,
3555 .create = unix_create,
3556 .owner = THIS_MODULE,
3557 };
3558
3559
3560 static int __net_init unix_net_init(struct net *net)
3561 {
3562 int i;
3563
3564 net->unx.sysctl_max_dgram_qlen = 10;
3565 if (unix_sysctl_register(net))
3566 goto out;
3567
3568 #ifdef CONFIG_PROC_FS
3569 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3570 sizeof(struct seq_net_private)))
3571 goto err_sysctl;
3572 #endif
3573
3574 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3575 sizeof(spinlock_t), GFP_KERNEL);
3576 if (!net->unx.table.locks)
3577 goto err_proc;
3578
3579 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3580 sizeof(struct hlist_head),
3581 GFP_KERNEL);
3582 if (!net->unx.table.buckets)
3583 goto free_locks;
3584
3585 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3586 spin_lock_init(&net->unx.table.locks[i]);
3587 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3588 }
3589
3590 return 0;
3591
3592 free_locks:
3593 kvfree(net->unx.table.locks);
3594 err_proc:
3595 #ifdef CONFIG_PROC_FS
3596 remove_proc_entry("unix", net->proc_net);
3597 err_sysctl:
3598 #endif
3599 unix_sysctl_unregister(net);
3600 out:
3601 return -ENOMEM;
3602 }
3603
3604 static void __net_exit unix_net_exit(struct net *net)
3605 {
3606 kvfree(net->unx.table.buckets);
3607 kvfree(net->unx.table.locks);
3608 unix_sysctl_unregister(net);
3609 remove_proc_entry("unix", net->proc_net);
3610 }
3611
3612 static struct pernet_operations unix_net_ops = {
3613 .init = unix_net_init,
3614 .exit = unix_net_exit,
3615 };
3616
3617 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3618 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3619 struct unix_sock *unix_sk, uid_t uid)
3620
3621 #define INIT_BATCH_SZ 16
3622
3623 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3624 {
3625 struct bpf_unix_iter_state *iter = priv_data;
3626 int err;
3627
3628 err = bpf_iter_init_seq_net(priv_data, aux);
3629 if (err)
3630 return err;
3631
3632 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3633 if (err) {
3634 bpf_iter_fini_seq_net(priv_data);
3635 return err;
3636 }
3637
3638 return 0;
3639 }
3640
3641 static void bpf_iter_fini_unix(void *priv_data)
3642 {
3643 struct bpf_unix_iter_state *iter = priv_data;
3644
3645 bpf_iter_fini_seq_net(priv_data);
3646 kvfree(iter->batch);
3647 }
3648
3649 static const struct bpf_iter_seq_info unix_seq_info = {
3650 .seq_ops = &bpf_iter_unix_seq_ops,
3651 .init_seq_private = bpf_iter_init_unix,
3652 .fini_seq_private = bpf_iter_fini_unix,
3653 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3654 };
3655
3656 static const struct bpf_func_proto *
3657 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3658 const struct bpf_prog *prog)
3659 {
3660 switch (func_id) {
3661 case BPF_FUNC_setsockopt:
3662 return &bpf_sk_setsockopt_proto;
3663 case BPF_FUNC_getsockopt:
3664 return &bpf_sk_getsockopt_proto;
3665 default:
3666 return NULL;
3667 }
3668 }
3669
3670 static struct bpf_iter_reg unix_reg_info = {
3671 .target = "unix",
3672 .ctx_arg_info_size = 1,
3673 .ctx_arg_info = {
3674 { offsetof(struct bpf_iter__unix, unix_sk),
3675 PTR_TO_BTF_ID_OR_NULL },
3676 },
3677 .get_func_proto = bpf_iter_unix_get_func_proto,
3678 .seq_info = &unix_seq_info,
3679 };
3680
3681 static void __init bpf_iter_register(void)
3682 {
3683 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3684 if (bpf_iter_reg_target(&unix_reg_info))
3685 pr_warn("Warning: could not register bpf iterator unix\n");
3686 }
3687 #endif
3688
3689 static int __init af_unix_init(void)
3690 {
3691 int i, rc = -1;
3692
3693 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3694
3695 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3696 spin_lock_init(&bsd_socket_locks[i]);
3697 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3698 }
3699
3700 rc = proto_register(&unix_dgram_proto, 1);
3701 if (rc != 0) {
3702 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3703 goto out;
3704 }
3705
3706 rc = proto_register(&unix_stream_proto, 1);
3707 if (rc != 0) {
3708 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3709 proto_unregister(&unix_dgram_proto);
3710 goto out;
3711 }
3712
3713 sock_register(&unix_family_ops);
3714 register_pernet_subsys(&unix_net_ops);
3715 unix_bpf_build_proto();
3716
3717 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3718 bpf_iter_register();
3719 #endif
3720
3721 out:
3722 return rc;
3723 }
3724
3725 static void __exit af_unix_exit(void)
3726 {
3727 sock_unregister(PF_UNIX);
3728 proto_unregister(&unix_dgram_proto);
3729 proto_unregister(&unix_stream_proto);
3730 unregister_pernet_subsys(&unix_net_ops);
3731 }
3732
3733 /* Earlier than device_initcall() so that other drivers invoking
3734 request_module() don't end up in a loop when modprobe tries
3735 to use a UNIX socket. But later than subsys_initcall() because
3736 we depend on stuff initialised there */
3737 fs_initcall(af_unix_init);
3738 module_exit(af_unix_exit);
3739
3740 MODULE_LICENSE("GPL");
3741 MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver);
3742 MODULE_ALIAS_NETPROTO(PF_UNIX);
3743