1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
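/* Illustrative user-space sketch (not part of this file): an abstract name as
 * described above keeps sun_path[0] as a zero byte and counts only the bytes
 * actually used in the address length, e.g.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	memcpy(a.sun_path + 1, "example", 7);	// name is "\0example"
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */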
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 #ifdef CONFIG_PROVE_LOCKING
130 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
131 
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133 				  const struct lockdep_map *b)
134 {
135 	return cmp_ptr(a, b);
136 }
137 
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139 				  const struct lockdep_map *_b)
140 {
141 	const struct unix_sock *a, *b;
142 
143 	a = container_of(_a, struct unix_sock, lock.dep_map);
144 	b = container_of(_b, struct unix_sock, lock.dep_map);
145 
146 	if (a->sk.sk_state == TCP_LISTEN) {
147 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
148 		 *
149 		 *   1. a is TCP_LISTEN.
150 		 *   2. b is not a.
151 		 *   3. concurrent connect(b -> a) must fail.
152 		 *
153 		 * Except for 2. & 3., the b's state can be any possible
154 		 * value due to concurrent connect() or listen().
155 		 *
156 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157 		 * be expressed as lock_cmp_fn.
158 		 */
159 		switch (b->sk.sk_state) {
160 		case TCP_CLOSE:
161 		case TCP_ESTABLISHED:
162 		case TCP_LISTEN:
163 			return -1;
164 		default:
165 			/* Invalid case. */
166 			return 0;
167 		}
168 	}
169 
170 	/* Should never happen.  Just to be symmetric. */
171 	if (b->sk.sk_state == TCP_LISTEN) {
172 		switch (a->sk.sk_state) {
173 		case TCP_CLOSE:
174 		case TCP_ESTABLISHED:
175 			return 1;
176 		default:
177 			return 0;
178 		}
179 	}
180 
181 	/* unix_state_double_lock(): ascending address order. */
182 	return cmp_ptr(a, b);
183 }
184 
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186 				  const struct lockdep_map *_b)
187 {
188 	const struct sock *a, *b;
189 
190 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192 
193 	/* unix_collect_skb(): listener -> embryo order. */
194 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195 		return -1;
196 
197 	/* Should never happen.  Just to be symmetric. */
198 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199 		return 1;
200 
201 	return 0;
202 }
203 #endif
204 
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207 	unsigned long hash = (unsigned long)sk;
208 
209 	hash ^= hash >> 16;
210 	hash ^= hash >> 8;
211 	hash ^= sk->sk_type;
212 
213 	return hash & UNIX_HASH_MOD;
214 }
215 
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218 	return i->i_ino & UNIX_HASH_MOD;
219 }
220 
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222 				       int addr_len, int type)
223 {
224 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
225 	unsigned int hash;
226 
227 	hash = (__force unsigned int)csum_fold(csum);
228 	hash ^= hash >> 8;
229 	hash ^= type;
230 
231 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
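/* Bucket layout implied by the helpers above: unbound and filesystem (BSD)
 * sockets hash into [0, UNIX_HASH_MOD]; abstract sockets hash into
 * [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1], so the two namespaces never
 * share a bucket (assuming UNIX_HASH_MOD == UNIX_HASH_SIZE / 2 - 1 as in
 * af_unix.h).
 */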
233 
234 static void unix_table_double_lock(struct net *net,
235 				   unsigned int hash1, unsigned int hash2)
236 {
237 	if (hash1 == hash2) {
238 		spin_lock(&net->unx.table.locks[hash1]);
239 		return;
240 	}
241 
242 	if (hash1 > hash2)
243 		swap(hash1, hash2);
244 
245 	spin_lock(&net->unx.table.locks[hash1]);
246 	spin_lock(&net->unx.table.locks[hash2]);
247 }
248 
249 static void unix_table_double_unlock(struct net *net,
250 				     unsigned int hash1, unsigned int hash2)
251 {
252 	if (hash1 == hash2) {
253 		spin_unlock(&net->unx.table.locks[hash1]);
254 		return;
255 	}
256 
257 	spin_unlock(&net->unx.table.locks[hash1]);
258 	spin_unlock(&net->unx.table.locks[hash2]);
259 }
260 
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264 	UNIXCB(skb).secid = scm->secid;
265 }
266 
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269 	scm->secid = UNIXCB(skb).secid;
270 }
271 
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274 	return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279 
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282 
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285 	return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288 
289 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
290 {
291 	return unix_peer(osk) == sk;
292 }
293 
294 static inline int unix_may_send(struct sock *sk, struct sock *osk)
295 {
296 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
297 }
298 
299 static inline int unix_recvq_full_lockless(const struct sock *sk)
300 {
301 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
302 }
303 
304 struct sock *unix_peer_get(struct sock *s)
305 {
306 	struct sock *peer;
307 
308 	unix_state_lock(s);
309 	peer = unix_peer(s);
310 	if (peer)
311 		sock_hold(peer);
312 	unix_state_unlock(s);
313 	return peer;
314 }
315 EXPORT_SYMBOL_GPL(unix_peer_get);
316 
317 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
318 					     int addr_len)
319 {
320 	struct unix_address *addr;
321 
322 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
323 	if (!addr)
324 		return NULL;
325 
326 	refcount_set(&addr->refcnt, 1);
327 	addr->len = addr_len;
328 	memcpy(addr->name, sunaddr, addr_len);
329 
330 	return addr;
331 }
332 
333 static inline void unix_release_addr(struct unix_address *addr)
334 {
335 	if (refcount_dec_and_test(&addr->refcnt))
336 		kfree(addr);
337 }
338 
339 /*
340  *	Check unix socket name:
341  *		- should not be zero length.
342  *		- if it does not start with a zero byte, it should be NUL-terminated (FS object)
343  *		- if it starts with a zero byte, it is an abstract name.
344  */
345 
346 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
347 {
348 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
349 	    addr_len > sizeof(*sunaddr))
350 		return -EINVAL;
351 
352 	if (sunaddr->sun_family != AF_UNIX)
353 		return -EINVAL;
354 
355 	return 0;
356 }
357 
358 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
359 {
360 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
361 	short offset = offsetof(struct sockaddr_storage, __data);
362 
363 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
364 
365 	/* This may look like an off by one error but it is a bit more
366 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
367 	 * sun_path[108] doesn't as such exist.  However in kernel space
368 	 * we are guaranteed that it is a valid memory location in our
369 	 * kernel address buffer because syscall functions always pass
370 	 * a pointer of struct sockaddr_storage which has a bigger buffer
371 	 * than 108.  Also, we must terminate sun_path for strlen() in
372 	 * getname_kernel().
373 	 */
374 	addr->__data[addr_len - offset] = 0;
375 
376 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
377 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
378 	 * know the actual buffer.
379 	 */
380 	return strlen(addr->__data) + offset + 1;
381 }
382 
383 static void __unix_remove_socket(struct sock *sk)
384 {
385 	sk_del_node_init(sk);
386 }
387 
388 static void __unix_insert_socket(struct net *net, struct sock *sk)
389 {
390 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
391 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
392 }
393 
394 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
395 				 struct unix_address *addr, unsigned int hash)
396 {
397 	__unix_remove_socket(sk);
398 	smp_store_release(&unix_sk(sk)->addr, addr);
399 
400 	sk->sk_hash = hash;
401 	__unix_insert_socket(net, sk);
402 }
403 
404 static void unix_remove_socket(struct net *net, struct sock *sk)
405 {
406 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
407 	__unix_remove_socket(sk);
408 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
409 }
410 
411 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
412 {
413 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
414 	__unix_insert_socket(net, sk);
415 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
416 }
417 
418 static void unix_insert_bsd_socket(struct sock *sk)
419 {
420 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
421 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
422 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
423 }
424 
425 static void unix_remove_bsd_socket(struct sock *sk)
426 {
427 	if (!hlist_unhashed(&sk->sk_bind_node)) {
428 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
429 		__sk_del_bind_node(sk);
430 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
431 
432 		sk_node_init(&sk->sk_bind_node);
433 	}
434 }
435 
436 static struct sock *__unix_find_socket_byname(struct net *net,
437 					      struct sockaddr_un *sunname,
438 					      int len, unsigned int hash)
439 {
440 	struct sock *s;
441 
442 	sk_for_each(s, &net->unx.table.buckets[hash]) {
443 		struct unix_sock *u = unix_sk(s);
444 
445 		if (u->addr->len == len &&
446 		    !memcmp(u->addr->name, sunname, len))
447 			return s;
448 	}
449 	return NULL;
450 }
451 
452 static inline struct sock *unix_find_socket_byname(struct net *net,
453 						   struct sockaddr_un *sunname,
454 						   int len, unsigned int hash)
455 {
456 	struct sock *s;
457 
458 	spin_lock(&net->unx.table.locks[hash]);
459 	s = __unix_find_socket_byname(net, sunname, len, hash);
460 	if (s)
461 		sock_hold(s);
462 	spin_unlock(&net->unx.table.locks[hash]);
463 	return s;
464 }
465 
466 static struct sock *unix_find_socket_byinode(struct inode *i)
467 {
468 	unsigned int hash = unix_bsd_hash(i);
469 	struct sock *s;
470 
471 	spin_lock(&bsd_socket_locks[hash]);
472 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
473 		struct dentry *dentry = unix_sk(s)->path.dentry;
474 
475 		if (dentry && d_backing_inode(dentry) == i) {
476 			sock_hold(s);
477 			spin_unlock(&bsd_socket_locks[hash]);
478 			return s;
479 		}
480 	}
481 	spin_unlock(&bsd_socket_locks[hash]);
482 	return NULL;
483 }
484 
485 /* Support code for asymmetrically connected dgram sockets
486  *
487  * If a datagram socket is connected to a socket not itself connected
488  * to the first socket (eg, /dev/log), clients may only enqueue more
489  * messages if the present receive queue of the server socket is not
490  * "too large". This means there's a second writeability condition
491  * poll and sendmsg need to test. The dgram recv code will do a wake
492  * up on the peer_wait wait queue of a socket upon reception of a
493  * datagram which needs to be propagated to sleeping would-be writers
494  * since these might not have sent anything so far. This can't be
495  * accomplished via poll_wait because the lifetime of the server
496  * socket might be less than that of its clients if these break their
497  * association with it or if the server socket is closed while clients
498  * are still connected to it and there's no way to inform "a polling
499  * implementation" that it should let go of a certain wait queue
500  *
501  * In order to propagate a wake up, a wait_queue_entry_t of the client
502  * socket is enqueued on the peer_wait queue of the server socket
503  * whose wake function does a wake_up on the ordinary client socket
504  * wait queue. This connection is established whenever a write (or
505  * poll for write) hits the flow control condition and is broken when the
506  * association to the server socket is dissolved or after a wake up
507  * was relayed.
508  */
509 
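/* The helpers below implement the scheme described above:
 * unix_dgram_peer_wake_connect() enqueues the client on the peer's peer_wait
 * queue, unix_dgram_peer_wake_disconnect() removes it, the relay callback
 * forwards a single wake-up to the client's own wait queue, and
 * unix_dgram_peer_wake_me() is the check used by poll() and sendmsg().
 */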
510 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
511 				      void *key)
512 {
513 	struct unix_sock *u;
514 	wait_queue_head_t *u_sleep;
515 
516 	u = container_of(q, struct unix_sock, peer_wake);
517 
518 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
519 			    q);
520 	u->peer_wake.private = NULL;
521 
522 	/* relaying can only happen while the wq still exists */
523 	u_sleep = sk_sleep(&u->sk);
524 	if (u_sleep)
525 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
526 
527 	return 0;
528 }
529 
530 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
531 {
532 	struct unix_sock *u, *u_other;
533 	int rc;
534 
535 	u = unix_sk(sk);
536 	u_other = unix_sk(other);
537 	rc = 0;
538 	spin_lock(&u_other->peer_wait.lock);
539 
540 	if (!u->peer_wake.private) {
541 		u->peer_wake.private = other;
542 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
543 
544 		rc = 1;
545 	}
546 
547 	spin_unlock(&u_other->peer_wait.lock);
548 	return rc;
549 }
550 
551 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
552 					    struct sock *other)
553 {
554 	struct unix_sock *u, *u_other;
555 
556 	u = unix_sk(sk);
557 	u_other = unix_sk(other);
558 	spin_lock(&u_other->peer_wait.lock);
559 
560 	if (u->peer_wake.private == other) {
561 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
562 		u->peer_wake.private = NULL;
563 	}
564 
565 	spin_unlock(&u_other->peer_wait.lock);
566 }
567 
568 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
569 						   struct sock *other)
570 {
571 	unix_dgram_peer_wake_disconnect(sk, other);
572 	wake_up_interruptible_poll(sk_sleep(sk),
573 				   EPOLLOUT |
574 				   EPOLLWRNORM |
575 				   EPOLLWRBAND);
576 }
577 
578 /* preconditions:
579  *	- unix_peer(sk) == other
580  *	- association is stable
581  */
582 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
583 {
584 	int connected;
585 
586 	connected = unix_dgram_peer_wake_connect(sk, other);
587 
588 	/* If other is SOCK_DEAD, we want to make sure we signal
589 	 * POLLOUT, such that a subsequent write() can get a
590 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
591 	 * to other and its full, we will hang waiting for POLLOUT.
592 	 */
593 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
594 		return 1;
595 
596 	if (connected)
597 		unix_dgram_peer_wake_disconnect(sk, other);
598 
599 	return 0;
600 }
601 
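/* A socket counts as writable while it is not listening and queued skbs
 * consume less than a quarter of sk_sndbuf (wmem_alloc * 4 <= sndbuf).
 */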
602 static int unix_writable(const struct sock *sk, unsigned char state)
603 {
604 	return state != TCP_LISTEN &&
605 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
606 }
607 
608 static void unix_write_space(struct sock *sk)
609 {
610 	struct socket_wq *wq;
611 
612 	rcu_read_lock();
613 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
614 		wq = rcu_dereference(sk->sk_wq);
615 		if (skwq_has_sleeper(wq))
616 			wake_up_interruptible_sync_poll(&wq->wait,
617 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
618 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
619 	}
620 	rcu_read_unlock();
621 }
622 
623 /* When dgram socket disconnects (or changes its peer), we clear its receive
624  * queue of packets that arrived from the previous peer. First, this allows
625  * flow control based only on wmem_alloc; second, an sk connected to a peer
626  * may receive messages only from that peer. */
627 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
628 {
629 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
630 		skb_queue_purge(&sk->sk_receive_queue);
631 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
632 
633 		/* If one link of bidirectional dgram pipe is disconnected,
634 		 * we signal error. Messages are lost. Do not make this,
635 		 * when peer was not connected to us.
636 		 */
637 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
638 			WRITE_ONCE(other->sk_err, ECONNRESET);
639 			sk_error_report(other);
640 		}
641 	}
642 }
643 
644 static void unix_sock_destructor(struct sock *sk)
645 {
646 	struct unix_sock *u = unix_sk(sk);
647 
648 	skb_queue_purge(&sk->sk_receive_queue);
649 
650 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
651 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
652 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
653 	if (!sock_flag(sk, SOCK_DEAD)) {
654 		pr_info("Attempt to release alive unix socket: %p\n", sk);
655 		return;
656 	}
657 
658 	if (u->addr)
659 		unix_release_addr(u->addr);
660 
661 	atomic_long_dec(&unix_nr_socks);
662 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
663 #ifdef UNIX_REFCNT_DEBUG
664 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
665 		atomic_long_read(&unix_nr_socks));
666 #endif
667 }
668 
669 static unsigned int unix_skb_len(const struct sk_buff *skb)
670 {
671 	return skb->len - UNIXCB(skb).consumed;
672 }
673 
674 static void unix_release_sock(struct sock *sk, int embrion)
675 {
676 	struct unix_sock *u = unix_sk(sk);
677 	struct sock *skpair;
678 	struct sk_buff *skb;
679 	struct path path;
680 	int state;
681 
682 	unix_remove_socket(sock_net(sk), sk);
683 	unix_remove_bsd_socket(sk);
684 
685 	/* Clear state */
686 	unix_state_lock(sk);
687 	sock_orphan(sk);
688 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
689 	path	     = u->path;
690 	u->path.dentry = NULL;
691 	u->path.mnt = NULL;
692 	state = sk->sk_state;
693 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
694 
695 	skpair = unix_peer(sk);
696 	unix_peer(sk) = NULL;
697 
698 	unix_state_unlock(sk);
699 
700 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
701 	u->oob_skb = NULL;
702 #endif
703 
704 	wake_up_interruptible_all(&u->peer_wait);
705 
706 	if (skpair != NULL) {
707 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
708 			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
709 
710 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
711 			if (skb && !unix_skb_len(skb))
712 				skb = skb_peek_next(skb, &sk->sk_receive_queue);
713 #endif
714 			unix_state_lock(skpair);
715 			/* No more writes */
716 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
717 			if (skb || embrion)
718 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
719 			unix_state_unlock(skpair);
720 			skpair->sk_state_change(skpair);
721 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
722 		}
723 
724 		unix_dgram_peer_wake_disconnect(sk, skpair);
725 		sock_put(skpair); /* It may now die */
726 	}
727 
728 	/* Try to flush out this socket. Throw out buffers at least */
729 
730 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
731 		if (state == TCP_LISTEN)
732 			unix_release_sock(skb->sk, 1);
733 
734 		/* passed fds are erased in the kfree_skb hook	      */
735 		kfree_skb(skb);
736 	}
737 
738 	if (path.dentry)
739 		path_put(&path);
740 
741 	sock_put(sk);
742 
743 	/* ---- Socket is dead now and most probably destroyed ---- */
744 
745 	/*
746 	 * Fixme: BSD difference: In BSD all sockets connected to us get
747 	 *	  ECONNRESET and we die on the spot. In Linux we behave
748 	 *	  like files and pipes do and wait for the last
749 	 *	  dereference.
750 	 *
751 	 * Can't we simply set sock->err?
752 	 *
753 	 *	  What the above comment does talk about? --ANK(980817)
754 	 */
755 
756 	if (READ_ONCE(unix_tot_inflight))
757 		unix_gc();		/* Garbage collect fds */
758 }
759 
760 static void init_peercred(struct sock *sk)
761 {
762 	sk->sk_peer_pid = get_pid(task_tgid(current));
763 	sk->sk_peer_cred = get_current_cred();
764 }
765 
766 static void update_peercred(struct sock *sk)
767 {
768 	const struct cred *old_cred;
769 	struct pid *old_pid;
770 
771 	spin_lock(&sk->sk_peer_lock);
772 	old_pid = sk->sk_peer_pid;
773 	old_cred = sk->sk_peer_cred;
774 	init_peercred(sk);
775 	spin_unlock(&sk->sk_peer_lock);
776 
777 	put_pid(old_pid);
778 	put_cred(old_cred);
779 }
780 
781 static void copy_peercred(struct sock *sk, struct sock *peersk)
782 {
783 	lockdep_assert_held(&unix_sk(peersk)->lock);
784 
785 	spin_lock(&sk->sk_peer_lock);
786 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
787 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
788 	spin_unlock(&sk->sk_peer_lock);
789 }
790 
791 static int unix_listen(struct socket *sock, int backlog)
792 {
793 	int err;
794 	struct sock *sk = sock->sk;
795 	struct unix_sock *u = unix_sk(sk);
796 
797 	err = -EOPNOTSUPP;
798 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
799 		goto out;	/* Only stream/seqpacket sockets accept */
800 	err = -EINVAL;
801 	if (!READ_ONCE(u->addr))
802 		goto out;	/* No listens on an unbound socket */
803 	unix_state_lock(sk);
804 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
805 		goto out_unlock;
806 	if (backlog > sk->sk_max_ack_backlog)
807 		wake_up_interruptible_all(&u->peer_wait);
808 	sk->sk_max_ack_backlog	= backlog;
809 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
810 
811 	/* set credentials so connect can copy them */
812 	update_peercred(sk);
813 	err = 0;
814 
815 out_unlock:
816 	unix_state_unlock(sk);
817 out:
818 	return err;
819 }
820 
821 static int unix_release(struct socket *);
822 static int unix_bind(struct socket *, struct sockaddr *, int);
823 static int unix_stream_connect(struct socket *, struct sockaddr *,
824 			       int addr_len, int flags);
825 static int unix_socketpair(struct socket *, struct socket *);
826 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
827 static int unix_getname(struct socket *, struct sockaddr *, int);
828 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
829 static __poll_t unix_dgram_poll(struct file *, struct socket *,
830 				    poll_table *);
831 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
832 #ifdef CONFIG_COMPAT
833 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
834 #endif
835 static int unix_shutdown(struct socket *, int);
836 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
837 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
838 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
839 				       struct pipe_inode_info *, size_t size,
840 				       unsigned int flags);
841 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
842 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
843 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
844 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
845 static int unix_dgram_connect(struct socket *, struct sockaddr *,
846 			      int, int);
847 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
848 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
849 				  int);
850 
851 #ifdef CONFIG_PROC_FS
852 static int unix_count_nr_fds(struct sock *sk)
853 {
854 	struct sk_buff *skb;
855 	struct unix_sock *u;
856 	int nr_fds = 0;
857 
858 	spin_lock(&sk->sk_receive_queue.lock);
859 	skb = skb_peek(&sk->sk_receive_queue);
860 	while (skb) {
861 		u = unix_sk(skb->sk);
862 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
863 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
864 	}
865 	spin_unlock(&sk->sk_receive_queue.lock);
866 
867 	return nr_fds;
868 }
869 
870 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
871 {
872 	struct sock *sk = sock->sk;
873 	unsigned char s_state;
874 	struct unix_sock *u;
875 	int nr_fds = 0;
876 
877 	if (sk) {
878 		s_state = READ_ONCE(sk->sk_state);
879 		u = unix_sk(sk);
880 
881 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
882 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
883 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
884 		 */
885 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
886 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
887 		else if (s_state == TCP_LISTEN)
888 			nr_fds = unix_count_nr_fds(sk);
889 
890 		seq_printf(m, "scm_fds: %u\n", nr_fds);
891 	}
892 }
893 #else
894 #define unix_show_fdinfo NULL
895 #endif
896 
897 static const struct proto_ops unix_stream_ops = {
898 	.family =	PF_UNIX,
899 	.owner =	THIS_MODULE,
900 	.release =	unix_release,
901 	.bind =		unix_bind,
902 	.connect =	unix_stream_connect,
903 	.socketpair =	unix_socketpair,
904 	.accept =	unix_accept,
905 	.getname =	unix_getname,
906 	.poll =		unix_poll,
907 	.ioctl =	unix_ioctl,
908 #ifdef CONFIG_COMPAT
909 	.compat_ioctl =	unix_compat_ioctl,
910 #endif
911 	.listen =	unix_listen,
912 	.shutdown =	unix_shutdown,
913 	.sendmsg =	unix_stream_sendmsg,
914 	.recvmsg =	unix_stream_recvmsg,
915 	.read_skb =	unix_stream_read_skb,
916 	.mmap =		sock_no_mmap,
917 	.splice_read =	unix_stream_splice_read,
918 	.set_peek_off =	sk_set_peek_off,
919 	.show_fdinfo =	unix_show_fdinfo,
920 };
921 
922 static const struct proto_ops unix_dgram_ops = {
923 	.family =	PF_UNIX,
924 	.owner =	THIS_MODULE,
925 	.release =	unix_release,
926 	.bind =		unix_bind,
927 	.connect =	unix_dgram_connect,
928 	.socketpair =	unix_socketpair,
929 	.accept =	sock_no_accept,
930 	.getname =	unix_getname,
931 	.poll =		unix_dgram_poll,
932 	.ioctl =	unix_ioctl,
933 #ifdef CONFIG_COMPAT
934 	.compat_ioctl =	unix_compat_ioctl,
935 #endif
936 	.listen =	sock_no_listen,
937 	.shutdown =	unix_shutdown,
938 	.sendmsg =	unix_dgram_sendmsg,
939 	.read_skb =	unix_read_skb,
940 	.recvmsg =	unix_dgram_recvmsg,
941 	.mmap =		sock_no_mmap,
942 	.set_peek_off =	sk_set_peek_off,
943 	.show_fdinfo =	unix_show_fdinfo,
944 };
945 
946 static const struct proto_ops unix_seqpacket_ops = {
947 	.family =	PF_UNIX,
948 	.owner =	THIS_MODULE,
949 	.release =	unix_release,
950 	.bind =		unix_bind,
951 	.connect =	unix_stream_connect,
952 	.socketpair =	unix_socketpair,
953 	.accept =	unix_accept,
954 	.getname =	unix_getname,
955 	.poll =		unix_dgram_poll,
956 	.ioctl =	unix_ioctl,
957 #ifdef CONFIG_COMPAT
958 	.compat_ioctl =	unix_compat_ioctl,
959 #endif
960 	.listen =	unix_listen,
961 	.shutdown =	unix_shutdown,
962 	.sendmsg =	unix_seqpacket_sendmsg,
963 	.recvmsg =	unix_seqpacket_recvmsg,
964 	.mmap =		sock_no_mmap,
965 	.set_peek_off =	sk_set_peek_off,
966 	.show_fdinfo =	unix_show_fdinfo,
967 };
968 
969 static void unix_close(struct sock *sk, long timeout)
970 {
971 	/* Nothing to do here, unix socket does not need a ->close().
972 	 * This is merely for sockmap.
973 	 */
974 }
975 
976 static void unix_unhash(struct sock *sk)
977 {
978 	/* Nothing to do here, unix socket does not need a ->unhash().
979 	 * This is merely for sockmap.
980 	 */
981 }
982 
983 static bool unix_bpf_bypass_getsockopt(int level, int optname)
984 {
985 	if (level == SOL_SOCKET) {
986 		switch (optname) {
987 		case SO_PEERPIDFD:
988 			return true;
989 		default:
990 			return false;
991 		}
992 	}
993 
994 	return false;
995 }
996 
997 struct proto unix_dgram_proto = {
998 	.name			= "UNIX",
999 	.owner			= THIS_MODULE,
1000 	.obj_size		= sizeof(struct unix_sock),
1001 	.close			= unix_close,
1002 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1003 #ifdef CONFIG_BPF_SYSCALL
1004 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
1005 #endif
1006 };
1007 
1008 struct proto unix_stream_proto = {
1009 	.name			= "UNIX-STREAM",
1010 	.owner			= THIS_MODULE,
1011 	.obj_size		= sizeof(struct unix_sock),
1012 	.close			= unix_close,
1013 	.unhash			= unix_unhash,
1014 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1015 #ifdef CONFIG_BPF_SYSCALL
1016 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1017 #endif
1018 };
1019 
1020 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1021 {
1022 	struct unix_sock *u;
1023 	struct sock *sk;
1024 	int err;
1025 
1026 	atomic_long_inc(&unix_nr_socks);
1027 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1028 		err = -ENFILE;
1029 		goto err;
1030 	}
1031 
1032 	if (type == SOCK_STREAM)
1033 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1034 	else /*dgram and  seqpacket */
1035 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1036 
1037 	if (!sk) {
1038 		err = -ENOMEM;
1039 		goto err;
1040 	}
1041 
1042 	sock_init_data(sock, sk);
1043 
1044 	sk->sk_hash		= unix_unbound_hash(sk);
1045 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1046 	sk->sk_write_space	= unix_write_space;
1047 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1048 	sk->sk_destruct		= unix_sock_destructor;
1049 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1050 
1051 	u = unix_sk(sk);
1052 	u->listener = NULL;
1053 	u->vertex = NULL;
1054 	u->path.dentry = NULL;
1055 	u->path.mnt = NULL;
1056 	spin_lock_init(&u->lock);
1057 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1058 	mutex_init(&u->iolock); /* single task reading lock */
1059 	mutex_init(&u->bindlock); /* single task binding lock */
1060 	init_waitqueue_head(&u->peer_wait);
1061 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1062 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1063 	unix_insert_unbound_socket(net, sk);
1064 
1065 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1066 
1067 	return sk;
1068 
1069 err:
1070 	atomic_long_dec(&unix_nr_socks);
1071 	return ERR_PTR(err);
1072 }
1073 
1074 static int unix_create(struct net *net, struct socket *sock, int protocol,
1075 		       int kern)
1076 {
1077 	struct sock *sk;
1078 
1079 	if (protocol && protocol != PF_UNIX)
1080 		return -EPROTONOSUPPORT;
1081 
1082 	sock->state = SS_UNCONNECTED;
1083 
1084 	switch (sock->type) {
1085 	case SOCK_STREAM:
1086 		sock->ops = &unix_stream_ops;
1087 		break;
1088 		/*
1089 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1090 		 *	nothing uses it.
1091 		 */
1092 	case SOCK_RAW:
1093 		sock->type = SOCK_DGRAM;
1094 		fallthrough;
1095 	case SOCK_DGRAM:
1096 		sock->ops = &unix_dgram_ops;
1097 		break;
1098 	case SOCK_SEQPACKET:
1099 		sock->ops = &unix_seqpacket_ops;
1100 		break;
1101 	default:
1102 		return -ESOCKTNOSUPPORT;
1103 	}
1104 
1105 	sk = unix_create1(net, sock, kern, sock->type);
1106 	if (IS_ERR(sk))
1107 		return PTR_ERR(sk);
1108 
1109 	return 0;
1110 }
1111 
1112 static int unix_release(struct socket *sock)
1113 {
1114 	struct sock *sk = sock->sk;
1115 
1116 	if (!sk)
1117 		return 0;
1118 
1119 	sk->sk_prot->close(sk, 0);
1120 	unix_release_sock(sk, 0);
1121 	sock->sk = NULL;
1122 
1123 	return 0;
1124 }
1125 
1126 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1127 				  int type)
1128 {
1129 	struct inode *inode;
1130 	struct path path;
1131 	struct sock *sk;
1132 	int err;
1133 
1134 	unix_mkname_bsd(sunaddr, addr_len);
1135 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1136 	if (err)
1137 		goto fail;
1138 
1139 	err = path_permission(&path, MAY_WRITE);
1140 	if (err)
1141 		goto path_put;
1142 
1143 	err = -ECONNREFUSED;
1144 	inode = d_backing_inode(path.dentry);
1145 	if (!S_ISSOCK(inode->i_mode))
1146 		goto path_put;
1147 
1148 	sk = unix_find_socket_byinode(inode);
1149 	if (!sk)
1150 		goto path_put;
1151 
1152 	err = -EPROTOTYPE;
1153 	if (sk->sk_type == type)
1154 		touch_atime(&path);
1155 	else
1156 		goto sock_put;
1157 
1158 	path_put(&path);
1159 
1160 	return sk;
1161 
1162 sock_put:
1163 	sock_put(sk);
1164 path_put:
1165 	path_put(&path);
1166 fail:
1167 	return ERR_PTR(err);
1168 }
1169 
1170 static struct sock *unix_find_abstract(struct net *net,
1171 				       struct sockaddr_un *sunaddr,
1172 				       int addr_len, int type)
1173 {
1174 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1175 	struct dentry *dentry;
1176 	struct sock *sk;
1177 
1178 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1179 	if (!sk)
1180 		return ERR_PTR(-ECONNREFUSED);
1181 
1182 	dentry = unix_sk(sk)->path.dentry;
1183 	if (dentry)
1184 		touch_atime(&unix_sk(sk)->path);
1185 
1186 	return sk;
1187 }
1188 
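/* Lookup used by connect()/sendmsg(): a name starting with a non-zero byte is
 * resolved through the filesystem (unix_find_bsd()), a leading zero byte
 * selects the abstract namespace (unix_find_abstract()).
 */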
1189 static struct sock *unix_find_other(struct net *net,
1190 				    struct sockaddr_un *sunaddr,
1191 				    int addr_len, int type)
1192 {
1193 	struct sock *sk;
1194 
1195 	if (sunaddr->sun_path[0])
1196 		sk = unix_find_bsd(sunaddr, addr_len, type);
1197 	else
1198 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1199 
1200 	return sk;
1201 }
1202 
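/* Autobind assigns an abstract address of the form "\0XXXXX" (five hex digits,
 * see the sprintf() below), retrying until an unused name is found or the
 * 2^20 name space is exhausted (-ENOSPC).
 */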
1203 static int unix_autobind(struct sock *sk)
1204 {
1205 	struct unix_sock *u = unix_sk(sk);
1206 	unsigned int new_hash, old_hash;
1207 	struct net *net = sock_net(sk);
1208 	struct unix_address *addr;
1209 	u32 lastnum, ordernum;
1210 	int err;
1211 
1212 	err = mutex_lock_interruptible(&u->bindlock);
1213 	if (err)
1214 		return err;
1215 
1216 	if (u->addr)
1217 		goto out;
1218 
1219 	err = -ENOMEM;
1220 	addr = kzalloc(sizeof(*addr) +
1221 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1222 	if (!addr)
1223 		goto out;
1224 
1225 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1226 	addr->name->sun_family = AF_UNIX;
1227 	refcount_set(&addr->refcnt, 1);
1228 
1229 	old_hash = sk->sk_hash;
1230 	ordernum = get_random_u32();
1231 	lastnum = ordernum & 0xFFFFF;
1232 retry:
1233 	ordernum = (ordernum + 1) & 0xFFFFF;
1234 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1235 
1236 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1237 	unix_table_double_lock(net, old_hash, new_hash);
1238 
1239 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1240 		unix_table_double_unlock(net, old_hash, new_hash);
1241 
1242 		/* __unix_find_socket_byname() may take long time if many names
1243 		 * are already in use.
1244 		 */
1245 		cond_resched();
1246 
1247 		if (ordernum == lastnum) {
1248 			/* Give up if all names seems to be in use. */
1249 			err = -ENOSPC;
1250 			unix_release_addr(addr);
1251 			goto out;
1252 		}
1253 
1254 		goto retry;
1255 	}
1256 
1257 	__unix_set_addr_hash(net, sk, addr, new_hash);
1258 	unix_table_double_unlock(net, old_hash, new_hash);
1259 	err = 0;
1260 
1261 out:	mutex_unlock(&u->bindlock);
1262 	return err;
1263 }
1264 
1265 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1266 			 int addr_len)
1267 {
1268 	umode_t mode = S_IFSOCK |
1269 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1270 	struct unix_sock *u = unix_sk(sk);
1271 	unsigned int new_hash, old_hash;
1272 	struct net *net = sock_net(sk);
1273 	struct mnt_idmap *idmap;
1274 	struct unix_address *addr;
1275 	struct dentry *dentry;
1276 	struct path parent;
1277 	int err;
1278 
1279 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1280 	addr = unix_create_addr(sunaddr, addr_len);
1281 	if (!addr)
1282 		return -ENOMEM;
1283 
1284 	/*
1285 	 * Get the parent directory, calculate the hash for last
1286 	 * component.
1287 	 */
1288 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1289 	if (IS_ERR(dentry)) {
1290 		err = PTR_ERR(dentry);
1291 		goto out;
1292 	}
1293 
1294 	/*
1295 	 * All right, let's create it.
1296 	 */
1297 	idmap = mnt_idmap(parent.mnt);
1298 	err = security_path_mknod(&parent, dentry, mode, 0);
1299 	if (!err)
1300 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1301 	if (err)
1302 		goto out_path;
1303 	err = mutex_lock_interruptible(&u->bindlock);
1304 	if (err)
1305 		goto out_unlink;
1306 	if (u->addr)
1307 		goto out_unlock;
1308 
1309 	old_hash = sk->sk_hash;
1310 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1311 	unix_table_double_lock(net, old_hash, new_hash);
1312 	u->path.mnt = mntget(parent.mnt);
1313 	u->path.dentry = dget(dentry);
1314 	__unix_set_addr_hash(net, sk, addr, new_hash);
1315 	unix_table_double_unlock(net, old_hash, new_hash);
1316 	unix_insert_bsd_socket(sk);
1317 	mutex_unlock(&u->bindlock);
1318 	done_path_create(&parent, dentry);
1319 	return 0;
1320 
1321 out_unlock:
1322 	mutex_unlock(&u->bindlock);
1323 	err = -EINVAL;
1324 out_unlink:
1325 	/* failed after successful mknod?  unlink what we'd created... */
1326 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1327 out_path:
1328 	done_path_create(&parent, dentry);
1329 out:
1330 	unix_release_addr(addr);
1331 	return err == -EEXIST ? -EADDRINUSE : err;
1332 }
1333 
1334 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1335 			      int addr_len)
1336 {
1337 	struct unix_sock *u = unix_sk(sk);
1338 	unsigned int new_hash, old_hash;
1339 	struct net *net = sock_net(sk);
1340 	struct unix_address *addr;
1341 	int err;
1342 
1343 	addr = unix_create_addr(sunaddr, addr_len);
1344 	if (!addr)
1345 		return -ENOMEM;
1346 
1347 	err = mutex_lock_interruptible(&u->bindlock);
1348 	if (err)
1349 		goto out;
1350 
1351 	if (u->addr) {
1352 		err = -EINVAL;
1353 		goto out_mutex;
1354 	}
1355 
1356 	old_hash = sk->sk_hash;
1357 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1358 	unix_table_double_lock(net, old_hash, new_hash);
1359 
1360 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1361 		goto out_spin;
1362 
1363 	__unix_set_addr_hash(net, sk, addr, new_hash);
1364 	unix_table_double_unlock(net, old_hash, new_hash);
1365 	mutex_unlock(&u->bindlock);
1366 	return 0;
1367 
1368 out_spin:
1369 	unix_table_double_unlock(net, old_hash, new_hash);
1370 	err = -EADDRINUSE;
1371 out_mutex:
1372 	mutex_unlock(&u->bindlock);
1373 out:
1374 	unix_release_addr(addr);
1375 	return err;
1376 }
1377 
1378 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1379 {
1380 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1381 	struct sock *sk = sock->sk;
1382 	int err;
1383 
1384 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1385 	    sunaddr->sun_family == AF_UNIX)
1386 		return unix_autobind(sk);
1387 
1388 	err = unix_validate_addr(sunaddr, addr_len);
1389 	if (err)
1390 		return err;
1391 
1392 	if (sunaddr->sun_path[0])
1393 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1394 	else
1395 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1396 
1397 	return err;
1398 }
1399 
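/* Take both state locks in ascending address order, matching
 * unix_state_lock_cmp_fn() above, so concurrent double-locks cannot deadlock.
 */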
1400 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1401 {
1402 	if (unlikely(sk1 == sk2) || !sk2) {
1403 		unix_state_lock(sk1);
1404 		return;
1405 	}
1406 
1407 	if (sk1 > sk2)
1408 		swap(sk1, sk2);
1409 
1410 	unix_state_lock(sk1);
1411 	unix_state_lock(sk2);
1412 }
1413 
1414 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1415 {
1416 	if (unlikely(sk1 == sk2) || !sk2) {
1417 		unix_state_unlock(sk1);
1418 		return;
1419 	}
1420 	unix_state_unlock(sk1);
1421 	unix_state_unlock(sk2);
1422 }
1423 
1424 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1425 			      int alen, int flags)
1426 {
1427 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1428 	struct sock *sk = sock->sk;
1429 	struct sock *other;
1430 	int err;
1431 
1432 	err = -EINVAL;
1433 	if (alen < offsetofend(struct sockaddr, sa_family))
1434 		goto out;
1435 
1436 	if (addr->sa_family != AF_UNSPEC) {
1437 		err = unix_validate_addr(sunaddr, alen);
1438 		if (err)
1439 			goto out;
1440 
1441 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1442 		if (err)
1443 			goto out;
1444 
1445 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1446 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1447 		    !READ_ONCE(unix_sk(sk)->addr)) {
1448 			err = unix_autobind(sk);
1449 			if (err)
1450 				goto out;
1451 		}
1452 
1453 restart:
1454 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1455 		if (IS_ERR(other)) {
1456 			err = PTR_ERR(other);
1457 			goto out;
1458 		}
1459 
1460 		unix_state_double_lock(sk, other);
1461 
1462 		/* Apparently VFS overslept socket death. Retry. */
1463 		if (sock_flag(other, SOCK_DEAD)) {
1464 			unix_state_double_unlock(sk, other);
1465 			sock_put(other);
1466 			goto restart;
1467 		}
1468 
1469 		err = -EPERM;
1470 		if (!unix_may_send(sk, other))
1471 			goto out_unlock;
1472 
1473 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1474 		if (err)
1475 			goto out_unlock;
1476 
1477 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1478 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1479 	} else {
1480 		/*
1481 		 *	1003.1g breaking connected state with AF_UNSPEC
1482 		 */
1483 		other = NULL;
1484 		unix_state_double_lock(sk, other);
1485 	}
1486 
1487 	/*
1488 	 * If it was connected, reconnect.
1489 	 */
1490 	if (unix_peer(sk)) {
1491 		struct sock *old_peer = unix_peer(sk);
1492 
1493 		unix_peer(sk) = other;
1494 		if (!other)
1495 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1496 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1497 
1498 		unix_state_double_unlock(sk, other);
1499 
1500 		if (other != old_peer) {
1501 			unix_dgram_disconnected(sk, old_peer);
1502 
1503 			unix_state_lock(old_peer);
1504 			if (!unix_peer(old_peer))
1505 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1506 			unix_state_unlock(old_peer);
1507 		}
1508 
1509 		sock_put(old_peer);
1510 	} else {
1511 		unix_peer(sk) = other;
1512 		unix_state_double_unlock(sk, other);
1513 	}
1514 
1515 	return 0;
1516 
1517 out_unlock:
1518 	unix_state_double_unlock(sk, other);
1519 	sock_put(other);
1520 out:
1521 	return err;
1522 }
1523 
1524 static long unix_wait_for_peer(struct sock *other, long timeo)
1525 	__releases(&unix_sk(other)->lock)
1526 {
1527 	struct unix_sock *u = unix_sk(other);
1528 	int sched;
1529 	DEFINE_WAIT(wait);
1530 
1531 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1532 
1533 	sched = !sock_flag(other, SOCK_DEAD) &&
1534 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1535 		unix_recvq_full_lockless(other);
1536 
1537 	unix_state_unlock(other);
1538 
1539 	if (sched)
1540 		timeo = schedule_timeout(timeo);
1541 
1542 	finish_wait(&u->peer_wait, &wait);
1543 	return timeo;
1544 }
1545 
1546 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1547 			       int addr_len, int flags)
1548 {
1549 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1550 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1551 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1552 	struct net *net = sock_net(sk);
1553 	struct sk_buff *skb = NULL;
1554 	unsigned char state;
1555 	long timeo;
1556 	int err;
1557 
1558 	err = unix_validate_addr(sunaddr, addr_len);
1559 	if (err)
1560 		goto out;
1561 
1562 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1563 	if (err)
1564 		goto out;
1565 
1566 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1567 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1568 	    !READ_ONCE(u->addr)) {
1569 		err = unix_autobind(sk);
1570 		if (err)
1571 			goto out;
1572 	}
1573 
1574 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1575 
1576 	/* First of all allocate resources.
1577 	   If we will make it after state is locked,
1578 	   we will have to recheck all again in any case.
1579 	 */
1580 
1581 	/* create new sock for complete connection */
1582 	newsk = unix_create1(net, NULL, 0, sock->type);
1583 	if (IS_ERR(newsk)) {
1584 		err = PTR_ERR(newsk);
1585 		newsk = NULL;
1586 		goto out;
1587 	}
1588 
1589 	err = -ENOMEM;
1590 
1591 	/* Allocate skb for sending to listening sock */
1592 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1593 	if (skb == NULL)
1594 		goto out;
1595 
1596 restart:
1597 	/*  Find listening sock. */
1598 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1599 	if (IS_ERR(other)) {
1600 		err = PTR_ERR(other);
1601 		other = NULL;
1602 		goto out;
1603 	}
1604 
1605 	unix_state_lock(other);
1606 
1607 	/* Apparently VFS overslept socket death. Retry. */
1608 	if (sock_flag(other, SOCK_DEAD)) {
1609 		unix_state_unlock(other);
1610 		sock_put(other);
1611 		goto restart;
1612 	}
1613 
1614 	err = -ECONNREFUSED;
1615 	if (other->sk_state != TCP_LISTEN)
1616 		goto out_unlock;
1617 	if (other->sk_shutdown & RCV_SHUTDOWN)
1618 		goto out_unlock;
1619 
1620 	if (unix_recvq_full_lockless(other)) {
1621 		err = -EAGAIN;
1622 		if (!timeo)
1623 			goto out_unlock;
1624 
1625 		timeo = unix_wait_for_peer(other, timeo);
1626 
1627 		err = sock_intr_errno(timeo);
1628 		if (signal_pending(current))
1629 			goto out;
1630 		sock_put(other);
1631 		goto restart;
1632 	}
1633 
1634 	/* self connect and simultaneous connect are eliminated
1635 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1636 	 */
1637 	state = READ_ONCE(sk->sk_state);
1638 	if (unlikely(state != TCP_CLOSE)) {
1639 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1640 		goto out_unlock;
1641 	}
1642 
1643 	unix_state_lock(sk);
1644 
1645 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1646 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1647 		unix_state_unlock(sk);
1648 		goto out_unlock;
1649 	}
1650 
1651 	err = security_unix_stream_connect(sk, other, newsk);
1652 	if (err) {
1653 		unix_state_unlock(sk);
1654 		goto out_unlock;
1655 	}
1656 
1657 	/* The way is open! Fastly set all the necessary fields... */
1658 
1659 	sock_hold(sk);
1660 	unix_peer(newsk)	= sk;
1661 	newsk->sk_state		= TCP_ESTABLISHED;
1662 	newsk->sk_type		= sk->sk_type;
1663 	init_peercred(newsk);
1664 	newu = unix_sk(newsk);
1665 	newu->listener = other;
1666 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1667 	otheru = unix_sk(other);
1668 
1669 	/* copy address information from listening to new sock
1670 	 *
1671 	 * The contents of *(otheru->addr) and otheru->path
1672 	 * are seen fully set up here, since we have found
1673 	 * otheru in hash under its lock.  Insertion into the
1674 	 * hash chain we'd found it in had been done in an
1675 	 * earlier critical area protected by the chain's lock,
1676 	 * the same one where we'd set *(otheru->addr) contents,
1677 	 * as well as otheru->path and otheru->addr itself.
1678 	 *
1679 	 * Using smp_store_release() here to set newu->addr
1680 	 * is enough to make those stores, as well as stores
1681 	 * to newu->path visible to anyone who gets newu->addr
1682 	 * by smp_load_acquire().  IOW, the same warranties
1683 	 * as for unix_sock instances bound in unix_bind() or
1684 	 * in unix_autobind().
1685 	 */
1686 	if (otheru->path.dentry) {
1687 		path_get(&otheru->path);
1688 		newu->path = otheru->path;
1689 	}
1690 	refcount_inc(&otheru->addr->refcnt);
1691 	smp_store_release(&newu->addr, otheru->addr);
1692 
1693 	/* Set credentials */
1694 	copy_peercred(sk, other);
1695 
1696 	sock->state	= SS_CONNECTED;
1697 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1698 	sock_hold(newsk);
1699 
1700 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1701 	unix_peer(sk)	= newsk;
1702 
1703 	unix_state_unlock(sk);
1704 
1705 	/* take ten and send info to listening sock */
1706 	spin_lock(&other->sk_receive_queue.lock);
1707 	__skb_queue_tail(&other->sk_receive_queue, skb);
1708 	spin_unlock(&other->sk_receive_queue.lock);
1709 	unix_state_unlock(other);
1710 	other->sk_data_ready(other);
1711 	sock_put(other);
1712 	return 0;
1713 
1714 out_unlock:
1715 	if (other)
1716 		unix_state_unlock(other);
1717 
1718 out:
1719 	kfree_skb(skb);
1720 	if (newsk)
1721 		unix_release_sock(newsk, 0);
1722 	if (other)
1723 		sock_put(other);
1724 	return err;
1725 }
1726 
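/* Illustrative note (user-space view, not from this file): this is the path
 * taken by e.g. socketpair(AF_UNIX, SOCK_STREAM, 0, sv), which yields two
 * already-connected sockets with peer credentials set on both ends.
 */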
1727 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1728 {
1729 	struct sock *ska = socka->sk, *skb = sockb->sk;
1730 
1731 	/* Join our sockets back to back */
1732 	sock_hold(ska);
1733 	sock_hold(skb);
1734 	unix_peer(ska) = skb;
1735 	unix_peer(skb) = ska;
1736 	init_peercred(ska);
1737 	init_peercred(skb);
1738 
1739 	ska->sk_state = TCP_ESTABLISHED;
1740 	skb->sk_state = TCP_ESTABLISHED;
1741 	socka->state  = SS_CONNECTED;
1742 	sockb->state  = SS_CONNECTED;
1743 	return 0;
1744 }
1745 
1746 static void unix_sock_inherit_flags(const struct socket *old,
1747 				    struct socket *new)
1748 {
1749 	if (test_bit(SOCK_PASSCRED, &old->flags))
1750 		set_bit(SOCK_PASSCRED, &new->flags);
1751 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1752 		set_bit(SOCK_PASSPIDFD, &new->flags);
1753 	if (test_bit(SOCK_PASSSEC, &old->flags))
1754 		set_bit(SOCK_PASSSEC, &new->flags);
1755 }
1756 
1757 static int unix_accept(struct socket *sock, struct socket *newsock,
1758 		       struct proto_accept_arg *arg)
1759 {
1760 	struct sock *sk = sock->sk;
1761 	struct sk_buff *skb;
1762 	struct sock *tsk;
1763 
1764 	arg->err = -EOPNOTSUPP;
1765 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1766 		goto out;
1767 
1768 	arg->err = -EINVAL;
1769 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1770 		goto out;
1771 
1772 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1773 	 * so that no locks are necessary.
1774 	 */
1775 
1776 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1777 				&arg->err);
1778 	if (!skb) {
1779 		/* This means receive shutdown. */
1780 		if (arg->err == 0)
1781 			arg->err = -EINVAL;
1782 		goto out;
1783 	}
1784 
1785 	tsk = skb->sk;
1786 	skb_free_datagram(sk, skb);
1787 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1788 
1789 	/* attach accepted sock to socket */
1790 	unix_state_lock(tsk);
1791 	unix_update_edges(unix_sk(tsk));
1792 	newsock->state = SS_CONNECTED;
1793 	unix_sock_inherit_flags(sock, newsock);
1794 	sock_graft(tsk, newsock);
1795 	unix_state_unlock(tsk);
1796 	return 0;
1797 
1798 out:
1799 	return arg->err;
1800 }
1801 
1802 
1803 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1804 {
1805 	struct sock *sk = sock->sk;
1806 	struct unix_address *addr;
1807 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1808 	int err = 0;
1809 
1810 	if (peer) {
1811 		sk = unix_peer_get(sk);
1812 
1813 		err = -ENOTCONN;
1814 		if (!sk)
1815 			goto out;
1816 		err = 0;
1817 	} else {
1818 		sock_hold(sk);
1819 	}
1820 
1821 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1822 	if (!addr) {
1823 		sunaddr->sun_family = AF_UNIX;
1824 		sunaddr->sun_path[0] = 0;
1825 		err = offsetof(struct sockaddr_un, sun_path);
1826 	} else {
1827 		err = addr->len;
1828 		memcpy(sunaddr, addr->name, addr->len);
1829 
1830 		if (peer)
1831 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1832 					       CGROUP_UNIX_GETPEERNAME);
1833 		else
1834 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1835 					       CGROUP_UNIX_GETSOCKNAME);
1836 	}
1837 	sock_put(sk);
1838 out:
1839 	return err;
1840 }
1841 
1842 /* The "user->unix_inflight" variable is protected by the garbage
1843  * collection lock, and we just read it locklessly here. If you go
1844  * over the limit, there might be a tiny race in actually noticing
1845  * it across threads. Tough.
1846  */
1847 static inline bool too_many_unix_fds(struct task_struct *p)
1848 {
1849 	struct user_struct *user = current_user();
1850 
1851 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1852 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1853 	return false;
1854 }
1855 
1856 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1857 {
1858 	if (too_many_unix_fds(current))
1859 		return -ETOOMANYREFS;
1860 
1861 	UNIXCB(skb).fp = scm->fp;
1862 	scm->fp = NULL;
1863 
1864 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1865 		return -ENOMEM;
1866 
1867 	return 0;
1868 }
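
/* A minimal userspace sketch of the SCM_RIGHTS path that ends up in
 * unix_attach_fds() (assumes the usual <sys/socket.h> and <string.h>):
 *
 *	static int send_fd(int sock, int fd)
 *	{
 *		char cbuf[CMSG_SPACE(sizeof(int))] = { 0 };
 *		char byte = 0;
 *		struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *		return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
 *	}
 */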
1869 
1870 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1871 {
1872 	scm->fp = UNIXCB(skb).fp;
1873 	UNIXCB(skb).fp = NULL;
1874 
1875 	unix_destroy_fpl(scm->fp);
1876 }
1877 
1878 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1879 {
1880 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1881 }
1882 
1883 static void unix_destruct_scm(struct sk_buff *skb)
1884 {
1885 	struct scm_cookie scm;
1886 
1887 	memset(&scm, 0, sizeof(scm));
1888 	scm.pid  = UNIXCB(skb).pid;
1889 	if (UNIXCB(skb).fp)
1890 		unix_detach_fds(&scm, skb);
1891 
1892 	/* Alas, it calls VFS */
1893 	/* So fscking what? fput() has been SMP-safe since last summer */
1894 	scm_destroy(&scm);
1895 	sock_wfree(skb);
1896 }
1897 
1898 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1899 {
1900 	int err = 0;
1901 
1902 	UNIXCB(skb).pid  = get_pid(scm->pid);
1903 	UNIXCB(skb).uid = scm->creds.uid;
1904 	UNIXCB(skb).gid = scm->creds.gid;
1905 	UNIXCB(skb).fp = NULL;
1906 	unix_get_secdata(scm, skb);
1907 	if (scm->fp && send_fds)
1908 		err = unix_attach_fds(scm, skb);
1909 
1910 	skb->destructor = unix_destruct_scm;
1911 	return err;
1912 }
1913 
1914 static bool unix_passcred_enabled(const struct socket *sock,
1915 				  const struct sock *other)
1916 {
1917 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1918 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1919 	       !other->sk_socket ||
1920 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1921 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1922 }
1923 
1924 /*
1925  * Some apps rely on write() giving SCM_CREDENTIALS.
1926  * We include credentials if the source or destination socket
1927  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1928  */
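/* A minimal userspace sketch of consuming those credentials (assumes
 * _GNU_SOURCE for struct ucred plus the usual socket headers):
 *
 *	int on = 1;
 *	struct ucred cred;
 *	char data[64], cbuf[CMSG_SPACE(sizeof(cred))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	if (recvmsg(fd, &msg, 0) >= 0)
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_CREDENTIALS)
 *				memcpy(&cred, CMSG_DATA(cmsg), sizeof(cred));
 */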
1929 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1930 			    const struct sock *other)
1931 {
1932 	if (UNIXCB(skb).pid)
1933 		return;
1934 	if (unix_passcred_enabled(sock, other)) {
1935 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1936 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1937 	}
1938 }
1939 
1940 static bool unix_skb_scm_eq(struct sk_buff *skb,
1941 			    struct scm_cookie *scm)
1942 {
1943 	return UNIXCB(skb).pid == scm->pid &&
1944 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1945 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1946 	       unix_secdata_eq(scm, skb);
1947 }
1948 
1949 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1950 {
1951 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1952 	struct unix_sock *u = unix_sk(sk);
1953 
1954 	if (unlikely(fp && fp->count)) {
1955 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1956 		unix_add_edges(fp, u);
1957 	}
1958 }
1959 
1960 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1961 {
1962 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1963 	struct unix_sock *u = unix_sk(sk);
1964 
1965 	if (unlikely(fp && fp->count)) {
1966 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1967 		unix_del_edges(fp);
1968 	}
1969 }
1970 
1971 /*
1972  *	Send AF_UNIX data.
1973  */
1974 
1975 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1976 			      size_t len)
1977 {
1978 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1979 	struct sock *sk = sock->sk, *other = NULL;
1980 	struct unix_sock *u = unix_sk(sk);
1981 	struct scm_cookie scm;
1982 	struct sk_buff *skb;
1983 	int data_len = 0;
1984 	int sk_locked;
1985 	long timeo;
1986 	int err;
1987 
1988 	err = scm_send(sock, msg, &scm, false);
1989 	if (err < 0)
1990 		return err;
1991 
1992 	wait_for_unix_gc(scm.fp);
1993 
1994 	err = -EOPNOTSUPP;
1995 	if (msg->msg_flags&MSG_OOB)
1996 		goto out;
1997 
1998 	if (msg->msg_namelen) {
1999 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
2000 		if (err)
2001 			goto out;
2002 
2003 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
2004 							    msg->msg_name,
2005 							    &msg->msg_namelen,
2006 							    NULL);
2007 		if (err)
2008 			goto out;
2009 	} else {
2010 		sunaddr = NULL;
2011 		err = -ENOTCONN;
2012 		other = unix_peer_get(sk);
2013 		if (!other)
2014 			goto out;
2015 	}
2016 
2017 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2018 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2019 	    !READ_ONCE(u->addr)) {
2020 		err = unix_autobind(sk);
2021 		if (err)
2022 			goto out;
2023 	}
2024 
2025 	err = -EMSGSIZE;
2026 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2027 		goto out;
2028 
2029 	if (len > SKB_MAX_ALLOC) {
2030 		data_len = min_t(size_t,
2031 				 len - SKB_MAX_ALLOC,
2032 				 MAX_SKB_FRAGS * PAGE_SIZE);
2033 		data_len = PAGE_ALIGN(data_len);
2034 
2035 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2036 	}
2037 
2038 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2039 				   msg->msg_flags & MSG_DONTWAIT, &err,
2040 				   PAGE_ALLOC_COSTLY_ORDER);
2041 	if (skb == NULL)
2042 		goto out;
2043 
2044 	err = unix_scm_to_skb(&scm, skb, true);
2045 	if (err < 0)
2046 		goto out_free;
2047 
2048 	skb_put(skb, len - data_len);
2049 	skb->data_len = data_len;
2050 	skb->len = len;
2051 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2052 	if (err)
2053 		goto out_free;
2054 
2055 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2056 
2057 restart:
2058 	if (!other) {
2059 		err = -ECONNRESET;
2060 		if (sunaddr == NULL)
2061 			goto out_free;
2062 
2063 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2064 					sk->sk_type);
2065 		if (IS_ERR(other)) {
2066 			err = PTR_ERR(other);
2067 			other = NULL;
2068 			goto out_free;
2069 		}
2070 	}
2071 
2072 	if (sk_filter(other, skb) < 0) {
2073 		/* Toss the packet but do not return any error to the sender */
2074 		err = len;
2075 		goto out_free;
2076 	}
2077 
2078 	sk_locked = 0;
2079 	unix_state_lock(other);
2080 restart_locked:
2081 	err = -EPERM;
2082 	if (!unix_may_send(sk, other))
2083 		goto out_unlock;
2084 
2085 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2086 		/*
2087 		 *	Check with 1003.1g - what should a
2088 		 *	datagram error return here?
2089 		 */
2090 		unix_state_unlock(other);
2091 		sock_put(other);
2092 
2093 		if (!sk_locked)
2094 			unix_state_lock(sk);
2095 
2096 		err = 0;
2097 		if (sk->sk_type == SOCK_SEQPACKET) {
2098 			/* We are here only when racing with unix_release_sock()
2099 			 * is clearing @other. Never change state to TCP_CLOSE
2100 			 * that is clearing @other. Never change the state to
2101 			 * TCP_CLOSE here, unlike the SOCK_DGRAM case.
2102 			unix_state_unlock(sk);
2103 			err = -EPIPE;
2104 		} else if (unix_peer(sk) == other) {
2105 			unix_peer(sk) = NULL;
2106 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2107 
2108 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2109 			unix_state_unlock(sk);
2110 
2111 			unix_dgram_disconnected(sk, other);
2112 			sock_put(other);
2113 			err = -ECONNREFUSED;
2114 		} else {
2115 			unix_state_unlock(sk);
2116 		}
2117 
2118 		other = NULL;
2119 		if (err)
2120 			goto out_free;
2121 		goto restart;
2122 	}
2123 
2124 	err = -EPIPE;
2125 	if (other->sk_shutdown & RCV_SHUTDOWN)
2126 		goto out_unlock;
2127 
2128 	if (sk->sk_type != SOCK_SEQPACKET) {
2129 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2130 		if (err)
2131 			goto out_unlock;
2132 	}
2133 
2134 	/* other == sk && unix_peer(other) != sk if
2135 	 * - unix_peer(sk) == NULL, destination address bound to sk
2136 	 * - unix_peer(sk) == sk at the time of lookup, but it disconnected before the lock
2137 	 */
2138 	if (other != sk &&
2139 	    unlikely(unix_peer(other) != sk &&
2140 	    unix_recvq_full_lockless(other))) {
2141 		if (timeo) {
2142 			timeo = unix_wait_for_peer(other, timeo);
2143 
2144 			err = sock_intr_errno(timeo);
2145 			if (signal_pending(current))
2146 				goto out_free;
2147 
2148 			goto restart;
2149 		}
2150 
2151 		if (!sk_locked) {
2152 			unix_state_unlock(other);
2153 			unix_state_double_lock(sk, other);
2154 		}
2155 
2156 		if (unix_peer(sk) != other ||
2157 		    unix_dgram_peer_wake_me(sk, other)) {
2158 			err = -EAGAIN;
2159 			sk_locked = 1;
2160 			goto out_unlock;
2161 		}
2162 
2163 		if (!sk_locked) {
2164 			sk_locked = 1;
2165 			goto restart_locked;
2166 		}
2167 	}
2168 
2169 	if (unlikely(sk_locked))
2170 		unix_state_unlock(sk);
2171 
2172 	if (sock_flag(other, SOCK_RCVTSTAMP))
2173 		__net_timestamp(skb);
2174 	maybe_add_creds(skb, sock, other);
2175 	scm_stat_add(other, skb);
2176 	skb_queue_tail(&other->sk_receive_queue, skb);
2177 	unix_state_unlock(other);
2178 	other->sk_data_ready(other);
2179 	sock_put(other);
2180 	scm_destroy(&scm);
2181 	return len;
2182 
2183 out_unlock:
2184 	if (sk_locked)
2185 		unix_state_unlock(sk);
2186 	unix_state_unlock(other);
2187 out_free:
2188 	kfree_skb(skb);
2189 out:
2190 	if (other)
2191 		sock_put(other);
2192 	scm_destroy(&scm);
2193 	return err;
2194 }
2195 
2196 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2197  * bytes, with a minimum of a full page.
2198  */
2199 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
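/* For example, with 4 KiB pages get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ
 * works out to 4096 << 3 == 32768 bytes.
 */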
2200 
2201 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2202 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2203 		     struct scm_cookie *scm, bool fds_sent)
2204 {
2205 	struct unix_sock *ousk = unix_sk(other);
2206 	struct sk_buff *skb;
2207 	int err = 0;
2208 
2209 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2210 
2211 	if (!skb)
2212 		return err;
2213 
2214 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2215 	if (err < 0) {
2216 		kfree_skb(skb);
2217 		return err;
2218 	}
2219 	skb_put(skb, 1);
2220 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2221 
2222 	if (err) {
2223 		kfree_skb(skb);
2224 		return err;
2225 	}
2226 
2227 	unix_state_lock(other);
2228 
2229 	if (sock_flag(other, SOCK_DEAD) ||
2230 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2231 		unix_state_unlock(other);
2232 		kfree_skb(skb);
2233 		return -EPIPE;
2234 	}
2235 
2236 	maybe_add_creds(skb, sock, other);
2237 	scm_stat_add(other, skb);
2238 
2239 	spin_lock(&other->sk_receive_queue.lock);
2240 	WRITE_ONCE(ousk->oob_skb, skb);
2241 	__skb_queue_tail(&other->sk_receive_queue, skb);
2242 	spin_unlock(&other->sk_receive_queue.lock);
2243 
2244 	sk_send_sigurg(other);
2245 	unix_state_unlock(other);
2246 	other->sk_data_ready(other);
2247 
2248 	return err;
2249 }
2250 #endif
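
/* Userspace view of the OOB machinery above (requires CONFIG_AF_UNIX_OOB;
 * illustrative only).  With MSG_OOB the last byte of the send becomes the
 * out-of-band byte and can be fetched ahead of the normal stream:
 *
 *	send(fd, "ab", 2, 0);
 *	send(fd, "c", 1, MSG_OOB);
 *
 *	char c;
 *	recv(peer, &c, 1, MSG_OOB);	-> returns 'c'
 *	ioctl(peer, SIOCATMARK, &at);	-> non-zero once the read pointer
 *					   reaches the mark (see unix_ioctl())
 */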
2251 
2252 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2253 			       size_t len)
2254 {
2255 	struct sock *sk = sock->sk;
2256 	struct sock *other = NULL;
2257 	int err, size;
2258 	struct sk_buff *skb;
2259 	int sent = 0;
2260 	struct scm_cookie scm;
2261 	bool fds_sent = false;
2262 	int data_len;
2263 
2264 	err = scm_send(sock, msg, &scm, false);
2265 	if (err < 0)
2266 		return err;
2267 
2268 	wait_for_unix_gc(scm.fp);
2269 
2270 	err = -EOPNOTSUPP;
2271 	if (msg->msg_flags & MSG_OOB) {
2272 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2273 		if (len)
2274 			len--;
2275 		else
2276 #endif
2277 			goto out_err;
2278 	}
2279 
2280 	if (msg->msg_namelen) {
2281 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2282 		goto out_err;
2283 	} else {
2284 		err = -ENOTCONN;
2285 		other = unix_peer(sk);
2286 		if (!other)
2287 			goto out_err;
2288 	}
2289 
2290 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2291 		goto pipe_err;
2292 
2293 	while (sent < len) {
2294 		size = len - sent;
2295 
2296 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2297 			skb = sock_alloc_send_pskb(sk, 0, 0,
2298 						   msg->msg_flags & MSG_DONTWAIT,
2299 						   &err, 0);
2300 		} else {
2301 			/* Keep two messages in the pipe so it schedules better */
2302 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2303 
2304 			/* allow fallback to order-0 allocations */
2305 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2306 
2307 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2308 
2309 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2310 
2311 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2312 						   msg->msg_flags & MSG_DONTWAIT, &err,
2313 						   get_order(UNIX_SKB_FRAGS_SZ));
2314 		}
2315 		if (!skb)
2316 			goto out_err;
2317 
2318 		/* Only send the fds in the first buffer */
2319 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2320 		if (err < 0) {
2321 			kfree_skb(skb);
2322 			goto out_err;
2323 		}
2324 		fds_sent = true;
2325 
2326 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2327 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2328 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2329 						   sk->sk_allocation);
2330 			if (err < 0) {
2331 				kfree_skb(skb);
2332 				goto out_err;
2333 			}
2334 			size = err;
2335 			refcount_add(size, &sk->sk_wmem_alloc);
2336 		} else {
2337 			skb_put(skb, size - data_len);
2338 			skb->data_len = data_len;
2339 			skb->len = size;
2340 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2341 			if (err) {
2342 				kfree_skb(skb);
2343 				goto out_err;
2344 			}
2345 		}
2346 
2347 		unix_state_lock(other);
2348 
2349 		if (sock_flag(other, SOCK_DEAD) ||
2350 		    (other->sk_shutdown & RCV_SHUTDOWN))
2351 			goto pipe_err_free;
2352 
2353 		maybe_add_creds(skb, sock, other);
2354 		scm_stat_add(other, skb);
2355 		skb_queue_tail(&other->sk_receive_queue, skb);
2356 		unix_state_unlock(other);
2357 		other->sk_data_ready(other);
2358 		sent += size;
2359 	}
2360 
2361 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2362 	if (msg->msg_flags & MSG_OOB) {
2363 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2364 		if (err)
2365 			goto out_err;
2366 		sent++;
2367 	}
2368 #endif
2369 
2370 	scm_destroy(&scm);
2371 
2372 	return sent;
2373 
2374 pipe_err_free:
2375 	unix_state_unlock(other);
2376 	kfree_skb(skb);
2377 pipe_err:
2378 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2379 		send_sig(SIGPIPE, current, 0);
2380 	err = -EPIPE;
2381 out_err:
2382 	scm_destroy(&scm);
2383 	return sent ? : err;
2384 }
2385 
2386 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2387 				  size_t len)
2388 {
2389 	int err;
2390 	struct sock *sk = sock->sk;
2391 
2392 	err = sock_error(sk);
2393 	if (err)
2394 		return err;
2395 
2396 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2397 		return -ENOTCONN;
2398 
2399 	if (msg->msg_namelen)
2400 		msg->msg_namelen = 0;
2401 
2402 	return unix_dgram_sendmsg(sock, msg, len);
2403 }
2404 
2405 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2406 				  size_t size, int flags)
2407 {
2408 	struct sock *sk = sock->sk;
2409 
2410 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2411 		return -ENOTCONN;
2412 
2413 	return unix_dgram_recvmsg(sock, msg, size, flags);
2414 }
2415 
2416 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2417 {
2418 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2419 
2420 	if (addr) {
2421 		msg->msg_namelen = addr->len;
2422 		memcpy(msg->msg_name, addr->name, addr->len);
2423 	}
2424 }
2425 
2426 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2427 			 int flags)
2428 {
2429 	struct scm_cookie scm;
2430 	struct socket *sock = sk->sk_socket;
2431 	struct unix_sock *u = unix_sk(sk);
2432 	struct sk_buff *skb, *last;
2433 	long timeo;
2434 	int skip;
2435 	int err;
2436 
2437 	err = -EOPNOTSUPP;
2438 	if (flags&MSG_OOB)
2439 		goto out;
2440 
2441 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2442 
2443 	do {
2444 		mutex_lock(&u->iolock);
2445 
2446 		skip = sk_peek_offset(sk, flags);
2447 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2448 					      &skip, &err, &last);
2449 		if (skb) {
2450 			if (!(flags & MSG_PEEK))
2451 				scm_stat_del(sk, skb);
2452 			break;
2453 		}
2454 
2455 		mutex_unlock(&u->iolock);
2456 
2457 		if (err != -EAGAIN)
2458 			break;
2459 	} while (timeo &&
2460 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2461 					      &err, &timeo, last));
2462 
2463 	if (!skb) { /* implies iolock unlocked */
2464 		unix_state_lock(sk);
2465 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2466 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2467 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2468 			err = 0;
2469 		unix_state_unlock(sk);
2470 		goto out;
2471 	}
2472 
2473 	if (wq_has_sleeper(&u->peer_wait))
2474 		wake_up_interruptible_sync_poll(&u->peer_wait,
2475 						EPOLLOUT | EPOLLWRNORM |
2476 						EPOLLWRBAND);
2477 
2478 	if (msg->msg_name) {
2479 		unix_copy_addr(msg, skb->sk);
2480 
2481 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2482 						      msg->msg_name,
2483 						      &msg->msg_namelen);
2484 	}
2485 
2486 	if (size > skb->len - skip)
2487 		size = skb->len - skip;
2488 	else if (size < skb->len - skip)
2489 		msg->msg_flags |= MSG_TRUNC;
2490 
2491 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2492 	if (err)
2493 		goto out_free;
2494 
2495 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2496 		__sock_recv_timestamp(msg, sk, skb);
2497 
2498 	memset(&scm, 0, sizeof(scm));
2499 
2500 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2501 	unix_set_secdata(&scm, skb);
2502 
2503 	if (!(flags & MSG_PEEK)) {
2504 		if (UNIXCB(skb).fp)
2505 			unix_detach_fds(&scm, skb);
2506 
2507 		sk_peek_offset_bwd(sk, skb->len);
2508 	} else {
2509 		/* It is questionable: on PEEK we could:
2510 		   - not return fds - good, but too simple 8)
2511 		   - return fds, and do not return them on read (old strategy,
2512 		     apparently wrong)
2513 		   - clone fds (I chose it for now, it is the most universal
2514 		     solution)
2515 
2516 		   POSIX 1003.1g does not actually define this clearly
2517 		   at all. POSIX 1003.1g doesn't define a lot of things
2518 		   clearly however!
2519 
2520 		*/
2521 
2522 		sk_peek_offset_fwd(sk, size);
2523 
2524 		if (UNIXCB(skb).fp)
2525 			unix_peek_fds(&scm, skb);
2526 	}
2527 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2528 
2529 	scm_recv_unix(sock, msg, &scm, flags);
2530 
2531 out_free:
2532 	skb_free_datagram(sk, skb);
2533 	mutex_unlock(&u->iolock);
2534 out:
2535 	return err;
2536 }
2537 
2538 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2539 			      int flags)
2540 {
2541 	struct sock *sk = sock->sk;
2542 
2543 #ifdef CONFIG_BPF_SYSCALL
2544 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2545 
2546 	if (prot != &unix_dgram_proto)
2547 		return prot->recvmsg(sk, msg, size, flags, NULL);
2548 #endif
2549 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2550 }
2551 
2552 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2553 {
2554 	struct unix_sock *u = unix_sk(sk);
2555 	struct sk_buff *skb;
2556 	int err;
2557 
2558 	mutex_lock(&u->iolock);
2559 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2560 	mutex_unlock(&u->iolock);
2561 	if (!skb)
2562 		return err;
2563 
2564 	return recv_actor(sk, skb);
2565 }
2566 
2567 /*
2568  *	Sleep until more data has arrived. But check for races..
2569  */
2570 static long unix_stream_data_wait(struct sock *sk, long timeo,
2571 				  struct sk_buff *last, unsigned int last_len,
2572 				  bool freezable)
2573 {
2574 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2575 	struct sk_buff *tail;
2576 	DEFINE_WAIT(wait);
2577 
2578 	unix_state_lock(sk);
2579 
2580 	for (;;) {
2581 		prepare_to_wait(sk_sleep(sk), &wait, state);
2582 
2583 		tail = skb_peek_tail(&sk->sk_receive_queue);
2584 		if (tail != last ||
2585 		    (tail && tail->len != last_len) ||
2586 		    sk->sk_err ||
2587 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2588 		    signal_pending(current) ||
2589 		    !timeo)
2590 			break;
2591 
2592 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2593 		unix_state_unlock(sk);
2594 		timeo = schedule_timeout(timeo);
2595 		unix_state_lock(sk);
2596 
2597 		if (sock_flag(sk, SOCK_DEAD))
2598 			break;
2599 
2600 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2601 	}
2602 
2603 	finish_wait(sk_sleep(sk), &wait);
2604 	unix_state_unlock(sk);
2605 	return timeo;
2606 }
2607 
2608 struct unix_stream_read_state {
2609 	int (*recv_actor)(struct sk_buff *, int, int,
2610 			  struct unix_stream_read_state *);
2611 	struct socket *socket;
2612 	struct msghdr *msg;
2613 	struct pipe_inode_info *pipe;
2614 	size_t size;
2615 	int flags;
2616 	unsigned int splice_flags;
2617 };
2618 
2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2621 {
2622 	struct sk_buff *oob_skb, *read_skb = NULL;
2623 	struct socket *sock = state->socket;
2624 	struct sock *sk = sock->sk;
2625 	struct unix_sock *u = unix_sk(sk);
2626 	int chunk = 1;
2627 
2628 	mutex_lock(&u->iolock);
2629 	unix_state_lock(sk);
2630 	spin_lock(&sk->sk_receive_queue.lock);
2631 
2632 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2633 		spin_unlock(&sk->sk_receive_queue.lock);
2634 		unix_state_unlock(sk);
2635 		mutex_unlock(&u->iolock);
2636 		return -EINVAL;
2637 	}
2638 
2639 	oob_skb = u->oob_skb;
2640 
2641 	if (!(state->flags & MSG_PEEK)) {
2642 		WRITE_ONCE(u->oob_skb, NULL);
2643 
2644 		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
2645 		    !unix_skb_len(oob_skb->prev)) {
2646 			read_skb = oob_skb->prev;
2647 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2648 		}
2649 	}
2650 
2651 	spin_unlock(&sk->sk_receive_queue.lock);
2652 	unix_state_unlock(sk);
2653 
2654 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2655 
2656 	if (!(state->flags & MSG_PEEK))
2657 		UNIXCB(oob_skb).consumed += 1;
2658 
2659 	mutex_unlock(&u->iolock);
2660 
2661 	consume_skb(read_skb);
2662 
2663 	if (chunk < 0)
2664 		return -EFAULT;
2665 
2666 	state->msg->msg_flags |= MSG_OOB;
2667 	return 1;
2668 }
2669 
2670 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2671 				  int flags, int copied)
2672 {
2673 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2674 	struct unix_sock *u = unix_sk(sk);
2675 
2676 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2677 		return skb;
2678 
2679 	spin_lock(&sk->sk_receive_queue.lock);
2680 
2681 	if (!unix_skb_len(skb)) {
2682 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2683 			skb = NULL;
2684 		} else if (flags & MSG_PEEK) {
2685 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2686 		} else {
2687 			read_skb = skb;
2688 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2689 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2690 		}
2691 
2692 		if (!skb)
2693 			goto unlock;
2694 	}
2695 
2696 	if (skb != u->oob_skb)
2697 		goto unlock;
2698 
2699 	if (copied) {
2700 		skb = NULL;
2701 	} else if (!(flags & MSG_PEEK)) {
2702 		WRITE_ONCE(u->oob_skb, NULL);
2703 
2704 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2705 			__skb_unlink(skb, &sk->sk_receive_queue);
2706 			unread_skb = skb;
2707 			skb = skb_peek(&sk->sk_receive_queue);
2708 		}
2709 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2710 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2711 	}
2712 
2713 unlock:
2714 	spin_unlock(&sk->sk_receive_queue.lock);
2715 
2716 	consume_skb(read_skb);
2717 	kfree_skb(unread_skb);
2718 
2719 	return skb;
2720 }
2721 #endif
2722 
2723 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2724 {
2725 	struct unix_sock *u = unix_sk(sk);
2726 	struct sk_buff *skb;
2727 	int err;
2728 
2729 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2730 		return -ENOTCONN;
2731 
2732 	mutex_lock(&u->iolock);
2733 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2734 	mutex_unlock(&u->iolock);
2735 	if (!skb)
2736 		return err;
2737 
2738 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2739 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2740 		bool drop = false;
2741 
2742 		unix_state_lock(sk);
2743 
2744 		if (sock_flag(sk, SOCK_DEAD)) {
2745 			unix_state_unlock(sk);
2746 			kfree_skb(skb);
2747 			return -ECONNRESET;
2748 		}
2749 
2750 		spin_lock(&sk->sk_receive_queue.lock);
2751 		if (likely(skb == u->oob_skb)) {
2752 			WRITE_ONCE(u->oob_skb, NULL);
2753 			drop = true;
2754 		}
2755 		spin_unlock(&sk->sk_receive_queue.lock);
2756 
2757 		unix_state_unlock(sk);
2758 
2759 		if (drop) {
2760 			kfree_skb(skb);
2761 			return -EAGAIN;
2762 		}
2763 	}
2764 #endif
2765 
2766 	return recv_actor(sk, skb);
2767 }
2768 
2769 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2770 				    bool freezable)
2771 {
2772 	struct scm_cookie scm;
2773 	struct socket *sock = state->socket;
2774 	struct sock *sk = sock->sk;
2775 	struct unix_sock *u = unix_sk(sk);
2776 	int copied = 0;
2777 	int flags = state->flags;
2778 	int noblock = flags & MSG_DONTWAIT;
2779 	bool check_creds = false;
2780 	int target;
2781 	int err = 0;
2782 	long timeo;
2783 	int skip;
2784 	size_t size = state->size;
2785 	unsigned int last_len;
2786 
2787 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2788 		err = -EINVAL;
2789 		goto out;
2790 	}
2791 
2792 	if (unlikely(flags & MSG_OOB)) {
2793 		err = -EOPNOTSUPP;
2794 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2795 		err = unix_stream_recv_urg(state);
2796 #endif
2797 		goto out;
2798 	}
2799 
2800 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2801 	timeo = sock_rcvtimeo(sk, noblock);
2802 
2803 	memset(&scm, 0, sizeof(scm));
2804 
2805 	/* Lock the socket to prevent queue disordering
2806 	 * while we sleep in memcpy_to_msg().
2807 	 */
2808 	mutex_lock(&u->iolock);
2809 
2810 	skip = max(sk_peek_offset(sk, flags), 0);
2811 
2812 	do {
2813 		struct sk_buff *skb, *last;
2814 		int chunk;
2815 
2816 redo:
2817 		unix_state_lock(sk);
2818 		if (sock_flag(sk, SOCK_DEAD)) {
2819 			err = -ECONNRESET;
2820 			goto unlock;
2821 		}
2822 		last = skb = skb_peek(&sk->sk_receive_queue);
2823 		last_len = last ? last->len : 0;
2824 
2825 again:
2826 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2827 		if (skb) {
2828 			skb = manage_oob(skb, sk, flags, copied);
2829 			if (!skb && copied) {
2830 				unix_state_unlock(sk);
2831 				break;
2832 			}
2833 		}
2834 #endif
2835 		if (skb == NULL) {
2836 			if (copied >= target)
2837 				goto unlock;
2838 
2839 			/*
2840 			 *	POSIX 1003.1g mandates this order.
2841 			 */
2842 
2843 			err = sock_error(sk);
2844 			if (err)
2845 				goto unlock;
2846 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2847 				goto unlock;
2848 
2849 			unix_state_unlock(sk);
2850 			if (!timeo) {
2851 				err = -EAGAIN;
2852 				break;
2853 			}
2854 
2855 			mutex_unlock(&u->iolock);
2856 
2857 			timeo = unix_stream_data_wait(sk, timeo, last,
2858 						      last_len, freezable);
2859 
2860 			if (signal_pending(current)) {
2861 				err = sock_intr_errno(timeo);
2862 				scm_destroy(&scm);
2863 				goto out;
2864 			}
2865 
2866 			mutex_lock(&u->iolock);
2867 			goto redo;
2868 unlock:
2869 			unix_state_unlock(sk);
2870 			break;
2871 		}
2872 
2873 		while (skip >= unix_skb_len(skb)) {
2874 			skip -= unix_skb_len(skb);
2875 			last = skb;
2876 			last_len = skb->len;
2877 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2878 			if (!skb)
2879 				goto again;
2880 		}
2881 
2882 		unix_state_unlock(sk);
2883 
2884 		if (check_creds) {
2885 			/* Never glue messages from different writers */
2886 			if (!unix_skb_scm_eq(skb, &scm))
2887 				break;
2888 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2889 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2890 			/* Copy credentials */
2891 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2892 			unix_set_secdata(&scm, skb);
2893 			check_creds = true;
2894 		}
2895 
2896 		/* Copy address just once */
2897 		if (state->msg && state->msg->msg_name) {
2898 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2899 					 state->msg->msg_name);
2900 			unix_copy_addr(state->msg, skb->sk);
2901 
2902 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2903 							      state->msg->msg_name,
2904 							      &state->msg->msg_namelen);
2905 
2906 			sunaddr = NULL;
2907 		}
2908 
2909 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2910 		chunk = state->recv_actor(skb, skip, chunk, state);
2911 		if (chunk < 0) {
2912 			if (copied == 0)
2913 				copied = -EFAULT;
2914 			break;
2915 		}
2916 		copied += chunk;
2917 		size -= chunk;
2918 
2919 		/* Mark read part of skb as used */
2920 		if (!(flags & MSG_PEEK)) {
2921 			UNIXCB(skb).consumed += chunk;
2922 
2923 			sk_peek_offset_bwd(sk, chunk);
2924 
2925 			if (UNIXCB(skb).fp) {
2926 				scm_stat_del(sk, skb);
2927 				unix_detach_fds(&scm, skb);
2928 			}
2929 
2930 			if (unix_skb_len(skb))
2931 				break;
2932 
2933 			skb_unlink(skb, &sk->sk_receive_queue);
2934 			consume_skb(skb);
2935 
2936 			if (scm.fp)
2937 				break;
2938 		} else {
2939 			/* It is questionable, see note in unix_dgram_recvmsg.
2940 			 */
2941 			if (UNIXCB(skb).fp)
2942 				unix_peek_fds(&scm, skb);
2943 
2944 			sk_peek_offset_fwd(sk, chunk);
2945 
2946 			if (UNIXCB(skb).fp)
2947 				break;
2948 
2949 			skip = 0;
2950 			last = skb;
2951 			last_len = skb->len;
2952 			unix_state_lock(sk);
2953 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2954 			if (skb)
2955 				goto again;
2956 			unix_state_unlock(sk);
2957 			break;
2958 		}
2959 	} while (size);
2960 
2961 	mutex_unlock(&u->iolock);
2962 	if (state->msg)
2963 		scm_recv_unix(sock, state->msg, &scm, flags);
2964 	else
2965 		scm_destroy(&scm);
2966 out:
2967 	return copied ? : err;
2968 }
2969 
2970 static int unix_stream_read_actor(struct sk_buff *skb,
2971 				  int skip, int chunk,
2972 				  struct unix_stream_read_state *state)
2973 {
2974 	int ret;
2975 
2976 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2977 				    state->msg, chunk);
2978 	return ret ?: chunk;
2979 }
2980 
2981 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2982 			  size_t size, int flags)
2983 {
2984 	struct unix_stream_read_state state = {
2985 		.recv_actor = unix_stream_read_actor,
2986 		.socket = sk->sk_socket,
2987 		.msg = msg,
2988 		.size = size,
2989 		.flags = flags
2990 	};
2991 
2992 	return unix_stream_read_generic(&state, true);
2993 }
2994 
2995 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2996 			       size_t size, int flags)
2997 {
2998 	struct unix_stream_read_state state = {
2999 		.recv_actor = unix_stream_read_actor,
3000 		.socket = sock,
3001 		.msg = msg,
3002 		.size = size,
3003 		.flags = flags
3004 	};
3005 
3006 #ifdef CONFIG_BPF_SYSCALL
3007 	struct sock *sk = sock->sk;
3008 	const struct proto *prot = READ_ONCE(sk->sk_prot);
3009 
3010 	if (prot != &unix_stream_proto)
3011 		return prot->recvmsg(sk, msg, size, flags, NULL);
3012 #endif
3013 	return unix_stream_read_generic(&state, true);
3014 }
3015 
3016 static int unix_stream_splice_actor(struct sk_buff *skb,
3017 				    int skip, int chunk,
3018 				    struct unix_stream_read_state *state)
3019 {
3020 	return skb_splice_bits(skb, state->socket->sk,
3021 			       UNIXCB(skb).consumed + skip,
3022 			       state->pipe, chunk, state->splice_flags);
3023 }
3024 
3025 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
3026 				       struct pipe_inode_info *pipe,
3027 				       size_t size, unsigned int flags)
3028 {
3029 	struct unix_stream_read_state state = {
3030 		.recv_actor = unix_stream_splice_actor,
3031 		.socket = sock,
3032 		.pipe = pipe,
3033 		.size = size,
3034 		.splice_flags = flags,
3035 	};
3036 
3037 	if (unlikely(*ppos))
3038 		return -ESPIPE;
3039 
3040 	if (sock->file->f_flags & O_NONBLOCK ||
3041 	    flags & SPLICE_F_NONBLOCK)
3042 		state.flags = MSG_DONTWAIT;
3043 
3044 	return unix_stream_read_generic(&state, false);
3045 }
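
/* Userspace sketch of exercising this path (illustrative): queued stream
 * data can be spliced from a connected AF_UNIX socket into a pipe without
 * a round trip through a userspace buffer:
 *
 *	int pfd[2];
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	n = splice(sock_fd, NULL, pfd[1], NULL, 4096, SPLICE_F_NONBLOCK);
 */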
3046 
3047 static int unix_shutdown(struct socket *sock, int mode)
3048 {
3049 	struct sock *sk = sock->sk;
3050 	struct sock *other;
3051 
3052 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3053 		return -EINVAL;
3054 	/* This maps:
3055 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3056 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3057 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3058 	 */
3059 	++mode;
3060 
3061 	unix_state_lock(sk);
3062 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3063 	other = unix_peer(sk);
3064 	if (other)
3065 		sock_hold(other);
3066 	unix_state_unlock(sk);
3067 	sk->sk_state_change(sk);
3068 
3069 	if (other &&
3070 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3071 
3072 		int peer_mode = 0;
3073 		const struct proto *prot = READ_ONCE(other->sk_prot);
3074 
3075 		if (prot->unhash)
3076 			prot->unhash(other);
3077 		if (mode&RCV_SHUTDOWN)
3078 			peer_mode |= SEND_SHUTDOWN;
3079 		if (mode&SEND_SHUTDOWN)
3080 			peer_mode |= RCV_SHUTDOWN;
3081 		unix_state_lock(other);
3082 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3083 		unix_state_unlock(other);
3084 		other->sk_state_change(other);
3085 		if (peer_mode == SHUTDOWN_MASK)
3086 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3087 		else if (peer_mode & RCV_SHUTDOWN)
3088 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3089 	}
3090 	if (other)
3091 		sock_put(other);
3092 
3093 	return 0;
3094 }
3095 
3096 long unix_inq_len(struct sock *sk)
3097 {
3098 	struct sk_buff *skb;
3099 	long amount = 0;
3100 
3101 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3102 		return -EINVAL;
3103 
3104 	spin_lock(&sk->sk_receive_queue.lock);
3105 	if (sk->sk_type == SOCK_STREAM ||
3106 	    sk->sk_type == SOCK_SEQPACKET) {
3107 		skb_queue_walk(&sk->sk_receive_queue, skb)
3108 			amount += unix_skb_len(skb);
3109 	} else {
3110 		skb = skb_peek(&sk->sk_receive_queue);
3111 		if (skb)
3112 			amount = skb->len;
3113 	}
3114 	spin_unlock(&sk->sk_receive_queue.lock);
3115 
3116 	return amount;
3117 }
3118 EXPORT_SYMBOL_GPL(unix_inq_len);
3119 
3120 long unix_outq_len(struct sock *sk)
3121 {
3122 	return sk_wmem_alloc_get(sk);
3123 }
3124 EXPORT_SYMBOL_GPL(unix_outq_len);
3125 
3126 static int unix_open_file(struct sock *sk)
3127 {
3128 	struct path path;
3129 	struct file *f;
3130 	int fd;
3131 
3132 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3133 		return -EPERM;
3134 
3135 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3136 		return -ENOENT;
3137 
3138 	path = unix_sk(sk)->path;
3139 	if (!path.dentry)
3140 		return -ENOENT;
3141 
3142 	path_get(&path);
3143 
3144 	fd = get_unused_fd_flags(O_CLOEXEC);
3145 	if (fd < 0)
3146 		goto out;
3147 
3148 	f = dentry_open(&path, O_PATH, current_cred());
3149 	if (IS_ERR(f)) {
3150 		put_unused_fd(fd);
3151 		fd = PTR_ERR(f);
3152 		goto out;
3153 	}
3154 
3155 	fd_install(fd, f);
3156 out:
3157 	path_put(&path);
3158 
3159 	return fd;
3160 }
3161 
3162 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3163 {
3164 	struct sock *sk = sock->sk;
3165 	long amount = 0;
3166 	int err;
3167 
3168 	switch (cmd) {
3169 	case SIOCOUTQ:
3170 		amount = unix_outq_len(sk);
3171 		err = put_user(amount, (int __user *)arg);
3172 		break;
3173 	case SIOCINQ:
3174 		amount = unix_inq_len(sk);
3175 		if (amount < 0)
3176 			err = amount;
3177 		else
3178 			err = put_user(amount, (int __user *)arg);
3179 		break;
3180 	case SIOCUNIXFILE:
3181 		err = unix_open_file(sk);
3182 		break;
3183 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3184 	case SIOCATMARK:
3185 		{
3186 			struct unix_sock *u = unix_sk(sk);
3187 			struct sk_buff *skb;
3188 			int answ = 0;
3189 
3190 			mutex_lock(&u->iolock);
3191 
3192 			skb = skb_peek(&sk->sk_receive_queue);
3193 			if (skb) {
3194 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3195 				struct sk_buff *next_skb;
3196 
3197 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3198 
3199 				if (skb == oob_skb ||
3200 				    (!unix_skb_len(skb) &&
3201 				     (!oob_skb || next_skb == oob_skb)))
3202 					answ = 1;
3203 			}
3204 
3205 			mutex_unlock(&u->iolock);
3206 
3207 			err = put_user(answ, (int __user *)arg);
3208 		}
3209 		break;
3210 #endif
3211 	default:
3212 		err = -ENOIOCTLCMD;
3213 		break;
3214 	}
3215 	return err;
3216 }
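
/* Illustrative userspace use of the ioctls handled above:
 *
 *	int queued, pending, at_mark;
 *
 *	ioctl(fd, SIOCINQ, &queued);	- bytes waiting in the receive queue
 *	ioctl(fd, SIOCOUTQ, &pending);	- sent bytes not yet consumed by the peer
 *	ioctl(fd, SIOCATMARK, &at_mark);- non-zero when at the OOB mark
 */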
3217 
3218 #ifdef CONFIG_COMPAT
3219 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3220 {
3221 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3222 }
3223 #endif
3224 
3225 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3226 {
3227 	struct sock *sk = sock->sk;
3228 	unsigned char state;
3229 	__poll_t mask;
3230 	u8 shutdown;
3231 
3232 	sock_poll_wait(file, sock, wait);
3233 	mask = 0;
3234 	shutdown = READ_ONCE(sk->sk_shutdown);
3235 	state = READ_ONCE(sk->sk_state);
3236 
3237 	/* exceptional events? */
3238 	if (READ_ONCE(sk->sk_err))
3239 		mask |= EPOLLERR;
3240 	if (shutdown == SHUTDOWN_MASK)
3241 		mask |= EPOLLHUP;
3242 	if (shutdown & RCV_SHUTDOWN)
3243 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3244 
3245 	/* readable? */
3246 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3247 		mask |= EPOLLIN | EPOLLRDNORM;
3248 	if (sk_is_readable(sk))
3249 		mask |= EPOLLIN | EPOLLRDNORM;
3250 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3251 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3252 		mask |= EPOLLPRI;
3253 #endif
3254 
3255 	/* Connection-based sockets need to check for termination and startup */
3256 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3257 	    state == TCP_CLOSE)
3258 		mask |= EPOLLHUP;
3259 
3260 	/*
3261 	 * We also report the socket as writable when the other side has shut
3262 	 * down the connection. This prevents sockets from getting stuck.
3263 	 */
3264 	if (unix_writable(sk, state))
3265 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3266 
3267 	return mask;
3268 }
3269 
3270 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3271 				    poll_table *wait)
3272 {
3273 	struct sock *sk = sock->sk, *other;
3274 	unsigned int writable;
3275 	unsigned char state;
3276 	__poll_t mask;
3277 	u8 shutdown;
3278 
3279 	sock_poll_wait(file, sock, wait);
3280 	mask = 0;
3281 	shutdown = READ_ONCE(sk->sk_shutdown);
3282 	state = READ_ONCE(sk->sk_state);
3283 
3284 	/* exceptional events? */
3285 	if (READ_ONCE(sk->sk_err) ||
3286 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3287 		mask |= EPOLLERR |
3288 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3289 
3290 	if (shutdown & RCV_SHUTDOWN)
3291 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3292 	if (shutdown == SHUTDOWN_MASK)
3293 		mask |= EPOLLHUP;
3294 
3295 	/* readable? */
3296 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3297 		mask |= EPOLLIN | EPOLLRDNORM;
3298 	if (sk_is_readable(sk))
3299 		mask |= EPOLLIN | EPOLLRDNORM;
3300 
3301 	/* Connection-based sockets need to check for termination and startup */
3302 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3303 		mask |= EPOLLHUP;
3304 
3305 	/* No write status requested, avoid expensive OUT tests. */
3306 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3307 		return mask;
3308 
3309 	writable = unix_writable(sk, state);
3310 	if (writable) {
3311 		unix_state_lock(sk);
3312 
3313 		other = unix_peer(sk);
3314 		if (other && unix_peer(other) != sk &&
3315 		    unix_recvq_full_lockless(other) &&
3316 		    unix_dgram_peer_wake_me(sk, other))
3317 			writable = 0;
3318 
3319 		unix_state_unlock(sk);
3320 	}
3321 
3322 	if (writable)
3323 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3324 	else
3325 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3326 
3327 	return mask;
3328 }
3329 
3330 #ifdef CONFIG_PROC_FS
3331 
3332 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3333 
3334 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3335 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3336 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
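/* The seq_file position thus packs a (bucket, offset) pair into one long:
 * e.g. set_bucket_offset(2, 5) yields (2UL << BUCKET_SPACE) | 5, from which
 * get_bucket() recovers 2 and get_offset() recovers 5.
 */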
3337 
3338 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3339 {
3340 	unsigned long offset = get_offset(*pos);
3341 	unsigned long bucket = get_bucket(*pos);
3342 	unsigned long count = 0;
3343 	struct sock *sk;
3344 
3345 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3346 	     sk; sk = sk_next(sk)) {
3347 		if (++count == offset)
3348 			break;
3349 	}
3350 
3351 	return sk;
3352 }
3353 
3354 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3355 {
3356 	unsigned long bucket = get_bucket(*pos);
3357 	struct net *net = seq_file_net(seq);
3358 	struct sock *sk;
3359 
3360 	while (bucket < UNIX_HASH_SIZE) {
3361 		spin_lock(&net->unx.table.locks[bucket]);
3362 
3363 		sk = unix_from_bucket(seq, pos);
3364 		if (sk)
3365 			return sk;
3366 
3367 		spin_unlock(&net->unx.table.locks[bucket]);
3368 
3369 		*pos = set_bucket_offset(++bucket, 1);
3370 	}
3371 
3372 	return NULL;
3373 }
3374 
3375 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3376 				  loff_t *pos)
3377 {
3378 	unsigned long bucket = get_bucket(*pos);
3379 
3380 	sk = sk_next(sk);
3381 	if (sk)
3382 		return sk;
3383 
3384 
3385 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3386 
3387 	*pos = set_bucket_offset(++bucket, 1);
3388 
3389 	return unix_get_first(seq, pos);
3390 }
3391 
3392 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3393 {
3394 	if (!*pos)
3395 		return SEQ_START_TOKEN;
3396 
3397 	return unix_get_first(seq, pos);
3398 }
3399 
3400 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3401 {
3402 	++*pos;
3403 
3404 	if (v == SEQ_START_TOKEN)
3405 		return unix_get_first(seq, pos);
3406 
3407 	return unix_get_next(seq, v, pos);
3408 }
3409 
3410 static void unix_seq_stop(struct seq_file *seq, void *v)
3411 {
3412 	struct sock *sk = v;
3413 
3414 	if (sk)
3415 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3416 }
3417 
3418 static int unix_seq_show(struct seq_file *seq, void *v)
3419 {
3420 
3421 	if (v == SEQ_START_TOKEN)
3422 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3423 			 "Inode Path\n");
3424 	else {
3425 		struct sock *s = v;
3426 		struct unix_sock *u = unix_sk(s);
3427 		unix_state_lock(s);
3428 
3429 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3430 			s,
3431 			refcount_read(&s->sk_refcnt),
3432 			0,
3433 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3434 			s->sk_type,
3435 			s->sk_socket ?
3436 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3437 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3438 			sock_i_ino(s));
3439 
3440 		if (u->addr) {	/* under a hash table lock here */
3441 			int i, len;
3442 			seq_putc(seq, ' ');
3443 
3444 			i = 0;
3445 			len = u->addr->len -
3446 				offsetof(struct sockaddr_un, sun_path);
3447 			if (u->addr->name->sun_path[0]) {
3448 				len--;
3449 			} else {
3450 				seq_putc(seq, '@');
3451 				i++;
3452 			}
3453 			for ( ; i < len; i++)
3454 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3455 					 '@');
3456 		}
3457 		unix_state_unlock(s);
3458 		seq_putc(seq, '\n');
3459 	}
3460 
3461 	return 0;
3462 }
3463 
3464 static const struct seq_operations unix_seq_ops = {
3465 	.start  = unix_seq_start,
3466 	.next   = unix_seq_next,
3467 	.stop   = unix_seq_stop,
3468 	.show   = unix_seq_show,
3469 };
3470 
3471 #ifdef CONFIG_BPF_SYSCALL
3472 struct bpf_unix_iter_state {
3473 	struct seq_net_private p;
3474 	unsigned int cur_sk;
3475 	unsigned int end_sk;
3476 	unsigned int max_sk;
3477 	struct sock **batch;
3478 	bool st_bucket_done;
3479 };
3480 
3481 struct bpf_iter__unix {
3482 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3483 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3484 	uid_t uid __aligned(8);
3485 };
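
/* A minimal sketch of a BPF program attaching to this iterator (modelled on
 * the kernel selftests; SEC() and BPF_SEQ_PRINTF() are the usual libbpf
 * helpers):
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		struct unix_sock *unix_sk = ctx->unix_sk;
 *
 *		if (!unix_sk)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "state=%u uid=%u\n",
 *			       unix_sk->sk.sk_state, ctx->uid);
 *		return 0;
 *	}
 */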
3486 
3487 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3488 			      struct unix_sock *unix_sk, uid_t uid)
3489 {
3490 	struct bpf_iter__unix ctx;
3491 
3492 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3493 	ctx.meta = meta;
3494 	ctx.unix_sk = unix_sk;
3495 	ctx.uid = uid;
3496 	return bpf_iter_run_prog(prog, &ctx);
3497 }
3498 
3499 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3500 
3501 {
3502 	struct bpf_unix_iter_state *iter = seq->private;
3503 	unsigned int expected = 1;
3504 	struct sock *sk;
3505 
3506 	sock_hold(start_sk);
3507 	iter->batch[iter->end_sk++] = start_sk;
3508 
3509 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3510 		if (iter->end_sk < iter->max_sk) {
3511 			sock_hold(sk);
3512 			iter->batch[iter->end_sk++] = sk;
3513 		}
3514 
3515 		expected++;
3516 	}
3517 
3518 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3519 
3520 	return expected;
3521 }
3522 
3523 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3524 {
3525 	while (iter->cur_sk < iter->end_sk)
3526 		sock_put(iter->batch[iter->cur_sk++]);
3527 }
3528 
3529 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3530 				       unsigned int new_batch_sz)
3531 {
3532 	struct sock **new_batch;
3533 
3534 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3535 			     GFP_USER | __GFP_NOWARN);
3536 	if (!new_batch)
3537 		return -ENOMEM;
3538 
3539 	bpf_iter_unix_put_batch(iter);
3540 	kvfree(iter->batch);
3541 	iter->batch = new_batch;
3542 	iter->max_sk = new_batch_sz;
3543 
3544 	return 0;
3545 }
3546 
3547 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3548 					loff_t *pos)
3549 {
3550 	struct bpf_unix_iter_state *iter = seq->private;
3551 	unsigned int expected;
3552 	bool resized = false;
3553 	struct sock *sk;
3554 
3555 	if (iter->st_bucket_done)
3556 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3557 
3558 again:
3559 	/* Get a new batch */
3560 	iter->cur_sk = 0;
3561 	iter->end_sk = 0;
3562 
3563 	sk = unix_get_first(seq, pos);
3564 	if (!sk)
3565 		return NULL; /* Done */
3566 
3567 	expected = bpf_iter_unix_hold_batch(seq, sk);
3568 
3569 	if (iter->end_sk == expected) {
3570 		iter->st_bucket_done = true;
3571 		return sk;
3572 	}
3573 
3574 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3575 		resized = true;
3576 		goto again;
3577 	}
3578 
3579 	return sk;
3580 }
3581 
3582 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3583 {
3584 	if (!*pos)
3585 		return SEQ_START_TOKEN;
3586 
3587 	/* bpf iter does not support lseek, so it always
3588 	 * continues from where it was stop()-ped.
3589 	 */
3590 	return bpf_iter_unix_batch(seq, pos);
3591 }
3592 
3593 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3594 {
3595 	struct bpf_unix_iter_state *iter = seq->private;
3596 	struct sock *sk;
3597 
3598 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3599 	 * been handled by seq_show(), so advance to the next sk in
3600 	 * the batch.
3601 	 */
3602 	if (iter->cur_sk < iter->end_sk)
3603 		sock_put(iter->batch[iter->cur_sk++]);
3604 
3605 	++*pos;
3606 
3607 	if (iter->cur_sk < iter->end_sk)
3608 		sk = iter->batch[iter->cur_sk];
3609 	else
3610 		sk = bpf_iter_unix_batch(seq, pos);
3611 
3612 	return sk;
3613 }
3614 
3615 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3616 {
3617 	struct bpf_iter_meta meta;
3618 	struct bpf_prog *prog;
3619 	struct sock *sk = v;
3620 	uid_t uid;
3621 	bool slow;
3622 	int ret;
3623 
3624 	if (v == SEQ_START_TOKEN)
3625 		return 0;
3626 
3627 	slow = lock_sock_fast(sk);
3628 
3629 	if (unlikely(sk_unhashed(sk))) {
3630 		ret = SEQ_SKIP;
3631 		goto unlock;
3632 	}
3633 
3634 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3635 	meta.seq = seq;
3636 	prog = bpf_iter_get_info(&meta, false);
3637 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3638 unlock:
3639 	unlock_sock_fast(sk, slow);
3640 	return ret;
3641 }
3642 
3643 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3644 {
3645 	struct bpf_unix_iter_state *iter = seq->private;
3646 	struct bpf_iter_meta meta;
3647 	struct bpf_prog *prog;
3648 
3649 	if (!v) {
3650 		meta.seq = seq;
3651 		prog = bpf_iter_get_info(&meta, true);
3652 		if (prog)
3653 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3654 	}
3655 
3656 	if (iter->cur_sk < iter->end_sk)
3657 		bpf_iter_unix_put_batch(iter);
3658 }
3659 
3660 static const struct seq_operations bpf_iter_unix_seq_ops = {
3661 	.start	= bpf_iter_unix_seq_start,
3662 	.next	= bpf_iter_unix_seq_next,
3663 	.stop	= bpf_iter_unix_seq_stop,
3664 	.show	= bpf_iter_unix_seq_show,
3665 };
3666 #endif
3667 #endif
3668 
3669 static const struct net_proto_family unix_family_ops = {
3670 	.family = PF_UNIX,
3671 	.create = unix_create,
3672 	.owner	= THIS_MODULE,
3673 };
3674 
3675 
3676 static int __net_init unix_net_init(struct net *net)
3677 {
3678 	int i;
3679 
3680 	net->unx.sysctl_max_dgram_qlen = 10;
3681 	if (unix_sysctl_register(net))
3682 		goto out;
3683 
3684 #ifdef CONFIG_PROC_FS
3685 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3686 			     sizeof(struct seq_net_private)))
3687 		goto err_sysctl;
3688 #endif
3689 
3690 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3691 					      sizeof(spinlock_t), GFP_KERNEL);
3692 	if (!net->unx.table.locks)
3693 		goto err_proc;
3694 
3695 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3696 						sizeof(struct hlist_head),
3697 						GFP_KERNEL);
3698 	if (!net->unx.table.buckets)
3699 		goto free_locks;
3700 
3701 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3702 		spin_lock_init(&net->unx.table.locks[i]);
3703 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3704 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3705 	}
3706 
3707 	return 0;
3708 
3709 free_locks:
3710 	kvfree(net->unx.table.locks);
3711 err_proc:
3712 #ifdef CONFIG_PROC_FS
3713 	remove_proc_entry("unix", net->proc_net);
3714 err_sysctl:
3715 #endif
3716 	unix_sysctl_unregister(net);
3717 out:
3718 	return -ENOMEM;
3719 }
3720 
unix_net_exit(struct net * net)3721 static void __net_exit unix_net_exit(struct net *net)
3722 {
3723 	kvfree(net->unx.table.buckets);
3724 	kvfree(net->unx.table.locks);
3725 	unix_sysctl_unregister(net);
3726 	remove_proc_entry("unix", net->proc_net);
3727 }
3728 
3729 static struct pernet_operations unix_net_ops = {
3730 	.init = unix_net_init,
3731 	.exit = unix_net_exit,
3732 };
3733 
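/* BPF iterator support: when both BPF and procfs are built in, expose a
 * "unix" iterator target.  DEFINE_BPF_ITER_FUNC() declares the context an
 * attached program sees (seq_file meta, the unix_sock being visited, and
 * its owner's uid); the seq operations above feed it batches of sockets.
 */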
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16

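/* Per-iterator private state: hook up the namespace-aware seq_file and
 * allocate the initial batch of INIT_BATCH_SZ socket pointers.  If the
 * batch allocation fails, the seq_net state is torn down again before
 * returning the error.
 */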
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};

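/* Extra helpers available to iterator programs: allow bpf_setsockopt()
 * and bpf_getsockopt() so a program can tune each socket as it is
 * visited, in addition to the default iterator helpers.
 */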
static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

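/* Registration record for the "unix" iterator target.  The single context
 * argument, unix_sk, is a BTF pointer that may be NULL (it is NULL on the
 * final call made from the ->stop() path above).
 */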
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};

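/* A user-space iterator program attaches to this target with the section
 * name "iter/unix".  The sketch below is illustrative only and not part of
 * this file: it assumes the usual libbpf conventions (vmlinux.h,
 * bpf_helpers.h, bpf_tracing.h) and prints one line per visited socket.
 * ctx->unix_sk is NULL on the final call made from the ->stop() path.
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *	#include <bpf/bpf_tracing.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		struct unix_sock *unix_sk = ctx->unix_sk;
 *
 *		if (!unix_sk)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u\n", ctx->uid);
 *		return 0;
 *	}
 *
 * Once loaded, such a program is typically pinned with bpftool
 * ("bpftool iter pin prog.o /sys/fs/bpf/unix") and read like a file.
 */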
static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif

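/* Module init: check that unix_skb_parms fits in skb->cb, set up the
 * global table for pathname-bound sockets, register the datagram and
 * stream protocols and the PF_UNIX family, hook in the per-namespace
 * operations, and (when configured) the BPF iterator and sockmap protos.
 */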
static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}

/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);