/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2
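
/*
 * Illustrative userspace sketch (not part of this file) of the
 * "pipe as semaphore" pattern the comment above describes, as used by
 * the GNU make jobserver. Tokens are single bytes: acquiring one is a
 * read(), releasing one is a write(). With fewer than two pipe buffers,
 * the releasing write() can block even though the pipe is not full.
 * All names below are hypothetical; error handling is omitted.
 *
 *	#include <unistd.h>
 *
 *	static void sem_pipe_init(int fds[2], int tokens)
 *	{
 *		pipe(fds);
 *		while (tokens-- > 0)
 *			write(fds[1], "t", 1);	// preload tokens
 *	}
 *
 *	static void sem_pipe_wait(int fds[2])	// acquire a token
 *	{
 *		char c;
 *		read(fds[0], &c, 1);
 *	}
 *
 *	static void sem_pipe_post(int fds[2])	// return a token
 *	{
 *		write(fds[1], "t", 1);
 *	}
 */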

/*
 * The max size that a non-root user is allowed to grow the pipe to. Can
 * be set by root in /proc/sys/fs/pipe-max-size.
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/* Maximum allocatable pages per user. The hard limit is unset by default;
 * the soft limit matches the default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}
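
/*
 * The address comparison above is the classic ABBA-deadlock avoidance
 * idiom: every caller takes the two locks in the same global order, so
 * no two callers can each hold one lock while waiting for the other.
 * A minimal userspace sketch of the same idea (hypothetical helper;
 * pthread mutexes standing in for the pipe mutexes):
 *
 *	#include <pthread.h>
 *
 *	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
 *	{
 *		if (a < b) {			// lower address first
 *			pthread_mutex_lock(a);
 *			pthread_mutex_lock(b);
 *		} else {
 *			pthread_mutex_lock(b);
 *			pthread_mutex_lock(a);
 *		}
 *	}
 */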

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 with the page
 *	locked. The caller may then reuse the page for whatever it
 *	wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference count of one is golden: it means that the owner of
	 * this page is the only one holding a reference to it. Lock the
	 * page and return 0.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}
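
/*
 * Userspace view of packet mode (a sketch, not part of this file):
 * opening a pipe with O_DIRECT makes each write() a packet, and each
 * read() returns at most one packet, as implemented by the
 * PIPE_BUF_FLAG_PACKET handling in pipe_read() above.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fds[2];
 *		char buf[16];
 *
 *		pipe2(fds, O_DIRECT);			// packet mode
 *		write(fds[1], "ab", 2);			// packet 1
 *		write(fds[1], "cd", 2);			// packet 2
 *		return read(fds[0], buf, sizeof(buf));	// 2 ("ab"), not 4
 *	}
 */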

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	ssize_t ret = 0;
	int do_wakeup = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			ret = ops->confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}
			do_wakeup = 1;
			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = copied;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			if (!iov_iter_count(from))
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	__pipe_unlock(pipe);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			__pipe_lock(pipe);
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			__pipe_unlock(pipe);

			return put_user(count, (int __user *)arg);
		default:
			return -ENOIOCTLCMD;
	}
}
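
/*
 * Userspace sketch (illustrative only) of the FIONREAD path above:
 * the ioctl reports how many bytes are currently buffered in the pipe.
 *
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fds[2], n = 0;
 *
 *		pipe(fds);
 *		write(fds[1], "hello", 5);
 *		ioctl(fds[0], FIONREAD, &n);	// n == 5
 *		return n == 5 ? 0 : 1;
 *	}
 */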

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct pipe_inode_info *pipe = filp->private_data;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore.  */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}
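
/*
 * Userspace sketch (illustrative only) of what pipe_poll() reports:
 * POLLIN once data is buffered, POLLOUT while there is room to write.
 *
 *	#include <poll.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fds[2];
 *		struct pollfd pfd;
 *
 *		pipe(fds);
 *		write(fds[1], "x", 1);
 *
 *		pfd.fd = fds[0];
 *		pfd.events = POLLIN;
 *		poll(&pfd, 1, -1);		// returns immediately
 *		return (pfd.revents & POLLIN) ? 0 : 1;
 *	}
 */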

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}
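
/*
 * Userspace sketch (illustrative only): pipe_fasync() is what runs when
 * a process requests SIGIO-driven I/O on a pipe end via O_ASYNC.
 *
 *	#include <fcntl.h>
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	static void on_sigio(int sig) { (void)sig; }
 *
 *	int main(void)
 *	{
 *		int fds[2];
 *
 *		pipe(fds);
 *		signal(SIGIO, on_sigio);
 *		fcntl(fds[0], F_SETOWN, getpid());  // who gets the signal
 *		fcntl(fds[0], F_SETFL, O_ASYNC);    // ends up in pipe_fasync()
 *		write(fds[1], "x", 1);              // owner is sent SIGIO
 *		return 0;
 *	}
 */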

static void account_pipe_buffers(struct pipe_inode_info *pipe,
                                 unsigned long old, unsigned long new)
{
	atomic_long_add(new - old, &pipe->user->pipe_bufs);
}

static bool too_many_pipe_buffers_soft(struct user_struct *user)
{
	return pipe_user_pages_soft &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
}

static bool too_many_pipe_buffers_hard(struct user_struct *user)
{
	return pipe_user_pages_hard &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
		struct user_struct *user = get_current_user();

		if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE))
			pipe_bufs = pipe_max_size >> PAGE_SHIFT;

		if (!too_many_pipe_buffers_hard(user)) {
			if (too_many_pipe_buffers_soft(user))
				pipe_bufs = PIPE_MIN_DEF_BUFFERS;
			pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
		}

		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->buffers = pipe_bufs;
			pipe->user = user;
			account_pipe_buffers(pipe, 0, pipe_bufs);
			mutex_init(&pipe->mutex);
			return pipe;
		}
		free_uid(user);
		kfree(pipe);
	}

	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	account_pipe_buffers(pipe, pipe->buffers, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	int err;
	struct inode *inode = get_pipe_inode();
	struct file *f;
	struct path path;
	static struct qstr name = { .name = "" };

	if (!inode)
		return -ENFILE;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
	if (IS_ERR(f)) {
		err = PTR_ERR(f);
		goto err_dentry;
	}

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->private_data = inode->i_pipe;

	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
	if (IS_ERR(res[0])) {
		err = PTR_ERR(res[0]);
		goto err_file;
	}

	path_get(&path);
	res[0]->private_data = inode->i_pipe;
	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
	res[1] = f;
	return 0;

err_file:
	put_filp(f);
err_dentry:
	free_pipe_info(inode->i_pipe);
	path_put(&path);
	return err;

err_inode:
	free_pipe_info(inode->i_pipe);
	iput(inode);
	return err;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}
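
/*
 * Userspace sketch (illustrative only) of the pipe2() entry point above.
 * fds[0] is the read end and fds[1] the write end; the flags apply to both.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fds[2];
 *
 *		if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) < 0)
 *			return 1;
 *		write(fds[1], "hi", 2);
 *		close(fds[0]);
 *		close(fds[1]);
 *		return 0;
 *	}
 */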

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	filp->f_mode &= (FMODE_READ | FMODE_WRITE);

	switch (filp->f_mode) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress POLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}
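
/*
 * Userspace sketch (illustrative only) of the O_NONBLOCK rules that
 * fifo_open() implements; /tmp/f is a hypothetical path.
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int wr, rd;
 *
 *		mkfifo("/tmp/f", 0600);
 *		wr = open("/tmp/f", O_WRONLY | O_NONBLOCK);
 *		// wr == -1 with errno == ENXIO: no reader yet.
 *		rd = open("/tmp/f", O_RDONLY | O_NONBLOCK);
 *		// rd >= 0: a non-blocking read open succeeds even
 *		// with no writer present.
 *		(void)wr;
 *		close(rd);
 *		unlink("/tmp/f");
 *		return 0;
 *	}
 */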

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	if (!nr_pages)
		return -EINVAL;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	account_pipe_buffers(pipe, pipe->buffers, nr_pages);
	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	if (size < pipe_min_size)
		size = pipe_min_size;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages == 0)
		return 0;

	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
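
/*
 * Worked example, assuming 4 KiB pages (PAGE_SIZE == 4096, PAGE_SHIFT
 * == 12) and pipe_min_size == PAGE_SIZE:
 *
 *	round_pipe_size(0)     -> clamped to 4096 -> 1 page       ->  4096
 *	round_pipe_size(4097)  -> 2 pages, already a power of two ->  8192
 *	round_pipe_size(20000) -> 5 pages, rounded up to 8        -> 32768
 */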

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	unsigned int rounded_pipe_max_size;
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	rounded_pipe_max_size = round_pipe_size(pipe_max_size);
	if (rounded_pipe_max_size == 0)
		return -EINVAL;

	pipe_max_size = rounded_pipe_max_size;
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		} else if ((too_many_pipe_buffers_hard(pipe->user) ||
			    too_many_pipe_buffers_soft(pipe->user)) &&
			   !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
		}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	__pipe_unlock(pipe);
	return ret;
}
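
/*
 * Userspace sketch (illustrative only) of the fcntl() commands handled
 * above. Sizes are rounded up by round_pipe_size(), so F_GETPIPE_SZ can
 * report more than was requested.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fds[2];
 *		long size;
 *
 *		pipe(fds);
 *		fcntl(fds[1], F_SETPIPE_SZ, 1);		// rounded up to one page
 *		size = fcntl(fds[1], F_GETPIPE_SZ);	// e.g. 4096
 *		fcntl(fds[1], F_SETPIPE_SZ, 1 << 20);	// grow to 1 MiB
 *		size = fcntl(fds[1], F_GETPIPE_SZ);	// 1048576
 *		return size > 0 ? 0 : 1;
 *	}
 */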

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we
 * don't need any operations on the root directory. However, we need a
 * non-trivial d_name - pipe: will go nicely and kill the special-casing
 * in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);