Commit c33585c5 authored by Manfred Spraul's avatar Manfred Spraul Committed by Linus Torvalds

[PATCH] pipe bugfix /cleanup

pipe_write contains a wakeup storm, 2 writers that write into the same
fifo can wake each other up, and spend 100% cpu time with
wakeup/schedule, without making any progress.

The only regression I'm aware of is that

  $ dd if=/dev/zero | grep not_there

will fail due to OOM, because grep does something like

	for(;;) {
		rlen = read(fd, buf, len);
		if (rlen == len) {
			len *= 2;
			buf = realloc(buf, len);
		}
	}

if it operates on pipes, and due to the improved syscall merging, read
will always return the maximum possible amount of data. But that's a grep
bug, not a kernel problem.
parent d8ac8dd7
...@@ -25,6 +25,9 @@ ...@@ -25,6 +25,9 @@
* *
* FIFOs and Pipes now generate SIGIO for both readers and writers. * FIFOs and Pipes now generate SIGIO for both readers and writers.
* -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
*
* pipe_read & write cleanup
* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
*/ */
/* Drop the inode semaphore and wait for a pipe event, atomically */ /* Drop the inode semaphore and wait for a pipe event, atomically */
...@@ -44,97 +47,81 @@ static ssize_t ...@@ -44,97 +47,81 @@ static ssize_t
pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{ {
struct inode *inode = filp->f_dentry->d_inode; struct inode *inode = filp->f_dentry->d_inode;
ssize_t size, read, ret; int do_wakeup;
ssize_t ret;
/* Seeks are not allowed on pipes. */
ret = -ESPIPE; /* pread is not allowed on pipes. */
read = 0; if (unlikely(ppos != &filp->f_pos))
if (ppos != &filp->f_pos) return -ESPIPE;
goto out_nolock;
/* Null read succeeds. */
/* Always return 0 on null read. */ if (unlikely(count == 0))
return 0;
do_wakeup = 0;
ret = 0; ret = 0;
if (count == 0) down(PIPE_SEM(*inode));
goto out_nolock; for (;;) {
int size = PIPE_LEN(*inode);
if (size) {
char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
ssize_t chars = PIPE_MAX_RCHUNK(*inode);
/* Get the pipe semaphore */ if (chars > count)
ret = -ERESTARTSYS; chars = count;
if (down_interruptible(PIPE_SEM(*inode))) if (chars > size)
goto out_nolock; chars = size;
if (copy_to_user(buf, pipebuf, chars)) {
if (!ret) ret = -EFAULT;
break;
}
ret += chars;
if (PIPE_EMPTY(*inode)) { PIPE_START(*inode) += chars;
do_more_read: PIPE_START(*inode) &= (PIPE_SIZE - 1);
ret = 0; PIPE_LEN(*inode) -= chars;
count -= chars;
buf += chars;
do_wakeup = 1;
}
if (!count)
break; /* common path: read succeeded */
if (PIPE_LEN(*inode)) /* test for cyclic buffers */
continue;
if (!PIPE_WRITERS(*inode)) if (!PIPE_WRITERS(*inode))
goto out; break;
if (!PIPE_WAITING_WRITERS(*inode)) {
ret = -EAGAIN; /* syscall merging: Usually we must not sleep
if (filp->f_flags & O_NONBLOCK) * if O_NONBLOCK is set, or if we got some data.
goto out; * But if a writer sleeps in kernel space, then
* we can wait for that data without violating POSIX.
for (;;) { */
PIPE_WAITING_READERS(*inode)++; if (ret)
pipe_wait(inode);
PIPE_WAITING_READERS(*inode)--;
ret = -ERESTARTSYS;
if (signal_pending(current))
goto out;
ret = 0;
if (!PIPE_EMPTY(*inode))
break; break;
if (!PIPE_WRITERS(*inode)) if (filp->f_flags & O_NONBLOCK) {
goto out; ret = -EAGAIN;
break;
}
} }
if (signal_pending(current)) {
if (!ret) ret = -ERESTARTSYS;
break;
}
if (do_wakeup) {
wake_up_interruptible(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
}
pipe_wait(inode);
} }
up(PIPE_SEM(*inode));
/* Read what data is available. */ /* Signal writers asynchronously that there is more room. */
ret = -EFAULT; if (do_wakeup) {
while (count > 0 && (size = PIPE_LEN(*inode))) {
char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
ssize_t chars = PIPE_MAX_RCHUNK(*inode);
if (chars > count)
chars = count;
if (chars > size)
chars = size;
if (copy_to_user(buf, pipebuf, chars))
goto out;
read += chars;
PIPE_START(*inode) += chars;
PIPE_START(*inode) &= (PIPE_SIZE - 1);
PIPE_LEN(*inode) -= chars;
count -= chars;
buf += chars;
}
/* Cache behaviour optimization */
if (!PIPE_LEN(*inode))
PIPE_START(*inode) = 0;
if (count && PIPE_WAITING_WRITERS(*inode) && !(filp->f_flags & O_NONBLOCK)) {
/*
* We know that we are going to sleep: signal
* writers synchronously that there is more
* room.
*/
wake_up_interruptible_sync(PIPE_WAIT(*inode)); wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
if (!PIPE_EMPTY(*inode))
BUG();
goto do_more_read;
} }
/* Signal writers asynchronously that there is more room. */ if (ret > 0)
wake_up_interruptible(PIPE_WAIT(*inode)); UPDATE_ATIME(inode);
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
ret = read;
out:
up(PIPE_SEM(*inode));
out_nolock:
if (read)
ret = read;
return ret; return ret;
} }
...@@ -142,116 +129,90 @@ static ssize_t ...@@ -142,116 +129,90 @@ static ssize_t
pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
{ {
struct inode *inode = filp->f_dentry->d_inode; struct inode *inode = filp->f_dentry->d_inode;
ssize_t free, written, ret; ssize_t ret;
size_t min;
/* Seeks are not allowed on pipes. */ int do_wakeup;
ret = -ESPIPE;
written = 0; /* pwrite is not allowed on pipes. */
if (ppos != &filp->f_pos) if (unlikely(ppos != &filp->f_pos))
goto out_nolock; return -ESPIPE;
/* Null write succeeds. */ /* Null write succeeds. */
if (unlikely(count == 0))
return 0;
do_wakeup = 0;
ret = 0; ret = 0;
if (count == 0) min = count;
goto out_nolock; if (min > PIPE_BUF)
min = 1;
ret = -ERESTARTSYS; down(PIPE_SEM(*inode));
if (down_interruptible(PIPE_SEM(*inode))) for (;;) {
goto out_nolock; int free;
if (!PIPE_READERS(*inode)) {
/* No readers yields SIGPIPE. */ send_sig(SIGPIPE, current, 0);
if (!PIPE_READERS(*inode)) if (!ret) ret = -EPIPE;
goto sigpipe; break;
/* If count <= PIPE_BUF, we have to make it atomic. */
free = (count <= PIPE_BUF ? count : 1);
/* Wait, or check for, available space. */
if (filp->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
if (PIPE_FREE(*inode) < free)
goto out;
} else {
while (PIPE_FREE(*inode) < free) {
PIPE_WAITING_WRITERS(*inode)++;
pipe_wait(inode);
PIPE_WAITING_WRITERS(*inode)--;
ret = -ERESTARTSYS;
if (signal_pending(current))
goto out;
if (!PIPE_READERS(*inode))
goto sigpipe;
} }
} free = PIPE_FREE(*inode);
if (free >= min) {
/* Copy into available space. */ /* transfer data */
ret = -EFAULT; ssize_t chars = PIPE_MAX_WCHUNK(*inode);
while (count > 0) { char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
int space; /* Always wakeup, even if the copy fails. Otherwise
char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode); * we lock up (O_NONBLOCK-)readers that sleep due to
ssize_t chars = PIPE_MAX_WCHUNK(*inode); * syscall merging.
*/
if ((space = PIPE_FREE(*inode)) != 0) { do_wakeup = 1;
if (chars > count) if (chars > count)
chars = count; chars = count;
if (chars > space) if (chars > free)
chars = space; chars = free;
if (copy_from_user(pipebuf, buf, chars)) if (copy_from_user(pipebuf, buf, chars)) {
goto out; if (!ret) ret = -EFAULT;
break;
}
written += chars; ret += chars;
PIPE_LEN(*inode) += chars; PIPE_LEN(*inode) += chars;
count -= chars; count -= chars;
buf += chars; buf += chars;
space = PIPE_FREE(*inode); }
if (!count)
break;
if (PIPE_FREE(*inode) && ret) {
/* handle cyclic data buffers */
min = 1;
continue; continue;
} }
if (filp->f_flags & O_NONBLOCK) {
ret = written; if (!ret) ret = -EAGAIN;
if (filp->f_flags & O_NONBLOCK)
break; break;
}
do { if (signal_pending(current)) {
/* if (!ret) ret = -ERESTARTSYS;
* Synchronous wake-up: it knows that this process break;
* is going to give up this CPU, so it doesn't have }
* to do idle reschedules. if (do_wakeup) {
*/
wake_up_interruptible_sync(PIPE_WAIT(*inode)); wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
PIPE_WAITING_WRITERS(*inode)++; do_wakeup = 0;
pipe_wait(inode); }
PIPE_WAITING_WRITERS(*inode)--; PIPE_WAITING_WRITERS(*inode)++;
if (signal_pending(current)) pipe_wait(inode);
goto out; PIPE_WAITING_WRITERS(*inode)--;
if (!PIPE_READERS(*inode))
goto sigpipe;
} while (!PIPE_FREE(*inode));
ret = -EFAULT;
} }
/* Signal readers asynchronously that there is more data. */
wake_up_interruptible(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty(inode);
out:
up(PIPE_SEM(*inode)); up(PIPE_SEM(*inode));
out_nolock: if (do_wakeup) {
if (written) wake_up_interruptible(PIPE_WAIT(*inode));
ret = written; kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
}
if (ret > 0) {
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty(inode);
}
return ret; return ret;
sigpipe:
if (written)
goto out;
up(PIPE_SEM(*inode));
send_sig(SIGPIPE, current, 0);
return -EPIPE;
} }
static ssize_t static ssize_t
...@@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* inode) ...@@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* inode)
PIPE_BASE(*inode) = (char*) page; PIPE_BASE(*inode) = (char*) page;
PIPE_START(*inode) = PIPE_LEN(*inode) = 0; PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0; PIPE_WAITING_WRITERS(*inode) = 0;
PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL; *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
......
...@@ -9,7 +9,6 @@ struct pipe_inode_info { ...@@ -9,7 +9,6 @@ struct pipe_inode_info {
unsigned int start; unsigned int start;
unsigned int readers; unsigned int readers;
unsigned int writers; unsigned int writers;
unsigned int waiting_readers;
unsigned int waiting_writers; unsigned int waiting_writers;
unsigned int r_counter; unsigned int r_counter;
unsigned int w_counter; unsigned int w_counter;
...@@ -28,7 +27,6 @@ struct pipe_inode_info { ...@@ -28,7 +27,6 @@ struct pipe_inode_info {
#define PIPE_LEN(inode) ((inode).i_pipe->len) #define PIPE_LEN(inode) ((inode).i_pipe->len)
#define PIPE_READERS(inode) ((inode).i_pipe->readers) #define PIPE_READERS(inode) ((inode).i_pipe->readers)
#define PIPE_WRITERS(inode) ((inode).i_pipe->writers) #define PIPE_WRITERS(inode) ((inode).i_pipe->writers)
#define PIPE_WAITING_READERS(inode) ((inode).i_pipe->waiting_readers)
#define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers)
#define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter)
#define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment