Commit f751cfc0 authored by Davide Libenzi's avatar Davide Libenzi Committed by Linus Torvalds

[PATCH] sys_epoll 0.15

Latest version of the epoll interfaces.
parent ecf2c214
......@@ -737,6 +737,10 @@ ENTRY(sys_call_table)
.long sys_free_hugepages
.long sys_exit_group
.long sys_lookup_dcookie
.long sys_epoll_create
.long sys_epoll_ctl /* 255 */
.long sys_epoll_wait
.rept NR_syscalls-(.-sys_call_table)/4
.long sys_ni_syscall
......
......@@ -7,14 +7,14 @@
#
FONTMAPFILE = cp437.uni
obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o
obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o eventpoll.o
# All of the (potential) objects that export symbols.
# This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'.
export-objs := busmouse.o vt.o generic_serial.o ip2main.o \
ite_gpio.o keyboard.o misc.o nvram.o random.o rtc.o \
selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o
selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o eventpoll.o
obj-$(CONFIG_VT) += vt_ioctl.o vc_screen.o consolemap.o consolemap_deftbl.o selection.o keyboard.o
obj-$(CONFIG_HW_CONSOLE) += vt.o defkeymap.o
......
/*
* drivers/char/eventpoll.c ( Efficient event polling implementation )
* Copyright (C) 2001,...,2002 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Davide Libenzi <davidel@xmailserver.org>
*
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/miscdevice.h>
#include <linux/random.h>
#include <linux/smp_lock.h>
#include <linux/wrapper.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/fcblist.h>
#include <linux/rwsem.h>
#include <asm/bitops.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <asm/atomic.h>
#include <linux/eventpoll.h>
#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */
#define DEBUG_EPOLL 0
#if DEBUG_EPOLL > 0
#define DPRINTK(x) printk x
#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
#else /* #if DEBUG_EPOLL > 0 */
#define DPRINTK(x) (void) 0
#define DNPRINTK(n, x) (void) 0
#endif /* #if DEBUG_EPOLL > 0 */
#define DEBUG_DPI 0
#if DEBUG_DPI != 0
#define DPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
#else /* #if DEBUG_DPI != 0 */
#define DPI_SLAB_DEBUG 0
#endif /* #if DEBUG_DPI != 0 */
#define INITIAL_HASH_BITS 7
#define MAX_HASH_BITS 18
#define RESIZE_LENGTH 2
#define DPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(dpi_cache, SLAB_KERNEL)
#define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p)
#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
/*
 * Type used for versioning event snapshots inside the double buffer.
 * Both "struct eventpoll" and "struct epitem" carry a member of this type.
 */
typedef unsigned long long event_version_t;
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
/*
 * Protects the eventpoll interface from sys_epoll_ctl(2) and ->write()
 * concurrency. It basically serializes the add/remove/edit of items in
 * the interest set.
 */
struct rw_semaphore acsem;
/*
 * Protects access to this structure. When "acsem" is acquired
 * together with this one, "acsem" should be acquired first. In other
 * words, "lock" nests inside "acsem".
 */
rwlock_t lock;
/* Wait queue used by sys_epoll_wait() and ioctl(EP_POLL) */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* This is the hash used to store the "struct epitem" elements */
struct list_head *hash;
/* Number of hash bits; the hash holds (1 << hbits) buckets */
unsigned int hbits;
/* Bucket index mask, always (1 << hbits) - 1 */
unsigned int hmask;
/* Number of items currently linked into the hash */
atomic_t hents;
/* Non-zero while a hash resize has been requested and is not yet complete */
atomic_t resize;
/* Number of pages currently allocated in each side of the double buffer */
int numpages;
/*
 * Current page set pointer, switched between "pages0" and "pages1" each
 * time ep_poll() returns events to the caller.
 */
char **pages;
/*
 * Each one of these contains the pages allocated for one side of
 * the double buffer.
 */
char *pages0[MAX_EVENTPOLL_PAGES];
char *pages1[MAX_EVENTPOLL_PAGES];
/*
 * Variable containing the vma base address where the double buffer
 * pages are mapped onto.
 */
unsigned long vmabase;
/*
 * Certain functions cannot be called if the double buffer pages are
 * not allocated and if the memory mapping is not in place. This tells
 * us that everything is set up to fully use the interface.
 */
atomic_t mmapped;
/* Number of events currently available inside the current snapshot */
int eventcnt;
/*
 * Variable storing the current "version" of the snapshot. It is used
 * to check whether the slot pointed to by the "index" member of a
 * "struct epitem" still belongs to the current snapshot.
 */
event_version_t ver;
};
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the hash.
 */
struct epitem {
/* List header used to link this structure to the eventpoll hash */
struct list_head llink;
/* The "container" of this item */
struct eventpoll *ep;
/*
 * The file this item refers to. ep_insert() drops its fget() reference
 * before returning, so this pointer does not hold a file reference.
 */
struct file *file;
/* The structure that describes the events of interest and the source fd */
struct pollfd pfd;
/*
 * The index inside the current double buffer that stores the active
 * event slot for this item ( file ). It is -1 right after insertion.
 */
int index;
/*
 * The version that is used to validate if the current slot is still
 * valid or if it refers to an old snapshot. It is matched together
 * with the one inside the eventpoll structure.
 */
event_version_t ver;
};
/*
 * Forward declarations of the file-local helpers, the file/vm operation
 * callbacks and the pseudo file system hooks defined later in this file.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
static int ep_alloc_pages(char **pages, int numpages);
static int ep_free_pages(char **pages, int numpages);
static int ep_init(struct eventpoll *ep);
static void ep_free(struct eventpoll *ep);
static struct epitem *ep_find_nl(struct eventpoll *ep, int fd);
static struct epitem *ep_find(struct eventpoll *ep, int fd);
static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags);
static int ep_insert(struct eventpoll *ep, struct pollfd *pfd);
static int ep_remove(struct eventpoll *ep, struct epitem *dpi);
static void notify_proc(struct file *file, void *data, unsigned long *local,
long *event);
static int open_eventpoll(struct inode *inode, struct file *file);
static int close_eventpoll(struct inode *inode, struct file *file);
static unsigned int poll_eventpoll(struct file *file, poll_table *wait);
static int write_eventpoll(struct file *file, const char *buffer, size_t count,
loff_t *ppos);
static int ep_poll(struct eventpoll *ep, struct evpoll *dvp);
static int ep_do_alloc_pages(struct eventpoll *ep, int numpages);
static int ioctl_eventpoll(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg);
static void eventpoll_mm_open(struct vm_area_struct * vma);
static void eventpoll_mm_close(struct vm_area_struct * vma);
static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma);
static int eventpollfs_delete_dentry(struct dentry *dentry);
static struct inode *get_eventpoll_inode(void);
static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
int flags, char *dev_name, void *data);
/* Slab cache used to allocate "struct epitem"; created in eventpoll_init() */
static kmem_cache_t *dpi_cache;
/*
 * Mount of the virtual fs used to allocate inodes for eventpoll files;
 * obtained through kern_mount() in eventpoll_init().
 */
static struct vfsmount *eventpoll_mnt;
/*
 * Operations table of an eventpoll file. IS_FILE_EPOLL() recognizes an
 * eventpoll file by comparing its "f_op" pointer against this table.
 */
static struct file_operations eventpoll_fops = {
	.open = open_eventpoll,
	.release = close_eventpoll,
	.write = write_eventpoll,
	.poll = poll_eventpoll,
	.ioctl = ioctl_eventpoll,
	.mmap = mmap_eventpoll,
};
/*
 * VMA callbacks installed by mmap_eventpoll(); they keep the "mmapped"
 * counter of the eventpoll structure up to date.
 */
static struct vm_operations_struct eventpoll_mmap_ops = {
	.close = eventpoll_mm_close,
	.open = eventpoll_mm_open,
};
/*
 * The "struct miscdevice" is used to register the eventpoll device
 * to make it suitable to be opened from a /dev file. The initializer is
 * positional: minor number, device name, file operations.
 */
static struct miscdevice eventpoll_miscdev = {
EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops
};
/*
 * File system type of the virtual file system that eventpoll inodes are
 * allocated from. It is registered and mounted in eventpoll_init().
 */
static struct file_system_type eventpoll_fs_type = {
	.name		= "eventpollfs",
	.kill_sb	= kill_anon_super,
	.get_sb		= eventpollfs_get_sb,
};
/*
 * Very basic directory entry operations for the eventpoll virtual file
 * system; ep_getfd() attaches them to every dentry it allocates.
 */
static struct dentry_operations eventpollfs_dentry_operations = {
.d_delete = eventpollfs_delete_dentry,
};
/*
 * Kernel part of the userspace epoll_create(2). It opens an eventpoll file
 * descriptor, allocates the event double buffer sized for "maxfds" file
 * descriptors and maps that buffer read-only into the caller address space.
 *
 * Returns the new file descriptor on success or a negative errno value:
 * -EINVAL if "maxfds" is negative or greater than MAX_FDS_IN_EVENTPOLL,
 * otherwise whatever error the failing setup step reported.
 */
asmlinkage int sys_epoll_create(int maxfds)
{
	int error = -EINVAL, fd;
	unsigned long addr;
	struct inode *inode;
	struct file *file;
	struct eventpoll *ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
		     current, maxfds));

	/*
	 * It is not possible to store more than MAX_FDS_IN_EVENTPOLL file
	 * descriptors inside the eventpoll interface. A negative count is
	 * rejected too, so that it never reaches the page count and mapping
	 * size computations below.
	 */
	if (maxfds < 0 || maxfds > MAX_FDS_IN_EVENTPOLL)
		goto eexit_1;

	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure, an inode and a free file descriptor.
	 */
	error = ep_getfd(&fd, &inode, &file);
	if (error)
		goto eexit_1;

	/*
	 * Calls the code to initialize the eventpoll file. This code is
	 * the same as the "open" file operation callback because inside
	 * ep_getfd() we did what the kernel usually does before invoking
	 * the corresponding file "open" callback.
	 */
	error = open_eventpoll(inode, file);
	if (error)
		goto eexit_2;

	/* The "private_data" member is setup by open_eventpoll() */
	ep = file->private_data;

	/* Alloc pages for the event double buffer */
	error = ep_do_alloc_pages(ep, EP_FDS_PAGES(maxfds + 1));
	if (error)
		goto eexit_2;

	/*
	 * Create a user space mapping of the event double buffer to
	 * avoid kernel to user space memory copy when returning events
	 * to the caller.
	 */
	down_write(&current->mm->mmap_sem);
	addr = do_mmap_pgoff(file, 0, EP_MAP_SIZE(maxfds + 1), PROT_READ,
			     MAP_PRIVATE, 0);
	up_write(&current->mm->mmap_sem);
	error = PTR_ERR((void *) addr);
	if (IS_ERR((void *) addr))
		goto eexit_2;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, maxfds, fd));

	return fd;

eexit_2:
	/*
	 * The descriptor is already installed, so closing it runs the
	 * "release" callback, which frees whatever open_eventpoll() set up.
	 */
	sys_close(fd);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, maxfds, error));
	return error;
}
/*
 * The following function implements the controller interface for the
 * eventpoll file that enables the insertion/removal/change of file
 * descriptors inside the interest set. It represents the kernel part of
 * the user space epoll_ctl(2).
 *
 * "op" is one of EP_CTL_ADD, EP_CTL_DEL or EP_CTL_MOD and "events" is the
 * event mask of interest for "fd". POLLERR and POLLHUP are always added to
 * the mask stored for the item.
 *
 * Returns 0 on success or a negative errno value: -EBADF if "epfd" is not
 * an open descriptor, -EINVAL if it is not an eventpoll file or "op" is
 * unknown, -EEXIST when adding an fd that is already present, -ENOENT when
 * removing or modifying an fd that is not present, or the error reported
 * by ep_insert() / ep_remove().
 */
asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events)
{
	int error = -EBADF;
	struct file *file;
	struct eventpoll *ep;
	struct epitem *dpi;
	struct pollfd pfd;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u)\n",
		     current, epfd, op, fd, events));

	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Serialize against every other change of the interest set */
	down_write(&ep->acsem);

	pfd.fd = fd;
	pfd.events = events | POLLERR | POLLHUP;
	pfd.revents = 0;

	dpi = ep_find(ep, fd);

	error = -EINVAL;
	switch (op) {
	case EP_CTL_ADD:
		if (!dpi)
			error = ep_insert(ep, &pfd);
		else
			error = -EEXIST;
		break;
	case EP_CTL_DEL:
		if (dpi)
			error = ep_remove(ep, dpi);
		else
			error = -ENOENT;
		break;
	case EP_CTL_MOD:
		if (dpi) {
			/*
			 * Store the same mask an EP_CTL_ADD would have stored,
			 * so that a modify does not silently stop the delivery
			 * of error and hangup conditions.
			 */
			dpi->pfd.events = pfd.events;
			error = 0;
		} else
			error = -ENOENT;
		break;
	default:
		/* Unknown operation, "error" is already -EINVAL */
		break;
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u) = %d\n",
		     current, epfd, op, fd, events, error));

	up_write(&ep->acsem);

eexit_2:
	fput(file);
eexit_1:
	return error;
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout)
{
int error = -EBADF;
void *eaddr;
struct file *file;
struct eventpoll *ep;
struct evpoll dvp;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d)\n",
current, epfd, events, timeout));
file = fget(epfd);
if (!file)
goto eexit_1;
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!IS_FILE_EPOLL(file))
goto eexit_2;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data;
/*
* It is possible that the user created an eventpoll file by open()ing
* the corresponding /dev/ file and he did not perform the correct
* initialization required by the old /dev/epoll interface. This test
* protect us from this scenario.
*/
error = -EINVAL;
if (!atomic_read(&ep->mmapped))
goto eexit_2;
dvp.ep_timeout = timeout;
error = ep_poll(ep, &dvp);
if (error > 0) {
eaddr = (void *) (ep->vmabase + dvp.ep_resoff);
if (copy_to_user(events, &eaddr, sizeof(struct pollfd *)))
error = -EFAULT;
}
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d) = %d\n",
current, epfd, events, timeout, error));
eexit_2:
fput(file);
eexit_1:
return error;
}
/*
 * Creates the file descriptor to be used by the epoll interface.
 *
 * Allocates an empty file, an inode from the eventpoll file system, a free
 * descriptor and a dentry, wires them together and installs the file into
 * the descriptor. On success the descriptor, inode and file are stored in
 * "*efd", "*einode" and "*efile" and 0 is returned. On failure a negative
 * errno value is returned and everything acquired so far is released
 * through the eexit_N labels in reverse order of acquisition.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
struct qstr this;
char name[32];
struct dentry *dentry;
struct inode *inode;
struct file *file;
int error, fd;
/* Get a ready to use file */
error = -ENFILE;
file = get_empty_filp();
if (!file)
goto eexit_1;
/* Allocates an inode from the eventpoll file system */
inode = get_eventpoll_inode();
error = PTR_ERR(inode);
if (IS_ERR(inode))
goto eexit_2;
/* Allocates a free descriptor to plug the file onto */
error = get_unused_fd();
if (error < 0)
goto eexit_3;
fd = error;
/*
 * Link the inode to a directory entry by creating a unique name
 * using the inode number. The inode number doubles as the name hash.
 */
error = -ENOMEM;
sprintf(name, "[%lu]", inode->i_ino);
this.name = name;
this.len = strlen(name);
this.hash = inode->i_ino;
dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
if (!dentry)
goto eexit_4;
dentry->d_op = &eventpollfs_dentry_operations;
d_add(dentry, inode);
file->f_vfsmnt = mntget(eventpoll_mnt);
/*
 * NOTE(review): d_alloc() presumably already returns a referenced dentry;
 * confirm that the extra dget() here does not leave a reference behind.
 */
file->f_dentry = dget(dentry);
/*
 * Initialize the file as read/write because it could be used
 * with write() to add/remove/change interest sets.
 */
file->f_pos = 0;
file->f_flags = O_RDWR;
file->f_op = &eventpoll_fops;
file->f_mode = FMODE_READ | FMODE_WRITE;
file->f_version = 0;
/* Left NULL here; open_eventpoll() stores the eventpoll structure later */
file->private_data = NULL;
/* Install the newly set up file into the allocated fd. */
fd_install(fd, file);
*efd = fd;
*einode = inode;
*efile = file;
return 0;
eexit_4:
put_unused_fd(fd);
eexit_3:
iput(inode);
eexit_2:
put_filp(file);
eexit_1:
return error;
}
static int ep_alloc_pages(char **pages, int numpages)
{
int ii;
for (ii = 0; ii < numpages; ii++) {
pages[ii] = (char *) __get_free_pages(GFP_KERNEL, 0);
if (!pages[ii]) {
for (--ii; ii >= 0; ii--) {
ClearPageReserved(virt_to_page(pages[ii]));
free_pages((unsigned long) pages[ii], 0);
}
return -ENOMEM;
}
SetPageReserved(virt_to_page(pages[ii]));
}
return 0;
}
/*
 * Clears the reserved mark of the first "numpages" pages listed in "pages"
 * and frees them. Always returns 0.
 */
static int ep_free_pages(char **pages, int numpages)
{
	int idx = 0;

	while (idx < numpages) {
		char *pg = pages[idx++];

		ClearPageReserved(virt_to_page(pg));
		free_pages((unsigned long) pg, 0);
	}

	return 0;
}
static int ep_init(struct eventpoll *ep)
{
int ii, hentries;
init_rwsem(&ep->acsem);
rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
ep->hbits = INITIAL_HASH_BITS;
ep->hmask = (1 << ep->hbits) - 1;
atomic_set(&ep->hents, 0);
atomic_set(&ep->resize, 0);
atomic_set(&ep->mmapped, 0);
ep->numpages = 0;
ep->vmabase = 0;
ep->pages = ep->pages0;
ep->eventcnt = 0;
ep->ver = 1;
hentries = ep->hmask + 1;
if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head))))
return -ENOMEM;
for (ii = 0; ii < hentries; ii++)
INIT_LIST_HEAD(&ep->hash[ii]);
return 0;
}
/*
 * Releases everything owned by an eventpoll structure except the structure
 * itself: every item still linked in the hash, the hash table and both
 * sides of the event double buffer.
 */
static void ep_free(struct eventpoll *ep)
{
	int bucket;

	/*
	 * Drain every bucket: unregister the file callback of each item,
	 * unlink the item and give it back to the slab cache.
	 */
	for (bucket = 0; bucket <= ep->hmask; bucket++) {
		struct list_head *head = &ep->hash[bucket];

		while (!list_empty(head)) {
			struct epitem *item = list_entry(head->next, struct epitem, llink);

			file_notify_delcb(item->file, notify_proc);
			list_del(head->next);
			DPI_MEM_FREE(item);
		}
	}

	/*
	 * The hash and the double buffer pages can go now. ep_free() is
	 * called from the "close" file operation callback, which guarantees
	 * that the pages are already unmapped.
	 */
	vfree(ep->hash);
	if (ep->numpages > 0) {
		ep_free_pages(ep->pages0, ep->numpages);
		ep_free_pages(ep->pages1, ep->numpages);
	}
}
/*
 * Lock-free flavour of ep_find(): the caller must already hold "ep->lock".
 * Scans the bucket selected by "fd" and returns the item registered for
 * that descriptor, or NULL when there is none.
 */
static struct epitem *ep_find_nl(struct eventpoll *ep, int fd)
{
	struct list_head *bucket = &ep->hash[fd & ep->hmask];
	struct list_head *pos;
	struct epitem *match = NULL;

	list_for_each(pos, bucket) {
		struct epitem *cur = list_entry(pos, struct epitem, llink);

		if (cur->pfd.fd == fd) {
			match = cur;
			break;
		}
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%d) -> %p\n",
		     current, fd, match));

	return match;
}
/*
 * Looks up the item registered for "fd" while holding "ep->lock" for
 * reading with interrupts disabled. Returns the item, or NULL if "fd" is
 * not in the interest set.
 */
static struct epitem *ep_find(struct eventpoll *ep, int fd)
{
	unsigned long irqflags;
	struct epitem *item;

	read_lock_irqsave(&ep->lock, irqflags);
	item = ep_find_nl(ep, fd);
	read_unlock_irqrestore(&ep->lock, irqflags);

	return item;
}
/*
 * Doubles the number of hash buckets and moves every item into the new table.
 *
 * Must be called with "ep->lock" held for writing; "*kflags" carries the
 * IRQ flags saved by the caller. The lock is dropped around vmalloc() and
 * vfree() and is held again ( with "*kflags" updated ) when the function
 * returns. The "resize" counter incremented by the caller is decremented
 * on every exit path.
 *
 * Returns 0 on success, or -ENOMEM if the new table cannot be allocated, in
 * which case the old hash is left untouched.
 */
static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags)
{
struct list_head *hash, *oldhash;
unsigned int hbits = ep->hbits + 1;
unsigned int hmask = (1 << hbits) - 1;
int ii, res, hentries = hmask + 1;
unsigned long flags = *kflags;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_hashresize(%p) bits=%u\n",
current, ep, hbits));
/* Allocate the new table with the lock released */
write_unlock_irqrestore(&ep->lock, flags);
res = -ENOMEM;
if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) {
/* The caller expects the lock to be held on return */
write_lock_irqsave(&ep->lock, flags);
goto eexit_1;
}
for (ii = 0; ii < hentries; ii++)
INIT_LIST_HEAD(&hash[ii]);
write_lock_irqsave(&ep->lock, flags);
oldhash = ep->hash;
/* Rehash every item of the old table using the new, wider mask */
for (ii = 0; ii <= ep->hmask; ii++) {
struct list_head *oldhead = &oldhash[ii], *lnk;
while (!list_empty(oldhead)) {
struct epitem *dpi = list_entry(lnk = oldhead->next, struct epitem, llink);
list_del(lnk);
list_add(lnk, &hash[dpi->pfd.fd & hmask]);
}
}
/* Publish the new table while still holding the lock */
ep->hash = hash;
ep->hbits = hbits;
ep->hmask = hmask;
/* The old table is freed with the lock released */
write_unlock_irqrestore(&ep->lock, flags);
vfree(oldhash);
write_lock_irqsave(&ep->lock, flags);
res = 0;
eexit_1:
*kflags = flags;
atomic_dec(&ep->resize);
return res;
}
static int ep_insert(struct eventpoll *ep, struct pollfd *pfd)
{
int error;
struct epitem *dpi;
struct file *file;
unsigned long flags;
if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE))
return -E2BIG;
file = fget(pfd->fd);
if (!file)
return -EBADF;
error = -ENOMEM;
if (!(dpi = DPI_MEM_ALLOC()))
goto eexit_1;
INIT_LIST_HEAD(&dpi->llink);
dpi->ep = ep;
dpi->file = file;
dpi->pfd = *pfd;
dpi->index = -1;
dpi->ver = ep->ver - 1;
write_lock_irqsave(&ep->lock, flags);
list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]);
atomic_inc(&ep->hents);
if (!atomic_read(&ep->resize) &&
(atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH &&
ep->hbits < MAX_HASH_BITS) {
atomic_inc(&ep->resize);
ep_hashresize(ep, &flags);
}
write_unlock_irqrestore(&ep->lock, flags);
file_notify_addcb(file, notify_proc, dpi);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %d)\n",
current, ep, pfd->fd));
error = 0;
eexit_1:
fput(file);
return error;
}
/*
 * Removes a "struct epitem" from the eventpoll hash and deallocates
 * all the associated resources. Always returns 0.
 *
 * Besides unlinking the item, it drops the event the item may have pending
 * in the current snapshot, so that the slot can never be attributed to a
 * file added later.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *dpi)
{
unsigned long flags;
struct pollfd *pfd, *lpfd;
struct epitem *ldpi;
/* First, removes the callback from the file callback list */
file_notify_delcb(dpi->file, notify_proc);
write_lock_irqsave(&ep->lock, flags);
list_del(&dpi->llink);
atomic_dec(&ep->hents);
/*
 * This is to remove stale events. We don't want the removed file to
 * have a pending event that might be associated with a file inserted
 * at a later time inside the eventpoll interface. This code checks
 * if the currently removed file has a valid pending event and, if it does,
 * manages things to remove it and decrement the currently available
 * event count.
 */
if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) {
pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
EVENT_PAGE_OFFSET(dpi->index));
/*
 * Note that "eventcnt" is decremented as a side effect of the second
 * test. When the removed slot was not the last one, the last slot is
 * moved into the hole and the "index" of its owner is updated.
 */
if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) {
lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] +
EVENT_PAGE_OFFSET(ep->eventcnt));
*pfd = *lpfd;
if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index;
}
}
write_unlock_irqrestore(&ep->lock, flags);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %d)\n",
current, ep, dpi->pfd.fd));
/* At this point it is safe to free the eventpoll item */
DPI_MEM_FREE(dpi);
return 0;
}
/*
* This is the event notify callback that is called from fs/fcblist.c because
* of the registration ( file_notify_addcb() ) done in ep_insert().
*/
static void notify_proc(struct file *file, void *data, unsigned long *local,
long *event)
{
struct epitem *dpi = data;
struct eventpoll *ep = dpi->ep;
struct pollfd *pfd;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: notify(%p, %p, %ld, %ld) ep=%p\n",
current, file, data, event[0], event[1], ep));
/*
* We don't need to disable IRQs here because the callback dispatch
* routine inside fs/fcblist.c already call us with disabled IRQ.
*/
write_lock(&ep->lock);
/* We're not expecting any of those events. Jump out soon ... */
if (!(dpi->pfd.events & event[1]))
goto out;
/*
* This logic determins if an active even slot is available for the
* currently signaled file, or if we have to make space for a new one
* and increment the number of ready file descriptors ( ep->eventcnt ).
*/
if (dpi->index < 0 || dpi->ver != ep->ver) {
if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE))
goto out;
dpi->index = ep->eventcnt++;
dpi->ver = ep->ver;
pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
EVENT_PAGE_OFFSET(dpi->index));
*pfd = dpi->pfd;
} else {
pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
EVENT_PAGE_OFFSET(dpi->index));
if (pfd->fd != dpi->pfd.fd) {
if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE))
goto out;
dpi->index = ep->eventcnt++;
pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
EVENT_PAGE_OFFSET(dpi->index));
*pfd = dpi->pfd;
}
}
/*
* Merge event bits into the corresponding event slot inside the
* double buffer.
*/
pfd->revents |= (pfd->events & event[1]);
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
wake_up(&ep->poll_wait);
out:
write_unlock(&ep->lock);
}
static int open_eventpoll(struct inode *inode, struct file *file)
{
int res;
struct eventpoll *ep;
if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
return -ENOMEM;
memset(ep, 0, sizeof(*ep));
if ((res = ep_init(ep))) {
kfree(ep);
return res;
}
file->private_data = ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: open() ep=%p\n", current, ep));
return 0;
}
static int close_eventpoll(struct inode *inode, struct file *file)
{
struct eventpoll *ep = file->private_data;
if (ep) {
ep_free(ep);
kfree(ep);
}
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
return 0;
}
/*
 * "poll" file operation. Registers the caller on the "poll_wait" queue and
 * reports the file as readable when the current snapshot holds events.
 */
static unsigned int poll_eventpoll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;

	poll_wait(file, &ep->poll_wait, wait);

	return ep->eventcnt ? (POLLIN | POLLRDNORM) : 0;
}
/*
 * "write" file operation. The user buffer is an array of "struct pollfd";
 * each element adds, changes or removes one descriptor of the interest set:
 *
 *  - POLLREMOVE set in "events", or a descriptor that is not open in the
 *    calling process: the descriptor is removed if it is present;
 *  - descriptor already present: its event mask is replaced;
 *  - otherwise: the descriptor is inserted.
 *
 * Returns the number of bytes of the elements that were applied, -EINVAL
 * if "count" is not a multiple of sizeof(struct pollfd), -EFAULT if an
 * element cannot be read, or the error reported by verify_area().
 */
static int write_eventpoll(struct file *file, const char *buffer, size_t count,
			   loff_t *ppos)
{
	int rcount;
	struct eventpoll *ep = file->private_data;
	struct epitem *dpi;
	struct pollfd pfd;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d)\n", current, ep, count));

	/* The size of the write must be a multiple of sizeof(struct pollfd) */
	rcount = -EINVAL;
	if (count % sizeof(struct pollfd))
		goto eexit_1;

	/*
	 * And we have also to verify that that area is correctly accessible
	 * for the user.
	 */
	if ((rcount = verify_area(VERIFY_READ, buffer, count)))
		goto eexit_1;

	/* Serialize against every other change of the interest set */
	down_write(&ep->acsem);

	rcount = 0;

	while (count > 0) {
		if (__copy_from_user(&pfd, buffer, sizeof(pfd))) {
			rcount = -EFAULT;
			goto eexit_2;
		}

		dpi = ep_find(ep, pfd.fd);

		/*
		 * A descriptor that is not open is handled as a removal. The
		 * value comes straight from user space, so a negative one
		 * must be caught before it is used to index the fd table.
		 */
		if (pfd.fd < 0 || pfd.fd >= current->files->max_fds ||
		    !current->files->fd[pfd.fd])
			pfd.events = POLLREMOVE;

		if (pfd.events & POLLREMOVE) {
			if (dpi) {
				ep_remove(ep, dpi);
				rcount += sizeof(pfd);
			}
		}
		else if (dpi) {
			dpi->pfd.events = pfd.events;
			rcount += sizeof(pfd);
		} else {
			pfd.revents = 0;
			if (!ep_insert(ep, &pfd))
				rcount += sizeof(pfd);
		}

		buffer += sizeof(pfd);
		count -= sizeof(pfd);
	}

eexit_2:
	up_write(&ep->acsem);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d) = %d\n",
		     current, ep, count, rcount));
	return rcount;
}
/*
 * Waits until the current snapshot holds events, the timeout stored in
 * "dvp->ep_timeout" ( milliseconds, -1 meaning no limit ) expires or a
 * signal is pending.
 *
 * Returns the number of events in the snapshot handed back to the caller
 * ( with "dvp->ep_resoff" set to the byte offset of that snapshot inside
 * the mapping ), 0 if no event arrived in time, -EINTR if a signal
 * interrupted the wait, or -EINVAL if the double buffer is not mapped.
 */
static int ep_poll(struct eventpoll *ep, struct evpoll *dvp)
{
int res = 0;
long timeout;
unsigned long flags;
wait_queue_t wait;
/*
 * We don't want ep_poll() to be called if the correct sequence
 * of operations has not been performed to initialize it. This won't
 * happen for the system call interface but it could happen using the
 * old /dev/epoll interface, that is maintained for compatibility.
 */
if (!atomic_read(&ep->mmapped))
return -EINVAL;
write_lock_irqsave(&ep->lock, flags);
res = 0;
if (!ep->eventcnt) {
/*
 * We don't have any available event to return to the caller.
 * We need to sleep here, and we will be woken up by
 * notify_proc() when events become available.
 */
init_waitqueue_entry(&wait, current);
add_wait_queue(&ep->wq, &wait);
/*
 * Calculate the timeout by checking for the "infinite" value ( -1 )
 * and the overflow condition ( > MAX_SCHEDULE_TIMEOUT / HZ ). The
 * passed timeout is in milliseconds, that is why (t * HZ) / 1000.
 */
timeout = dvp->ep_timeout == -1 || dvp->ep_timeout > MAX_SCHEDULE_TIMEOUT / HZ ?
MAX_SCHEDULE_TIMEOUT: (dvp->ep_timeout * HZ) / 1000;
for (;;) {
/*
 * We don't want to sleep if notify_proc() sends us
 * a wakeup in between. That's why we set the task state
 * to TASK_INTERRUPTIBLE before doing the checks.
 */
set_current_state(TASK_INTERRUPTIBLE);
if (ep->eventcnt || !timeout)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
/* Sleep with the lock released, then take it again to re-check */
write_unlock_irqrestore(&ep->lock, flags);
timeout = schedule_timeout(timeout);
write_lock_irqsave(&ep->lock, flags);
}
remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
/*
 * If we've been woken up because events became available, we need to:
 *
 * 1) null the number of available ready file descriptors
 * 2) increment the version of the current ( next ) snapshot
 * 3) swap the double buffer to return the current one to the caller
 * 4) set the current ( for the user, previous for the interface ) offset
 */
if (!res && ep->eventcnt) {
res = ep->eventcnt;
ep->eventcnt = 0;
++ep->ver;
if (ep->pages == ep->pages0) {
ep->pages = ep->pages1;
dvp->ep_resoff = 0;
} else {
ep->pages = ep->pages0;
dvp->ep_resoff = ep->numpages * PAGE_SIZE;
}
}
write_unlock_irqrestore(&ep->lock, flags);
return res;
}
/*
 * Grows both sides of the event double buffer to "numpages" pages each.
 *
 * The new pages are allocated into a temporary table without holding
 * "ep->lock"; under the lock, the ones that are still needed are appended
 * to "pages0" / "pages1" and "numpages" is updated. Pages that turn out to
 * be in excess ( because the buffer grew in the meantime ) are freed.
 *
 * Returns 0 on success, -EBUSY if the buffer is currently mapped, -EINVAL
 * if "numpages" is negative or greater than MAX_EVENTPOLL_PAGES, or
 * -ENOMEM if an allocation fails.
 */
static int ep_do_alloc_pages(struct eventpoll *ep, int numpages)
{
	int res, pgalloc, pgcpy;
	unsigned long flags;
	char **pages, **pages0, **pages1;

	if (atomic_read(&ep->mmapped))
		return -EBUSY;

	/*
	 * The page count is derived from user supplied values. A negative
	 * one must not reach the size computation of the vmalloc() below.
	 */
	if (numpages < 0 || numpages > MAX_EVENTPOLL_PAGES)
		return -EINVAL;

	/*
	 * NOTE(review): a request smaller than the current allocation makes
	 * "pgalloc" negative and relies on vmalloc() rejecting the resulting
	 * size; that behaviour is left unchanged here.
	 */
	pgalloc = numpages - ep->numpages;
	if ((pages = (char **) vmalloc(2 * (pgalloc + 1) * sizeof(char *))) == NULL)
		return -ENOMEM;

	/* The temporary table holds the two sides one after the other */
	pages0 = &pages[0];
	pages1 = &pages[pgalloc + 1];

	if ((res = ep_alloc_pages(pages0, pgalloc)))
		goto eexit_1;

	if ((res = ep_alloc_pages(pages1, pgalloc))) {
		ep_free_pages(pages0, pgalloc);
		goto eexit_1;
	}

	write_lock_irqsave(&ep->lock, flags);
	/* "ep->numpages" may have changed while the lock was not held */
	pgcpy = (ep->numpages + pgalloc) > numpages ? numpages - ep->numpages: pgalloc;
	if (pgcpy > 0) {
		memcpy(&ep->pages0[ep->numpages], pages0, pgcpy * sizeof(char *));
		memcpy(&ep->pages1[ep->numpages], pages1, pgcpy * sizeof(char *));
		ep->numpages += pgcpy;
	}
	write_unlock_irqrestore(&ep->lock, flags);

	/* Give back the pages that were not attached to the buffer */
	if (pgcpy < pgalloc) {
		if (pgcpy < 0)
			pgcpy = 0;
		ep_free_pages(&pages0[pgcpy], pgalloc - pgcpy);
		ep_free_pages(&pages1[pgcpy], pgalloc - pgcpy);
	}

eexit_1:
	vfree(pages);
	return res;
}
static int ioctl_eventpoll(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg)
{
int res;
struct eventpoll *ep = file->private_data;
struct epitem *dpi;
unsigned long flags;
struct pollfd pfd;
struct evpoll dvp;
switch (cmd) {
case EP_ALLOC:
res = ep_do_alloc_pages(ep, EP_FDS_PAGES(arg));
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ALLOC, %lu) == %d\n",
current, ep, arg, res));
return res;
case EP_FREE:
if (atomic_read(&ep->mmapped))
return -EBUSY;
res = -EINVAL;
write_lock_irqsave(&ep->lock, flags);
if (ep->numpages > 0) {
ep_free_pages(ep->pages0, ep->numpages);
ep_free_pages(ep->pages1, ep->numpages);
ep->numpages = 0;
ep->pages = ep->pages0;
res = 0;
}
write_unlock_irqrestore(&ep->lock, flags);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_FREE) == %d\n",
current, ep, res));
return res;
case EP_POLL:
if (copy_from_user(&dvp, (void *) arg, sizeof(struct evpoll)))
return -EFAULT;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d)\n",
current, ep, dvp.ep_timeout));
res = ep_poll(ep, &dvp);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d) == %d\n",
current, ep, dvp.ep_timeout, res));
if (res > 0 && copy_to_user((void *) arg, &dvp, sizeof(struct evpoll)))
res = -EFAULT;
return res;
case EP_ISPOLLED:
if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd)))
return 0;
read_lock_irqsave(&ep->lock, flags);
res = 0;
if (!(dpi = ep_find_nl(ep, pfd.fd)))
goto is_not_polled;
pfd = dpi->pfd;
res = 1;
is_not_polled:
read_unlock_irqrestore(&ep->lock, flags);
if (res)
copy_to_user((void *) arg, &pfd, sizeof(struct pollfd));
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n",
current, ep, pfd.fd, res));
return res;
}
return -EINVAL;
}
static void eventpoll_mm_open(struct vm_area_struct * vma)
{
struct file *file = vma->vm_file;
struct eventpoll *ep = file->private_data;
if (ep) atomic_inc(&ep->mmapped);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_open(%p)\n", current, ep));
}
static void eventpoll_mm_close(struct vm_area_struct * vma)
{
struct file *file = vma->vm_file;
struct eventpoll *ep = file->private_data;
if (ep && atomic_dec_and_test(&ep->mmapped))
ep->vmabase = 0;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_close(%p)\n", current, ep));
}
/*
 * "mmap" file operation. Maps the whole double buffer into the given vma:
 * the pages of "pages0" first, immediately followed by those of "pages1".
 *
 * Returns 0 on success, -EACCES for a writable mapping, -EINVAL for a
 * non-zero offset or a size different from the double buffer size, or the
 * error reported by remap_page_range().
 */
static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma)
{
	struct eventpoll *ep = file->private_data;
	char **sides[2];
	unsigned long uaddr;
	int side, pg, res, npages;
	size_t maplen;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx)\n",
		     current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT));

	/*
	 * The eventpoll file has to be RW, but its pages must never be
	 * mapped writable.
	 */
	if (vma->vm_flags & VM_WRITE)
		return -EACCES;

	if ((vma->vm_pgoff << PAGE_SHIFT) != 0)
		return -EINVAL;

	/* The mapped area must cover exactly both sides of the buffer */
	maplen = PAGE_ALIGN(vma->vm_end - vma->vm_start);
	npages = maplen >> PAGE_SHIFT;
	res = -EINVAL;
	if (npages != (2 * ep->numpages))
		goto done;

	/* Map side 0 and then side 1, one page at a time */
	sides[0] = ep->pages0;
	sides[1] = ep->pages1;
	uaddr = vma->vm_start;
	for (side = 0; side < 2; side++) {
		for (pg = 0; pg < ep->numpages; pg++) {
			res = remap_page_range(vma, uaddr, __pa(sides[side][pg]),
					       PAGE_SIZE, vma->vm_page_prot);
			if (res)
				goto done;
			uaddr += PAGE_SIZE;
		}
	}

	vma->vm_ops = &eventpoll_mmap_ops;

	/* Saves the base mapping address for later use in sys_epoll_wait(2) */
	ep->vmabase = vma->vm_start;

	/*
	 * The mapping is in place: open the door to the functions that
	 * require it.
	 */
	atomic_set(&ep->mmapped, 1);

	res = 0;
done:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx) == %d\n",
		     current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res));
	return res;
}
/*
 * Dentry callback of the eventpoll pseudo filesystem. The dentry argument
 * is not inspected; the answer is unconditionally 1.
 * NOTE(review): presumably installed as the "d_delete" dentry operation so
 * that eventpoll dentries are never kept cached - confirm against the
 * dentry_operations table defined elsewhere in this file.
 */
static int eventpollfs_delete_dentry(struct dentry *dentry)
{
	return 1;
}
/*
 * Allocates and sets up a new inode on the eventpoll pseudo filesystem.
 * Returns the inode, or ERR_PTR(-ENOMEM) when no inode can be allocated.
 */
static struct inode *get_eventpoll_inode(void)
{
	struct inode *inode;

	inode = new_inode(eventpoll_mnt->mnt_sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_fop = &eventpoll_fops;

	/*
	 * Start out with the inode already flagged dirty: this way
	 * "mark_inode_dirty()" believes it is on the dirty list already
	 * and never moves it there.
	 */
	inode->i_state = I_DIRTY;

	/* Owned by, and read/write for, the calling task only */
	inode->i_mode = S_IRUSR | S_IWUSR;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	inode->i_blksize = PAGE_SIZE;

	return inode;
}
/*
 * Superblock constructor of the eventpoll pseudo filesystem. The "flags",
 * "dev_name" and "data" arguments are ignored: the superblock is obtained
 * from get_sb_pseudo() using the "eventpoll:" name and EVENTPOLLFS_MAGIC.
 */
static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
					      int flags, char *dev_name, void *data)
{
	struct super_block *sb;

	sb = get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
	return sb;
}
/*
 * Module initialization. Creates the "struct epitem" slab cache, registers
 * and mounts the eventpoll pseudo filesystem, and registers the misc
 * device. Every step that fails undoes the previous ones, in reverse
 * order, and its error code is returned. Returns 0 on success.
 */
static int __init eventpoll_init(void)
{
	int err;

	/* Slab cache used to allocate "struct epitem" items */
	dpi_cache = kmem_cache_create("eventpoll",
				      sizeof(struct epitem),
				      __alignof__(struct epitem),
				      DPI_SLAB_DEBUG, NULL, NULL);
	if (!dpi_cache)
		return -ENOMEM;

	/*
	 * The virtual file system registered here is the source of inodes
	 * for the eventpoll files.
	 */
	err = register_filesystem(&eventpoll_fs_type);
	if (err)
		goto fail_cache;

	/* Mount the virtual file system registered above */
	eventpoll_mnt = kern_mount(&eventpoll_fs_type);
	if (IS_ERR(eventpoll_mnt)) {
		err = PTR_ERR(eventpoll_mnt);
		goto fail_fs;
	}

	/*
	 * Compatibility with the old /dev/epoll interface: a misc device
	 * is registered so that the caller can open(2) it through a file
	 * inside /dev.
	 */
	err = misc_register(&eventpoll_miscdev);
	if (err)
		goto fail_mnt;

	printk(KERN_INFO "[%p] eventpoll: driver installed.\n", current);
	return 0;

fail_mnt:
	mntput(eventpoll_mnt);
fail_fs:
	unregister_filesystem(&eventpoll_fs_type);
fail_cache:
	kmem_cache_destroy(dpi_cache);
	return err;
}
/*
 * Module cleanup. Undoes everything eventpoll_init() set up, in the exact
 * reverse order of its setup steps (the same order used by the error
 * unwinding in eventpoll_init()): the misc device goes first so the
 * user-visible entry point disappears before anything it relies on, then
 * the internal mount is dropped, the filesystem type is unregistered and
 * finally the "struct epitem" slab cache is destroyed.
 */
static void __exit eventpoll_exit(void)
{
	misc_deregister(&eventpoll_miscdev);
	mntput(eventpoll_mnt);
	unregister_filesystem(&eventpoll_fs_type);
	kmem_cache_destroy(dpi_cache);
}
/* Hook eventpoll_init() and eventpoll_exit() up as the module entry and exit points */
module_init(eventpoll_init);
module_exit(eventpoll_exit);
MODULE_LICENSE("GPL");
......@@ -6,14 +6,14 @@
#
export-objs := open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \
fcntl.o read_write.o dcookies.o
fcntl.o read_write.o dcookies.o fcblist.o
obj-y := open.o read_write.o devices.o file_table.o buffer.o \
bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
fs-writeback.o mpage.o direct-io.o aio.o
fs-writeback.o mpage.o direct-io.o aio.o fcblist.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
......
/*
* linux/fs/fcblist.c ( File event callbacks handling )
* Copyright (C) 2001,...,2002 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Davide Libenzi <davidel@xmailserver.org>
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/poll.h>
#include <asm/bitops.h>
#include <linux/fcblist.h>
/*
 * Translation table from a SIGPOLL band code to the matching ION_* file
 * notification event. Users index it with "band - POLL_IN" (see
 * sk_wake_async()). POLL_PRI has no ION_* counterpart and maps to 0.
 */
long ion_band_table[NSIGPOLL] = {
ION_IN, /* POLL_IN */
ION_OUT, /* POLL_OUT */
ION_IN, /* POLL_MSG */
ION_ERR, /* POLL_ERR */
0, /* POLL_PRI */
ION_HUP /* POLL_HUP */
};
/*
 * Translation table from a SIGPOLL band code to the poll(2) event mask
 * handed to the file callbacks. Users index it with "band - POLL_IN"
 * (see sk_wake_async()), in parallel with ion_band_table[].
 */
long poll_band_table[NSIGPOLL] = {
POLLIN | POLLRDNORM, /* POLL_IN */
POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */
POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */
POLLERR, /* POLL_ERR */
POLLPRI | POLLRDBAND, /* POLL_PRI */
POLLHUP | POLLERR /* POLL_HUP */
};
/*
 * Delivers "event" to every callback registered on the "filep" file, in
 * list order. Each callback receives the file, its registration data, its
 * private "local" scratch area and the event array. The whole walk runs
 * with the callback list lock read-held and local IRQs disabled, so the
 * callbacks are invoked in that context.
 */
void file_notify_event(struct file *filep, long *event)
{
	unsigned long flags;
	struct list_head *pos;
	struct fcb_struct *fcbp;

	read_lock_irqsave(&filep->f_cblock, flags);
	list_for_each(pos, &filep->f_cblist) {
		fcbp = list_entry(pos, struct fcb_struct, llink);
		fcbp->cbproc(filep, fcbp->data, fcbp->local, event);
	}
	read_unlock_irqrestore(&filep->f_cblock, flags);
}
/*
 * Registers "cbproc" (with its "data" cookie) at the tail of the callback
 * list of "filep". The callback descriptor is zero-filled before use, so
 * its "local" area starts out cleared. No check for an already registered
 * callback is made.
 *
 * Returns 0 on success, -ENOMEM if the descriptor cannot be allocated.
 */
int file_notify_addcb(struct file *filep,
		      void (*cbproc)(struct file *, void *, unsigned long *, long *),
		      void *data)
{
	unsigned long flags;
	struct fcb_struct *fcbp;

	fcbp = kmalloc(sizeof(*fcbp), GFP_KERNEL);
	if (!fcbp)
		return -ENOMEM;

	memset(fcbp, 0, sizeof(*fcbp));
	fcbp->data = data;
	fcbp->cbproc = cbproc;

	write_lock_irqsave(&filep->f_cblock, flags);
	list_add_tail(&fcbp->llink, &filep->f_cblist);
	write_unlock_irqrestore(&filep->f_cblock, flags);

	return 0;
}
/*
* Removes the callback "cbproc" from the file callback list.
*/
int file_notify_delcb(struct file *filep,
void (*cbproc)(struct file *, void *, unsigned long *, long *))
{
unsigned long flags;
struct list_head *lnk, *lsthead;
write_lock_irqsave(&filep->f_cblock, flags);
lsthead = &filep->f_cblist;
list_for_each(lnk, lsthead) {
struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink);
if (fcbp->cbproc == cbproc) {
list_del(lnk);
write_unlock_irqrestore(&filep->f_cblock, flags);
kfree(fcbp);
return 0;
}
}
write_unlock_irqrestore(&filep->f_cblock, flags);
return -ENOENT;
}
/*
* It is called at file cleanup time and removes all the registered callbacks.
*/
void file_notify_cleanup(struct file *filep)
{
unsigned long flags;
struct list_head *lsthead;
write_lock_irqsave(&filep->f_cblock, flags);
lsthead = &filep->f_cblist;
while (!list_empty(lsthead)) {
struct fcb_struct *fcbp = list_entry(lsthead->next, struct fcb_struct, llink);
list_del(lsthead->next);
write_unlock_irqrestore(&filep->f_cblock, flags);
kfree(fcbp);
write_lock_irqsave(&filep->f_cblock, flags);
}
write_unlock_irqrestore(&filep->f_cblock, flags);
}
......@@ -8,6 +8,7 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fcblist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
......@@ -58,6 +59,7 @@ struct file * get_empty_filp(void)
f->f_gid = current->fsgid;
f->f_owner.lock = RW_LOCK_UNLOCKED;
list_add(&f->f_list, &anon_list);
file_notify_init(f);
file_list_unlock();
return f;
}
......@@ -102,6 +104,7 @@ int init_private_file(struct file *filp, struct dentry *dentry, int mode)
filp->f_uid = current->fsuid;
filp->f_gid = current->fsgid;
filp->f_op = dentry->d_inode->i_fop;
file_notify_init(filp);
if (filp->f_op->open)
return filp->f_op->open(dentry->d_inode, filp);
else
......@@ -123,6 +126,7 @@ void __fput(struct file * file)
struct vfsmount * mnt = file->f_vfsmnt;
struct inode * inode = dentry->d_inode;
file_notify_cleanup(file);
locks_remove_flock(file);
if (file->f_op && file->f_op->release)
......
......@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/fcblist.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
......@@ -47,7 +48,7 @@ static ssize_t
pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
struct inode *inode = filp->f_dentry->d_inode;
int do_wakeup;
int do_wakeup, pfull;
ssize_t ret;
/* pread is not allowed on pipes. */
......@@ -63,6 +64,7 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
down(PIPE_SEM(*inode));
for (;;) {
int size = PIPE_LEN(*inode);
pfull = PIPE_FULL(*inode);
if (size) {
char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
ssize_t chars = PIPE_MAX_RCHUNK(*inode);
......@@ -108,12 +110,18 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
if (!ret) ret = -ERESTARTSYS;
break;
}
/* Send notification message */
if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode))
file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND);
if (do_wakeup) {
wake_up_interruptible(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
}
pipe_wait(inode);
}
/* Send notification message */
if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode))
file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND);
up(PIPE_SEM(*inode));
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
......@@ -131,7 +139,7 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
struct inode *inode = filp->f_dentry->d_inode;
ssize_t ret;
size_t min;
int do_wakeup;
int do_wakeup, pempty;
/* pwrite is not allowed on pipes. */
if (unlikely(ppos != &filp->f_pos))
......@@ -149,6 +157,7 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
down(PIPE_SEM(*inode));
for (;;) {
int free;
pempty = PIPE_EMPTY(*inode);
if (!PIPE_READERS(*inode)) {
send_sig(SIGPIPE, current, 0);
if (!ret) ret = -EPIPE;
......@@ -194,6 +203,9 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
if (!ret) ret = -ERESTARTSYS;
break;
}
/* Send notification message */
if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode))
file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM);
if (do_wakeup) {
wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
......@@ -203,6 +215,9 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
pipe_wait(inode);
PIPE_WAITING_WRITERS(*inode)--;
}
/* Send notification message */
if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode))
file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM);
up(PIPE_SEM(*inode));
if (do_wakeup) {
wake_up_interruptible(PIPE_WAIT(*inode));
......@@ -266,9 +281,22 @@ pipe_poll(struct file *filp, poll_table *wait)
static int
pipe_release(struct inode *inode, int decr, int decw)
{
struct file *rdfile, *wrfile;
down(PIPE_SEM(*inode));
PIPE_READERS(*inode) -= decr;
PIPE_WRITERS(*inode) -= decw;
rdfile = PIPE_READFILE(*inode);
wrfile = PIPE_WRITEFILE(*inode);
if (decr && !PIPE_READERS(*inode)) {
PIPE_READFILE(*inode) = NULL;
if (wrfile)
file_send_notify(wrfile, ION_HUP, POLLHUP);
}
if (decw && !PIPE_WRITERS(*inode)) {
PIPE_WRITEFILE(*inode) = NULL;
if (rdfile)
file_send_notify(rdfile, ION_HUP, POLLHUP);
}
if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
struct pipe_inode_info *info = inode->i_pipe;
inode->i_pipe = NULL;
......@@ -488,6 +516,7 @@ struct inode* pipe_new(struct inode* inode)
PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
PIPE_WAITING_WRITERS(*inode) = 0;
PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL;
*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
return inode;
......@@ -596,6 +625,9 @@ int do_pipe(int *fd)
f2->f_mode = 2;
f2->f_version = 0;
PIPE_READFILE(*inode) = f1;
PIPE_WRITEFILE(*inode) = f2;
fd_install(i, f1);
fd_install(j, f2);
fd[0] = i;
......
......@@ -15,6 +15,7 @@
#define POLLWRNORM 0x0100
#define POLLWRBAND 0x0200
#define POLLMSG 0x0400
#define POLLREMOVE 0x1000
struct pollfd {
int fd;
......
......@@ -258,6 +258,9 @@
#define __NR_free_hugepages 251
#define __NR_exit_group 252
#define __NR_lookup_dcookie 253
#define __NR_sys_epoll_create 254
#define __NR_sys_epoll_ctl 255
#define __NR_sys_epoll_wait 256
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
......
/*
* include/linux/eventpoll.h ( Efficient event polling implementation )
* Copyright (C) 2001,...,2002 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Davide Libenzi <davidel@xmailserver.org>
*
*/
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H
/* Minor number of the eventpoll misc device (presumably /dev/epoll - confirm against eventpoll_miscdev) */
#define EVENTPOLL_MINOR 124
/* Number of "struct pollfd" entries that fit in one page */
#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd))
/* Upper bound on the number of fds, and the number of pages needed to hold that many entries */
#define MAX_FDS_IN_EVENTPOLL (1024 * 128)
#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE)
/* For the n-th pollfd entry: index of the page holding it, slot inside that page, byte offset inside that page */
#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE)
#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE)
#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd))
/* Pages needed to store n pollfd entries (rounded up) */
#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE)
/* Size in bytes of two buffers of EP_FDS_PAGES(n) pages each */
#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2)
/*
 * Argument of the EP_POLL ioctl.
 * NOTE(review): field meaning is not visible from this header; by name,
 * "ep_timeout" looks like the wait timeout and "ep_resoff" like an offset
 * of the results - confirm against the ioctl implementation.
 */
struct evpoll {
int ep_timeout;
unsigned long ep_resoff;
};
/* ioctl commands, all in the 'P' ioctl group */
#define EP_ALLOC _IOR('P', 1, int)
#define EP_POLL _IOWR('P', 2, struct evpoll)
#define EP_FREE _IO('P', 3)
#define EP_ISPOLLED _IOWR('P', 4, struct pollfd)
/* Operation codes; presumably the valid values of the "op" argument of sys_epoll_ctl() - confirm */
#define EP_CTL_ADD 1
#define EP_CTL_DEL 2
#define EP_CTL_MOD 3
/* System call entry points of the epoll interface */
asmlinkage int sys_epoll_create(int maxfds);
asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events);
asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout);
#endif
/*
* include/linux/fcblist.h ( File event callbacks handling )
* Copyright (C) 2001,...,2002 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Davide Libenzi <davidel@xmailserver.org>
*
*/
#ifndef __LINUX_FCBLIST_H
#define __LINUX_FCBLIST_H
#include <linux/config.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/file.h>
/* file callback notification events */
#define ION_IN 1
#define ION_OUT 2
#define ION_HUP 3
#define ION_ERR 4
#define FCB_LOCAL_SIZE 4
/*
 * One callback registered on a file. Instances are linked on the file's
 * "f_cblist" list.
 */
struct fcb_struct {
/* link into the file's f_cblist */
struct list_head llink;
/* callback, invoked as cbproc(file, data, local, event) */
void (*cbproc)(struct file *, void *, unsigned long *, long *);
/* opaque pointer supplied at registration, passed back to cbproc */
void *data;
/* per-callback storage handed to cbproc on every call */
unsigned long local[FCB_LOCAL_SIZE];
};
extern long ion_band_table[];
extern long poll_band_table[];
void file_notify_event(struct file *filep, long *event);
int file_notify_addcb(struct file *filep,
void (*cbproc)(struct file *, void *, unsigned long *, long *),
void *data);
int file_notify_delcb(struct file *filep,
void (*cbproc)(struct file *, void *, unsigned long *, long *));
void file_notify_cleanup(struct file *filep);
/*
 * Prepares the callback machinery of a file: starts with an empty
 * callback list and an initialized list lock.
 */
static inline void file_notify_init(struct file *filep)
{
	INIT_LIST_HEAD(&filep->f_cblist);
	rwlock_init(&filep->f_cblock);
}
/*
 * Convenience wrapper around file_notify_event(): packs the notification
 * event "ioevt" and the poll mask "plevt" into a three-element array
 * terminated by -1 and delivers it to the callbacks of "filep".
 */
static inline void file_send_notify(struct file *filep, long ioevt, long plevt)
{
	long event[3];

	event[0] = ioevt;
	event[1] = plevt;
	event[2] = -1;

	file_notify_event(filep, event);
}
#endif
......@@ -506,6 +506,10 @@ struct file {
/* needed for tty driver, and maybe others */
void *private_data;
/* file callback list */
rwlock_t f_cblock;
struct list_head f_cblist;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
......
......@@ -12,6 +12,8 @@ struct pipe_inode_info {
unsigned int waiting_writers;
unsigned int r_counter;
unsigned int w_counter;
struct file *rdfile;
struct file *wrfile;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
};
......@@ -30,6 +32,8 @@ struct pipe_inode_info {
#define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers)
#define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter)
#define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter)
#define PIPE_READFILE(inode) ((inode).i_pipe->rdfile)
#define PIPE_WRITEFILE(inode) ((inode).i_pipe->wrfile)
#define PIPE_FASYNC_READERS(inode) (&((inode).i_pipe->fasync_readers))
#define PIPE_FASYNC_WRITERS(inode) (&((inode).i_pipe->fasync_writers))
......
......@@ -4,7 +4,7 @@
/*
* system call entry points ... but not all are defined
*/
#define NR_syscalls 256
#define NR_syscalls 260
/*
* These are system calls that will be removed at some time
......
......@@ -52,6 +52,9 @@
#include <asm/atomic.h>
#include <net/dst.h>
#include <net/scm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fcblist.h>
/*
* This structure really needs to be cleaned up.
......@@ -766,8 +769,13 @@ static inline unsigned long sock_wspace(struct sock *sk)
static inline void sk_wake_async(struct sock *sk, int how, int band)
{
if (sk->socket && sk->socket->fasync_list)
sock_wake_async(sk->socket, how, band);
if (sk->socket) {
if (sk->socket->file)
file_send_notify(sk->socket->file, ion_band_table[band - POLL_IN],
poll_band_table[band - POLL_IN]);
if (sk->socket->fasync_list)
sock_wake_async(sk->socket, how, band);
}
}
#define SOCK_MIN_SNDBUF 2048
......
......@@ -476,8 +476,8 @@ void tcp_write_space(struct sock *sk)
if (sk->sleep && waitqueue_active(sk->sleep))
wake_up_interruptible(sk->sleep);
if (sock->fasync_list && !(sk->shutdown & SEND_SHUTDOWN))
sock_wake_async(sock, 2, POLL_OUT);
if (!(sk->shutdown & SEND_SHUTDOWN))
sk_wake_async(sk, 2, POLL_OUT);
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment