Commit 7ef9964e authored by Davide Libenzi's avatar Davide Libenzi Committed by Linus Torvalds

epoll: introduce resource usage limits

It has been thought that the per-user file descriptors limit would also
limit the resources that a normal user can request via the epoll
interface.  Vegard Nossum reported a very simple program (a modified
version attached) that lets a normal user request a pretty large
amount of kernel memory while staying well within its maximum number of
fds.  To solve this problem, default limits are now imposed, and /proc based
configuration has been introduced.  A new directory has been created,
named /proc/sys/fs/epoll/ and inside there, there are two configuration
points:

  max_user_instances = Maximum number of devices - per user

  max_user_watches   = Maximum number of "watched" fds - per user

The current default for "max_user_watches" limits the memory used by epoll
to store "watches", to 1/32 of the amount of the low RAM.  As example, a
256MB 32bit machine, will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users.  The
default value for "max_user_instances" is set to 128, that should be
enough too.

This also changes the userspace, because a new error code can now come out
from EPOLL_CTL_ADD (-ENOSPC).  The EMFILE from epoll_create() was already
listed, so that should be ok.

[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent b7d271df
...@@ -44,6 +44,7 @@ Table of Contents ...@@ -44,6 +44,7 @@ Table of Contents
2.14 /proc/<pid>/io - Display the IO accounting fields 2.14 /proc/<pid>/io - Display the IO accounting fields
2.15 /proc/<pid>/coredump_filter - Core dump filtering settings 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
2.16 /proc/<pid>/mountinfo - Information about mounts 2.16 /proc/<pid>/mountinfo - Information about mounts
2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
Preface Preface
...@@ -2483,4 +2484,30 @@ For more information on mount propagation see: ...@@ -2483,4 +2484,30 @@ For more information on mount propagation see:
Documentation/filesystems/sharedsubtree.txt Documentation/filesystems/sharedsubtree.txt
2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
--------------------------------------------------------
This directory contains configuration options for the epoll(7) interface.
max_user_instances
------------------
This is the maximum number of epoll file descriptors that a single user can
have open at a given time. The default value is 128, and should be enough
for normal users.
max_user_watches
----------------
Every epoll file descriptor can store a number of files to be monitored
for event readiness. Each one of these monitored files constitutes a "watch".
This configuration option sets the maximum number of "watches" that are
allowed for each user.
Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
on a 64bit one.
The current default value for max_user_watches is 1/32 of the available
low memory, divided by the "watch" cost in bytes.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
...@@ -102,6 +102,8 @@ ...@@ -102,6 +102,8 @@
#define EP_UNACTIVE_PTR ((void *) -1L) #define EP_UNACTIVE_PTR ((void *) -1L)
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
struct epoll_filefd { struct epoll_filefd {
struct file *file; struct file *file;
int fd; int fd;
...@@ -200,6 +202,9 @@ struct eventpoll { ...@@ -200,6 +202,9 @@ struct eventpoll {
* holding ->lock. * holding ->lock.
*/ */
struct epitem *ovflist; struct epitem *ovflist;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
}; };
/* Wait structure used by the poll hooks */ /* Wait structure used by the poll hooks */
...@@ -226,10 +231,18 @@ struct ep_pqueue { ...@@ -226,10 +231,18 @@ struct ep_pqueue {
struct epitem *epi; struct epitem *epi;
}; };
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
/* Maximum number of epoll devices, per user */
static int max_user_instances __read_mostly;
/* Maximum number of epoll watched descriptors, per user */
static int max_user_watches __read_mostly;
/* /*
* This mutex is used to serialize ep_free() and eventpoll_release_file(). * This mutex is used to serialize ep_free() and eventpoll_release_file().
*/ */
static struct mutex epmutex; static DEFINE_MUTEX(epmutex);
/* Safe wake up implementation */ /* Safe wake up implementation */
static struct poll_safewake psw; static struct poll_safewake psw;
...@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly; ...@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */ /* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly; static struct kmem_cache *pwq_cache __read_mostly;
/*
 * sysctl entries exposing the two per-user epoll limits. The fs_table entry
 * named "epoll" uses this table as its child; per the commit description the
 * resulting files live under /proc/sys/fs/epoll/.
 */
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
/*
 * Referenced through .extra1 by both entries below.
 * NOTE(review): presumably the lower bound applied by proc_dointvec_minmax;
 * no .extra2 is set, so no upper bound appears to be configured - confirm.
 */
static int zero;
ctl_table epoll_table[] = {
/* Backed by max_user_instances: maximum number of epoll devices, per user. */
{
.procname = "max_user_instances",
.data = &max_user_instances,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = &zero,
},
/* Backed by max_user_watches: maximum number of watched descriptors, per user. */
{
.procname = "max_user_watches",
.data = &max_user_watches,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = &zero,
},
/* Last entry, with ctl_name 0 - presumably the table terminator; confirm. */
{ .ctl_name = 0 }
};
#endif /* CONFIG_SYSCTL */
/* Setup the structure that is used as key for the RB tree */ /* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd, static inline void ep_set_ffd(struct epoll_filefd *ffd,
...@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) ...@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
/* At this point it is safe to free the eventpoll item */ /* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi); kmem_cache_free(epi_cache, epi);
atomic_dec(&ep->user->epoll_watches);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
current, ep, file)); current, ep, file));
...@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep) ...@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep)
mutex_unlock(&epmutex); mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx); mutex_destroy(&ep->mtx);
atomic_dec(&ep->user->epoll_devs);
free_uid(ep->user);
kfree(ep); kfree(ep);
} }
...@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file) ...@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file)
static int ep_alloc(struct eventpoll **pep) static int ep_alloc(struct eventpoll **pep)
{ {
struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); int error;
struct user_struct *user;
struct eventpoll *ep;
if (!ep) user = get_current_user();
return -ENOMEM; error = -EMFILE;
if (unlikely(atomic_read(&user->epoll_devs) >=
max_user_instances))
goto free_uid;
error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
if (unlikely(!ep))
goto free_uid;
spin_lock_init(&ep->lock); spin_lock_init(&ep->lock);
mutex_init(&ep->mtx); mutex_init(&ep->mtx);
...@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep) ...@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep)
INIT_LIST_HEAD(&ep->rdllist); INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT; ep->rbr = RB_ROOT;
ep->ovflist = EP_UNACTIVE_PTR; ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
*pep = ep; *pep = ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
current, ep)); current, ep));
return 0; return 0;
free_uid:
free_uid(user);
return error;
} }
/* /*
...@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ...@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct epitem *epi; struct epitem *epi;
struct ep_pqueue epq; struct ep_pqueue epq;
error = -ENOMEM; if (unlikely(atomic_read(&ep->user->epoll_watches) >=
max_user_watches))
return -ENOSPC;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
goto error_return; return -ENOMEM;
/* Item initialization follow here ... */ /* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->rdllink);
...@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ...@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* install process. Namely an allocation for a wait queue failed due * install process. Namely an allocation for a wait queue failed due
* high memory pressure. * high memory pressure.
*/ */
error = -ENOMEM;
if (epi->nwait < 0) if (epi->nwait < 0)
goto error_unregister; goto error_unregister;
...@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ...@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
spin_unlock_irqrestore(&ep->lock, flags); spin_unlock_irqrestore(&ep->lock, flags);
atomic_inc(&ep->user->epoll_watches);
/* We have to call this outside the lock */ /* We have to call this outside the lock */
if (pwake) if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait); ep_poll_safewake(&psw, &ep->poll_wait);
...@@ -789,7 +852,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ...@@ -789,7 +852,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
spin_unlock_irqrestore(&ep->lock, flags); spin_unlock_irqrestore(&ep->lock, flags);
kmem_cache_free(epi_cache, epi); kmem_cache_free(epi_cache, epi);
error_return:
return error; return error;
} }
...@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags) ...@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags)
flags & O_CLOEXEC); flags & O_CLOEXEC);
if (fd < 0) if (fd < 0)
ep_free(ep); ep_free(ep);
atomic_inc(&ep->user->epoll_devs);
error_return: error_return:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
...@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, ...@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
static int __init eventpoll_init(void) static int __init eventpoll_init(void)
{ {
mutex_init(&epmutex); struct sysinfo si;
si_meminfo(&si);
max_user_instances = 128;
max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
EP_ITEM_COST;
/* Initialize the structure used to perform safe poll wait head wake ups */ /* Initialize the structure used to perform safe poll wait head wake ups */
ep_poll_safewake_init(&psw); ep_poll_safewake_init(&psw);
......
...@@ -630,6 +630,10 @@ struct user_struct { ...@@ -630,6 +630,10 @@ struct user_struct {
atomic_t inotify_watches; /* How many inotify watches does this user have? */ atomic_t inotify_watches; /* How many inotify watches does this user have? */
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
#endif #endif
#ifdef CONFIG_EPOLL
atomic_t epoll_devs; /* The number of epoll descriptors currently open */
atomic_t epoll_watches; /* The number of file descriptors currently watched */
#endif
#ifdef CONFIG_POSIX_MQUEUE #ifdef CONFIG_POSIX_MQUEUE
/* protected by mq_lock */ /* protected by mq_lock */
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
......
...@@ -176,6 +176,9 @@ extern struct ctl_table random_table[]; ...@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
#ifdef CONFIG_INOTIFY_USER #ifdef CONFIG_INOTIFY_USER
extern struct ctl_table inotify_table[]; extern struct ctl_table inotify_table[];
#endif #endif
#ifdef CONFIG_EPOLL
extern struct ctl_table epoll_table[];
#endif
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout; int sysctl_legacy_va_layout;
...@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = { ...@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = {
.child = inotify_table, .child = inotify_table,
}, },
#endif #endif
#ifdef CONFIG_EPOLL
{
.procname = "epoll",
.mode = 0555,
.child = epoll_table,
},
#endif
#endif #endif
{ {
.ctl_name = KERN_SETUID_DUMPABLE, .ctl_name = KERN_SETUID_DUMPABLE,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment