Commit cc31edce authored by Paul Menage, committed by Linus Torvalds

cgroups: convert tasks file to use a seq_file with shared pid array

Rather than pre-generating the entire text for the "tasks" file each
time the file is opened, we instead just generate/update the array of
process ids and use a seq_file to report these to userspace.  All open
file handles on the same "tasks" file can share a pid array, which may
be updated any time that no thread is actively reading the array.  By
sharing the array, the potential for userspace to DoS the system by
opening many handles on the same "tasks" file is removed.

[Based on a patch by Lai Jiangshan, extended to use seq_file]
Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 146aa1bd
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <linux/rcupdate.h> #include <linux/rcupdate.h>
#include <linux/cgroupstats.h> #include <linux/cgroupstats.h>
#include <linux/prio_heap.h> #include <linux/prio_heap.h>
#include <linux/rwsem.h>
#ifdef CONFIG_CGROUPS #ifdef CONFIG_CGROUPS
...@@ -136,6 +137,15 @@ struct cgroup { ...@@ -136,6 +137,15 @@ struct cgroup {
* release_list_lock * release_list_lock
*/ */
struct list_head release_list; struct list_head release_list;
/* pids_mutex protects the fields below */
struct rw_semaphore pids_mutex;
/* Array of process ids in the cgroup */
pid_t *tasks_pids;
/* How many files are using the current tasks_pids array */
int pids_use_count;
/* Length of the current tasks_pids array */
int pids_length;
}; };
/* A css_set is a structure holding pointers to a set of /* A css_set is a structure holding pointers to a set of
......
...@@ -868,6 +868,14 @@ static struct super_operations cgroup_ops = { ...@@ -868,6 +868,14 @@ static struct super_operations cgroup_ops = {
.remount_fs = cgroup_remount, .remount_fs = cgroup_remount,
}; };
/*
 * Set up the bookkeeping members of a cgroup: the rw-semaphore that
 * guards the pid-array fields and the cgroup's list heads.
 */
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	/* Lock for tasks_pids, pids_use_count and pids_length */
	init_rwsem(&cgrp->pids_mutex);

	/* List heads; the order of initialization does not matter */
	INIT_LIST_HEAD(&cgrp->release_list);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->sibling);
}
static void init_cgroup_root(struct cgroupfs_root *root) static void init_cgroup_root(struct cgroupfs_root *root)
{ {
struct cgroup *cgrp = &root->top_cgroup; struct cgroup *cgrp = &root->top_cgroup;
...@@ -876,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) ...@@ -876,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
root->number_of_cgroups = 1; root->number_of_cgroups = 1;
cgrp->root = root; cgrp->root = root;
cgrp->top_cgroup = cgrp; cgrp->top_cgroup = cgrp;
INIT_LIST_HEAD(&cgrp->sibling); init_cgroup_housekeeping(cgrp);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
} }
static int cgroup_test_super(struct super_block *sb, void *data) static int cgroup_test_super(struct super_block *sb, void *data)
...@@ -1995,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) ...@@ -1995,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
* but we cannot guarantee that the information we produce is correct * but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically. * unless we produce it entirely atomically.
* *
* Upon tasks file open(), a struct ctr_struct is allocated, that
* will have a pointer to an array (also allocated here). The struct
* ctr_struct * is stored in file->private_data. Its resources will
* be freed by release() when the file is closed. The array is used
* to sprintf the PIDs and then used by read().
*/ */
struct ctr_struct {
char *buf;
int bufsz;
};
/* /*
* Load into 'pidarray' up to 'npids' of the tasks using cgroup * Load into 'pidarray' up to 'npids' of the tasks using cgroup
...@@ -2086,42 +2082,132 @@ static int cmppid(const void *a, const void *b) ...@@ -2086,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
return *(pid_t *)a - *(pid_t *)b; return *(pid_t *)a - *(pid_t *)b;
} }
/* /*
* Convert array 'a' of 'npids' pid_t's to a string of newline separated * seq_file methods for the "tasks" file. The seq_file position is the
* decimal pids in 'buf'. Don't write more than 'sz' chars, but return * next pid to display; the seq_file iterator is a pointer to the pid
* count 'cnt' of how many chars would be written if buf were large enough. * in the cgroup->tasks_pids array.
*/ */
static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
/*
 * seq_file ->start method for the "tasks" file.
 *
 * Takes cgrp->pids_mutex for reading (it is dropped again in
 * cgroup_tasks_stop()) and maps the position *pos, which is a pid
 * value rather than an array index, onto an entry of cgrp->tasks_pids.
 *
 * Returns a pointer to the array entry to display and rewrites *pos to
 * that entry's pid, or returns NULL when no entry remains.
 *
 * NOTE(review): some rows below come from a side-by-side diff and also
 * carry text of the removed pid_array_to_buf() ("{ {", "int cnt = 0;").
 */
static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
{ {
int cnt = 0; /*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct cgroup *cgrp = s->private;
/* NOTE(review): *pos is a loff_t and is narrowed to int here */
int index = 0, pid = *pos;
int *iter;
down_read(&cgrp->pids_mutex);
/* A position of 0 needs no search: display starts at index 0 */
if (pid) {
int end = cgrp->pids_length;
/* NOTE(review): 'i' is not referenced anywhere in this function */
int i; int i;
/*
 * Search the half-open range [index, end): stop on an exact match,
 * otherwise move 'index' past smaller entries or pull 'end' down to
 * larger ones. On a miss, 'index' ends at the first entry above pid.
 */
while (index < end) {
int mid = (index + end) / 2;
if (cgrp->tasks_pids[mid] == pid) {
index = mid;
break;
} else if (cgrp->tasks_pids[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
/*
 * NOTE(review): this path returns with pids_mutex still read-held;
 * presumably seq_file invokes ->stop even after a NULL ->start, which
 * is what releases it - confirm against the seq_file core.
 */
if (index >= cgrp->pids_length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = cgrp->tasks_pids + index;
*pos = *iter;
return iter;
}
/*
 * seq_file ->stop method for the "tasks" file: drop the read hold on
 * pids_mutex of the cgroup attached to this seq_file.
 */
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
	struct cgroup *owner = s->private;

	up_read(&owner->pids_mutex);
}
/*
 * seq_file ->next method for the "tasks" file.
 *
 * 'v' points at the tasks_pids entry that was just shown. Step to the
 * following entry; if there is one, store its pid in *pos and return a
 * pointer to it. Past the last entry, return NULL and leave *pos
 * untouched.
 */
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct cgroup *cgrp = s->private;
	int *limit = cgrp->tasks_pids + cgrp->pids_length;
	int *candidate = (int *)v + 1;

	/* Walked off the end of the array: nothing more to show */
	if (candidate >= limit)
		return NULL;

	*pos = *candidate;
	return candidate;
}
/*
 * seq_file ->show method for the "tasks" file: print the pid that the
 * iterator 'v' points at, followed by a newline, and pass back
 * seq_printf()'s result.
 */
static int cgroup_tasks_show(struct seq_file *s, void *v)
{
	const int *pid = v;

	return seq_printf(s, "%d\n", *pid);
}
/*
 * Iterator callbacks for the "tasks" file, listed in the order
 * start -> show -> next -> stop.
 */
static struct seq_operations cgroup_tasks_seq_operations = {
	.start	= cgroup_tasks_start,
	.show	= cgroup_tasks_show,
	.next	= cgroup_tasks_next,
	.stop	= cgroup_tasks_stop,
};
/*
 * Drop one user of the cgroup's shared pid array. Runs entirely under
 * the write side of pids_mutex. When the use count reaches zero the
 * array is freed and the pointer and length are reset. Calling this
 * with a use count of zero is a BUG.
 */
static void release_cgroup_pid_array(struct cgroup *cgrp)
{
	down_write(&cgrp->pids_mutex);

	BUG_ON(!cgrp->pids_use_count);
	cgrp->pids_use_count--;

	/* Last user gone: release the array itself */
	if (cgrp->pids_use_count == 0) {
		kfree(cgrp->tasks_pids);
		cgrp->tasks_pids = NULL;
		cgrp->pids_length = 0;
	}

	up_write(&cgrp->pids_mutex);
}
/*
 * ->release handler for the "tasks" file.
 *
 * A file that was not opened for reading returns 0 immediately.
 * Otherwise this drops the file's use of the cgroup's shared pid array
 * via release_cgroup_pid_array() and returns the result of
 * seq_release().
 *
 * NOTE(review): the rows below come from a side-by-side diff; the
 * "for (i = 0; ...)", "cnt += snprintf(...)" and "return cnt;" text is
 * the removed pid_array_to_buf() body fused onto the same rows.
 */
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
for (i = 0; i < npids; i++) if (!(file->f_mode & FMODE_READ))
cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); return 0;
return cnt;
release_cgroup_pid_array(cgrp);
return seq_release(inode, file);
} }
/*
 * File operations for a "tasks" file opened for reading: reads and
 * seeks use the seq_file helpers, writes use cgroup_file_write(), and
 * close uses cgroup_tasks_release().
 */
static struct file_operations cgroup_tasks_operations = {
	.llseek		= seq_lseek,
	.read		= seq_read,
	.write		= cgroup_file_write,
	.release	= cgroup_tasks_release,
};
/* /*
* Handle an open on 'tasks' file. Prepare a buffer listing the * Handle an open on 'tasks' file. Prepare an array containing the
* process id's of tasks currently attached to the cgroup being opened. * process id's of tasks currently attached to the cgroup being opened.
*
* Does not require any specific cgroup mutexes, and does not take any.
*/ */
static int cgroup_tasks_open(struct inode *unused, struct file *file) static int cgroup_tasks_open(struct inode *unused, struct file *file)
{ {
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
struct ctr_struct *ctr;
pid_t *pidarray; pid_t *pidarray;
int npids; int npids;
char c; int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ)) if (!(file->f_mode & FMODE_READ))
return 0; return 0;
ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
if (!ctr)
goto err0;
/* /*
* If cgroup gets more users after we read count, we won't have * If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the * enough space - tough. This race is indistinguishable to the
...@@ -2129,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) ...@@ -2129,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* show up until sometime later on. * show up until sometime later on.
*/ */
npids = cgroup_task_count(cgrp); npids = cgroup_task_count(cgrp);
if (npids) {
pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
if (!pidarray) if (!pidarray)
goto err1; return -ENOMEM;
npids = pid_array_load(pidarray, npids, cgrp); npids = pid_array_load(pidarray, npids, cgrp);
sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
/* Call pid_array_to_buf() twice, first just to get bufsz */ /*
ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; * Store the array in the cgroup, freeing the old
ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); * array if necessary
if (!ctr->buf) */
goto err2; down_write(&cgrp->pids_mutex);
ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); kfree(cgrp->tasks_pids);
cgrp->tasks_pids = pidarray;
kfree(pidarray); cgrp->pids_length = npids;
} else { cgrp->pids_use_count++;
ctr->buf = NULL; up_write(&cgrp->pids_mutex);
ctr->bufsz = 0;
}
file->private_data = ctr;
return 0;
err2:
kfree(pidarray);
err1:
kfree(ctr);
err0:
return -ENOMEM;
}
static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
struct cftype *cft,
struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos)
{
struct ctr_struct *ctr = file->private_data;
return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
}
static int cgroup_tasks_release(struct inode *unused_inode, file->f_op = &cgroup_tasks_operations;
struct file *file)
{
struct ctr_struct *ctr;
if (file->f_mode & FMODE_READ) { retval = seq_open(file, &cgroup_tasks_seq_operations);
ctr = file->private_data; if (retval) {
kfree(ctr->buf); release_cgroup_pid_array(cgrp);
kfree(ctr); return retval;
} }
((struct seq_file *)file->private_data)->private = cgrp;
return 0; return 0;
} }
...@@ -2208,7 +2268,6 @@ static struct cftype files[] = { ...@@ -2208,7 +2268,6 @@ static struct cftype files[] = {
{ {
.name = "tasks", .name = "tasks",
.open = cgroup_tasks_open, .open = cgroup_tasks_open,
.read = cgroup_tasks_read,
.write_u64 = cgroup_tasks_write, .write_u64 = cgroup_tasks_write,
.release = cgroup_tasks_release, .release = cgroup_tasks_release,
.private = FILE_TASKLIST, .private = FILE_TASKLIST,
...@@ -2298,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ...@@ -2298,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
INIT_LIST_HEAD(&cgrp->sibling); init_cgroup_housekeeping(cgrp);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
cgrp->parent = parent; cgrp->parent = parent;
cgrp->root = parent->root; cgrp->root = parent->root;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment