Commit e5d1367f authored by Stephane Eranian, committed by Ingo Molnar

perf: Add cgroup support

This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.

The cgroup to monitor is passed as a file descriptor in the pid
argument to the syscall. The file descriptor must be opened to
the cgroup name in the cgroup filesystem. For instance, if the
cgroup name is foo and cgroupfs is mounted in /cgroup, then the
file descriptor is opened to /cgroup/foo. Cgroup mode is
activated by passing PERF_FLAG_PID_CGROUP in the flags argument
to the syscall.

For instance to measure in cgroup foo on CPU1 assuming
cgroupfs is mounted under /cgroup:

struct perf_event_attr attr;
int cgroup_fd, fd;

cgroup_fd = open("/cgroup/foo", O_RDONLY);
fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
close(cgroup_fd);
Signed-off-by: Stephane Eranian <eranian@google.com>
[ added perf_cgroup_{exit,attach} ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent d41d5a01
...@@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, ...@@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
/* Get id and depth of css */ /* Get id and depth of css */
unsigned short css_id(struct cgroup_subsys_state *css); unsigned short css_id(struct cgroup_subsys_state *css);
unsigned short css_depth(struct cgroup_subsys_state *css); unsigned short css_depth(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
#else /* !CONFIG_CGROUPS */ #else /* !CONFIG_CGROUPS */
......
...@@ -65,4 +65,8 @@ SUBSYS(net_cls) ...@@ -65,4 +65,8 @@ SUBSYS(net_cls)
SUBSYS(blkio) SUBSYS(blkio)
#endif #endif
#ifdef CONFIG_CGROUP_PERF
SUBSYS(perf)
#endif
/* */ /* */
...@@ -464,6 +464,7 @@ enum perf_callchain_context { ...@@ -464,6 +464,7 @@ enum perf_callchain_context {
#define PERF_FLAG_FD_NO_GROUP (1U << 0) #define PERF_FLAG_FD_NO_GROUP (1U << 0)
#define PERF_FLAG_FD_OUTPUT (1U << 1) #define PERF_FLAG_FD_OUTPUT (1U << 1)
#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=fd opened on the cgroup's cgroupfs directory, per-cpu mode only */
#ifdef __KERNEL__ #ifdef __KERNEL__
/* /*
...@@ -471,6 +472,7 @@ enum perf_callchain_context { ...@@ -471,6 +472,7 @@ enum perf_callchain_context {
*/ */
#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PERF_EVENTS
# include <linux/cgroup.h>
# include <asm/perf_event.h> # include <asm/perf_event.h>
# include <asm/local64.h> # include <asm/local64.h>
#endif #endif
...@@ -716,6 +718,22 @@ struct swevent_hlist { ...@@ -716,6 +718,22 @@ struct swevent_hlist {
#define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_GROUP 0x02
#define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK 0x04
#ifdef CONFIG_CGROUP_PERF
/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure.
 */
struct perf_cgroup_info {
/* NOTE(review): presumably the enabled time accumulated so far - confirm against the users of this field */
u64 time;
/* NOTE(review): presumably the instant 'time' was last brought up to date - confirm */
u64 timestamp;
};
/*
 * perf_cgroup is the perf view of a cgroup: the embedded cgroup subsystem
 * state followed by a pointer to the timing info described above.
 */
struct perf_cgroup {
struct cgroup_subsys_state css;
struct perf_cgroup_info *info; /* timing info, one per cpu */
};
#endif
/** /**
* struct perf_event - performance event kernel representation: * struct perf_event - performance event kernel representation:
*/ */
...@@ -832,6 +850,11 @@ struct perf_event { ...@@ -832,6 +850,11 @@ struct perf_event {
struct event_filter *filter; struct event_filter *filter;
#endif #endif
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp; /* cgroup this event is attached to */
/* NOTE(review): the code that sets and consumes this flag is not visible here - confirm its meaning before relying on it */
int cgrp_defer_enabled;
#endif
#endif /* CONFIG_PERF_EVENTS */ #endif /* CONFIG_PERF_EVENTS */
}; };
...@@ -886,6 +909,7 @@ struct perf_event_context { ...@@ -886,6 +909,7 @@ struct perf_event_context {
u64 generation; u64 generation;
int pin_count; int pin_count;
struct rcu_head rcu_head; struct rcu_head rcu_head;
int nr_cgroups; /* cgroup events present */
}; };
/* /*
...@@ -905,6 +929,9 @@ struct perf_cpu_context { ...@@ -905,6 +929,9 @@ struct perf_cpu_context {
struct list_head rotation_list; struct list_head rotation_list;
int jiffies_interval; int jiffies_interval;
struct pmu *active_pmu; struct pmu *active_pmu;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
#endif
}; };
struct perf_output_handle { struct perf_output_handle {
...@@ -1040,11 +1067,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) ...@@ -1040,11 +1067,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
__perf_sw_event(event_id, nr, nmi, regs, addr); __perf_sw_event(event_id, nr, nmi, regs, addr);
} }
extern atomic_t perf_task_events; extern atomic_t perf_sched_events;
static inline void perf_event_task_sched_in(struct task_struct *task) static inline void perf_event_task_sched_in(struct task_struct *task)
{ {
COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
} }
static inline static inline
...@@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex ...@@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex
{ {
perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
} }
extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_mmap(struct vm_area_struct *vma);
......
...@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED ...@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
select this option (if, for some reason, they need to disable it select this option (if, for some reason, they need to disable it
then noswapaccount does the trick). then noswapaccount does the trick).
config CGROUP_PERF
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
depends on PERF_EVENTS && CGROUPS
help
This option extends the per-cpu mode to restrict monitoring to
threads which belong to the specified cgroup and run on the
designated cpu.
Say N if unsure.
menuconfig CGROUP_SCHED menuconfig CGROUP_SCHED
bool "Group CPU scheduler" bool "Group CPU scheduler"
depends on EXPERIMENTAL depends on EXPERIMENTAL
......
...@@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, ...@@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret; return ret;
} }
/**
 * cgroup_css_from_dir - find a subsystem state through an open cgroupfs directory
 * @f: file opened on a directory of the cgroup filesystem
 * @id: index of the subsystem whose state is requested
 *
 * Returns the state stored in slot @id of the cgroup behind @f.  On failure
 * an ERR_PTR() is returned instead: -EBADF when @f is not a cgroupfs
 * directory, -EINVAL when @id is outside [0, CGROUP_SUBSYS_COUNT), and
 * -ENOENT when the cgroup has nothing in that slot.
 *
 * This function itself takes no reference on the returned state.
 */
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
{
	struct inode *dir_inode = f->f_dentry->d_inode;
	struct cgroup_subsys_state *state;

	/* the inode operations identify a cgroup filesystem directory */
	if (dir_inode->i_op != &cgroup_dir_inode_operations)
		return ERR_PTR(-EBADF);

	/* reject indices that would fall outside the subsys[] array */
	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
		return ERR_PTR(-EINVAL);

	/* map the directory entry to its cgroup and pick the requested slot */
	state = __d_cgrp(f->f_dentry)->subsys[id];
	if (!state)
		return ERR_PTR(-ENOENT);

	return state;
}
#ifdef CONFIG_CGROUP_DEBUG #ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont) struct cgroup *cont)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment