Commit ece8a684 authored by Ingo Molnar's avatar Ingo Molnar Committed by Linus Torvalds

[PATCH] sleep profiling

Implement prof=sleep profiling.  TASK_UNINTERRUPTIBLE sleeps will be taken
as a profile hit, and every millisecond spent sleeping causes a profile-hit
for the call site that initiated the sleep.

Sample readprofile output on i386:

   306 ps2_sendbyte                               1.3973
   432 call_usermodehelper_keys                   1.9548
   484 ps2_command                                0.6453
   790 __driver_attach                            4.7879
  1593 msleep                                    44.2500
  3976 sync_buffer                               64.1290
  4076 do_lookup                                 12.4648
  8587 sync_page                                122.6714
 20820 total                                      0.0067

(NOTE: architectures need to check whether get_wchan() can be called from
deep within the wakeup path.)

akpm: we need to mark more functions __sched.  lock_sock(), msleep(), others..

akpm: the contention in do_lookup() is a surprise.  Presumably doing disk
reads for directory contents while holding i_mutex.

[akpm@osdl.org: various fixes]
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 70888bd5
...@@ -1294,6 +1294,7 @@ and is between 256 and 4096 characters. It is defined in the file ...@@ -1294,6 +1294,7 @@ and is between 256 and 4096 characters. It is defined in the file
Param: "schedule" - profile schedule points. Param: "schedule" - profile schedule points.
Param: <number> - step/bucket size as a power of 2 for Param: <number> - step/bucket size as a power of 2 for
statistical time based profiling. statistical time based profiling.
Param: "sleep" - profile D-state sleeping (millisecs)
processor.max_cstate= [HW,ACPI] processor.max_cstate= [HW,ACPI]
Limit processor to maximum C-state Limit processor to maximum C-state
......
...@@ -6,10 +6,15 @@ ...@@ -6,10 +6,15 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/cpumask.h> #include <linux/cpumask.h>
#include <linux/cache.h>
#include <asm/errno.h> #include <asm/errno.h>
extern int prof_on __read_mostly;
#define CPU_PROFILING 1 #define CPU_PROFILING 1
#define SCHED_PROFILING 2 #define SCHED_PROFILING 2
#define SLEEP_PROFILING 3
struct proc_dir_entry; struct proc_dir_entry;
struct pt_regs; struct pt_regs;
...@@ -18,7 +23,24 @@ struct notifier_block; ...@@ -18,7 +23,24 @@ struct notifier_block;
/* init basic kernel profiler */ /* init basic kernel profiler */
void __init profile_init(void); void __init profile_init(void);
void profile_tick(int); void profile_tick(int);
void profile_hit(int, void *);
/*
* Add multiple profiler hits to a given address:
*/
void profile_hits(int, void *ip, unsigned int nr_hits);
/*
* Single profiler hit:
*/
static inline void profile_hit(int type, void *ip)
{
/*
* Speedup for the common (no profiling enabled) case:
*/
if (unlikely(prof_on == type))
profile_hits(type, ip, 1);
}
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
void create_prof_cpu_mask(struct proc_dir_entry *); void create_prof_cpu_mask(struct proc_dir_entry *);
#else #else
......
...@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; ...@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
static atomic_t *prof_buffer; static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift; static unsigned long prof_len, prof_shift;
static int prof_on __read_mostly; int prof_on __read_mostly;
static cpumask_t prof_cpu_mask = CPU_MASK_ALL; static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
...@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex); ...@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
static int __init profile_setup(char * str) static int __init profile_setup(char * str)
{ {
static char __initdata schedstr[] = "schedule"; static char __initdata schedstr[] = "schedule";
static char __initdata sleepstr[] = "sleep";
int par; int par;
if (!strncmp(str, schedstr, strlen(schedstr))) { if (!strncmp(str, sleepstr, strlen(sleepstr))) {
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
printk(KERN_INFO
"kernel sleep profiling enabled (shift: %ld)\n",
prof_shift);
} else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
prof_on = SCHED_PROFILING; prof_on = SCHED_PROFILING;
if (str[strlen(schedstr)] == ',') if (str[strlen(schedstr)] == ',')
str += strlen(schedstr) + 1; str += strlen(schedstr) + 1;
...@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); ...@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
* positions to which hits are accounted during short intervals (e.g. * positions to which hits are accounted during short intervals (e.g.
* several seconds) is usually very small. Exclusion from buffer * several seconds) is usually very small. Exclusion from buffer
* flipping is provided by interrupt disablement (note that for * flipping is provided by interrupt disablement (note that for
* SCHED_PROFILING profile_hit() may be called from process context). * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
* process context).
* The hash function is meant to be lightweight as opposed to strong, * The hash function is meant to be lightweight as opposed to strong,
* and was vaguely inspired by ppc64 firmware-supported inverted * and was vaguely inspired by ppc64 firmware-supported inverted
* pagetable hash functions, but uses a full hashtable full of finite * pagetable hash functions, but uses a full hashtable full of finite
...@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void) ...@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
mutex_unlock(&profile_flip_mutex); mutex_unlock(&profile_flip_mutex);
} }
void profile_hit(int type, void *__pc) void profile_hits(int type, void *__pc, unsigned int nr_hits)
{ {
unsigned long primary, secondary, flags, pc = (unsigned long)__pc; unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
int i, j, cpu; int i, j, cpu;
...@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc) ...@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
put_cpu(); put_cpu();
return; return;
} }
/*
* We buffer the global profiler buffer into a per-CPU
* queue and thus reduce the number of global (and possibly
* NUMA-alien) accesses. The write-queue is self-coalescing:
*/
local_irq_save(flags); local_irq_save(flags);
do { do {
for (j = 0; j < PROFILE_GRPSZ; ++j) { for (j = 0; j < PROFILE_GRPSZ; ++j) {
if (hits[i + j].pc == pc) { if (hits[i + j].pc == pc) {
hits[i + j].hits++; hits[i + j].hits += nr_hits;
goto out; goto out;
} else if (!hits[i + j].hits) { } else if (!hits[i + j].hits) {
hits[i + j].pc = pc; hits[i + j].pc = pc;
hits[i + j].hits = 1; hits[i + j].hits = nr_hits;
goto out; goto out;
} }
} }
i = (i + secondary) & (NR_PROFILE_HIT - 1); i = (i + secondary) & (NR_PROFILE_HIT - 1);
} while (i != primary); } while (i != primary);
atomic_inc(&prof_buffer[pc]);
/*
* Add the current hit(s) and flush the write-queue out
* to the global buffer:
*/
atomic_add(nr_hits, &prof_buffer[pc]);
for (i = 0; i < NR_PROFILE_HIT; ++i) { for (i = 0; i < NR_PROFILE_HIT; ++i) {
atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
hits[i].pc = hits[i].hits = 0; hits[i].pc = hits[i].hits = 0;
...@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, ...@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
#define profile_flip_buffers() do { } while (0) #define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0)
void profile_hit(int type, void *__pc) void profile_hits(int type, void *__pc, unsigned int nr_hits)
{ {
unsigned long pc; unsigned long pc;
if (prof_on != type || !prof_buffer) if (prof_on != type || !prof_buffer)
return; return;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
} }
#endif /* !CONFIG_SMP */ #endif /* !CONFIG_SMP */
......
...@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) ...@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
} }
#endif #endif
/*
* Sleep time is in units of nanosecs, so shift by 20 to get a
* milliseconds-range estimation of the amount of time that the task
* spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
if (p->state == TASK_UNINTERRUPTIBLE)
profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
(now - p->timestamp) >> 20);
}
if (!rt_task(p)) if (!rt_task(p))
p->prio = recalc_task_prio(p, now); p->prio = recalc_task_prio(p, now);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment