Commit 52b1364b authored by Chengming Zhou, committed by Peter Zijlstra

sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure

PSI already tracks workload pressure stall information for CPU,
memory and IO. Apart from these, IRQ/SOFTIRQ can have an obvious
impact on the productivity of some workloads, such as web service
workloads.

When CONFIG_IRQ_TIME_ACCOUNTING is enabled, we can get the IRQ/SOFTIRQ
delta time from update_rq_clock_task(), where we record that delta
to the cgroups of the CPU's current task as PSI_IRQ_FULL status.

Note we don't use PSI_IRQ_SOME: IRQ/SOFTIRQ always happen in the
context of the current task on the CPU, so nothing productive can run
in the meantime even if it is runnable. Hence we only use PSI_IRQ_FULL.
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com
parent 71dbdde7
...@@ -976,6 +976,12 @@ All cgroup core files are prefixed with "cgroup." ...@@ -976,6 +976,12 @@ All cgroup core files are prefixed with "cgroup."
killing cgroups is a process directed operation, i.e. it affects killing cgroups is a process directed operation, i.e. it affects
the whole thread-group. the whole thread-group.
irq.pressure
A read-write nested-keyed file.
Shows pressure stall information for IRQ/SOFTIRQ. See
:ref:`Documentation/accounting/psi.rst <psi>` for details.
Controllers Controllers
=========== ===========
......
...@@ -42,7 +42,10 @@ enum psi_res { ...@@ -42,7 +42,10 @@ enum psi_res {
PSI_IO, PSI_IO,
PSI_MEM, PSI_MEM,
PSI_CPU, PSI_CPU,
NR_PSI_RESOURCES = 3, #ifdef CONFIG_IRQ_TIME_ACCOUNTING
PSI_IRQ,
#endif
NR_PSI_RESOURCES,
}; };
/* /*
...@@ -58,9 +61,12 @@ enum psi_states { ...@@ -58,9 +61,12 @@ enum psi_states {
PSI_MEM_FULL, PSI_MEM_FULL,
PSI_CPU_SOME, PSI_CPU_SOME,
PSI_CPU_FULL, PSI_CPU_FULL,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
PSI_IRQ_FULL,
#endif
/* Only per-CPU, to weigh the CPU in the global average: */ /* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE, PSI_NONIDLE,
NR_PSI_STATES = 7, NR_PSI_STATES,
}; };
/* Use one bit in the state mask to track TSK_ONCPU */ /* Use one bit in the state mask to track TSK_ONCPU */
......
...@@ -3763,6 +3763,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, ...@@ -3763,6 +3763,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
} }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * seq_show handler for the cgroup "irq.pressure" file.
 *
 * Picks the psi group to report on: the global psi_system when the cgroup's
 * inode number is 1 (presumably the root cgroup - confirm), otherwise the
 * cgroup's own psi group. The PSI_IRQ output is then produced by psi_show().
 */
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *group;

	if (cgroup_ino(cgrp) == 1)
		group = &psi_system;
	else
		group = cgrp->psi;

	return psi_show(seq, group, PSI_IRQ);
}
/*
 * write handler for the cgroup "irq.pressure" file.
 *
 * Thin wrapper that forwards the user buffer to cgroup_pressure_write()
 * with the resource fixed to PSI_IRQ. @off is not used here.
 */
static ssize_t
cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf,
			  size_t nbytes, loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt) poll_table *pt)
{ {
...@@ -5179,6 +5196,16 @@ static struct cftype cgroup_base_files[] = { ...@@ -5179,6 +5196,16 @@ static struct cftype cgroup_base_files[] = {
.poll = cgroup_pressure_poll, .poll = cgroup_pressure_poll,
.release = cgroup_pressure_release, .release = cgroup_pressure_release,
}, },
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
{
.name = "irq.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#endif
#endif /* CONFIG_PSI */ #endif /* CONFIG_PSI */
{ } /* terminate */ { } /* terminate */
}; };
......
...@@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) ...@@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta; rq->prev_irq_time += irq_delta;
delta -= irq_delta; delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
#endif #endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) { if (static_key_false((&paravirt_steal_rq_enabled))) {
......
...@@ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, ...@@ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
} }
} }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/**
 * psi_account_irqtime - charge IRQ/SOFTIRQ time to a task's psi groups
 * @task: task whose psi groups are charged
 * @delta: amount of time added to each group's PSI_IRQ_FULL counter
 *
 * For every group returned by iterate_groups() for @task, the per-CPU
 * state of @task's CPU is updated inside its seqcount write section:
 * record_times() is called first, then @delta is added to the
 * PSI_IRQ_FULL time. Groups whose poll_states has the PSI_IRQ_FULL bit
 * set get their poll work scheduled.
 */
void psi_account_irqtime(struct task_struct *task, u32 delta)
{
	struct psi_group *group;
	void *iter = NULL;
	int cpu = task_cpu(task);
	u64 now;

	/* Nothing is accounted for pid 0 (presumably the idle task). */
	if (task->pid == 0)
		return;

	now = cpu_clock(cpu);

	for (group = iterate_groups(task, &iter); group;
	     group = iterate_groups(task, &iter)) {
		struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);

		write_seqcount_begin(&groupc->seq);

		/* Call record_times() before adding the new IRQ time. */
		record_times(groupc, now);
		groupc->times[PSI_IRQ_FULL] += delta;

		write_seqcount_end(&groupc->seq);

		if (group->poll_states & (1 << PSI_IRQ_FULL))
			psi_schedule_poll_work(group, 1);
	}
}
#endif
/** /**
* psi_memstall_enter - mark the beginning of a memory stall section * psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections * @flags: flags to handle nested sections
...@@ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) ...@@ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{ {
bool only_full = false;
int full; int full;
u64 now; u64 now;
...@@ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) ...@@ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now); group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock); mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) { #ifdef CONFIG_IRQ_TIME_ACCOUNTING
only_full = res == PSI_IRQ;
#endif
for (full = 0; full < 2 - only_full; full++) {
unsigned long avg[3] = { 0, }; unsigned long avg[3] = { 0, };
u64 total = 0; u64 total = 0;
int w; int w;
...@@ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) ...@@ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
} }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some", full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
...@@ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, ...@@ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else else
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
return ERR_PTR(-EINVAL);
#endif
if (state >= PSI_NONIDLE) if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
...@@ -1405,6 +1445,33 @@ static const struct proc_ops psi_cpu_proc_ops = { ...@@ -1405,6 +1445,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release, .proc_release = psi_fop_release,
}; };
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * seq_file show callback for the "pressure/irq" proc file: reports the
 * PSI_IRQ resource of the global psi_system group via psi_show().
 */
static int
psi_irq_show(struct seq_file *m, void *v)
{
	return psi_show(m, &psi_system, PSI_IRQ);
}
/*
 * open callback for the "pressure/irq" proc file: hands psi_irq_show to
 * psi_open() as the show routine. @inode is not used here.
 */
static int
psi_irq_open(struct inode *inode, struct file *file)
{
	return psi_open(file, psi_irq_show);
}
/*
 * write callback for the "pressure/irq" proc file: forwards the user
 * buffer to psi_write() with the resource fixed to PSI_IRQ. @ppos is not
 * used here.
 */
static ssize_t
psi_irq_write(struct file *file, const char __user *user_buf, size_t nbytes,
	      loff_t *ppos)
{
	return psi_write(file, user_buf, nbytes, PSI_IRQ);
}
/*
 * proc file operations for "pressure/irq". Open and write use the
 * IRQ-specific handlers; read/lseek use the generic seq_file helpers and
 * poll/release use the shared psi_fop_* handlers.
 */
static const struct proc_ops psi_irq_proc_ops = {
	.proc_open	= psi_irq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_write	= psi_irq_write,
	.proc_poll	= psi_fop_poll,
	.proc_release	= psi_fop_release,
};
#endif
static int __init psi_proc_init(void) static int __init psi_proc_init(void)
{ {
if (psi_enable) { if (psi_enable) {
...@@ -1412,6 +1479,9 @@ static int __init psi_proc_init(void) ...@@ -1412,6 +1479,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
#endif
} }
return 0; return 0;
} }
......
...@@ -110,6 +110,7 @@ __schedstats_from_se(struct sched_entity *se) ...@@ -110,6 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next, void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep); bool sleep);
void psi_account_irqtime(struct task_struct *task, u32 delta);
/* /*
* PSI tracks state that persists across sleeps, such as iowaits and * PSI tracks state that persists across sleeps, such as iowaits and
...@@ -205,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} ...@@ -205,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev, static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next, struct task_struct *next,
bool sleep) {} bool sleep) {}
/* Empty stub: does nothing (presumably the !CONFIG_PSI variant, given the #endif below - confirm). */
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */ #endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment