Commit fa2c3254 authored by Valentin Schneider, committed by Peter Zijlstra

sched/tracing: Don't re-read p->state when emitting sched_switch event

As of commit

  c6e7bd7a ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

		      p->__state = TASK_INTERRUPTIBLE;
		      __schedule()
			deactivate_task(p);
  ttwu()
    READ !p->on_rq
    p->__state=TASK_WAKING
			trace_sched_switch()
			  __trace_sched_switch_state()
			    task_state_index()
			      return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by pushing the value read in __schedule() down into the trace
event.
Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com
parent 49bef33e
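
Because the tracepoint now carries the task state that __schedule() sampled, every probe attached to sched_switch gains an extra "unsigned int prev_state" argument, as the in-tree probe updates below show. For illustration only (not part of this commit), a minimal out-of-tree probe written against the new signature might look like the following sketch; the probe name and module boilerplate are hypothetical:

/*
 * Illustrative sketch, not from this commit: a module-side probe using the
 * post-patch sched_switch signature. my_sched_switch_probe is hypothetical.
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

static void my_sched_switch_probe(void *data, bool preempt,
                                  unsigned int prev_state, /* new argument */
                                  struct task_struct *prev,
                                  struct task_struct *next)
{
        /*
         * Use the snapshot passed down from __schedule() instead of
         * re-reading prev->__state, which a concurrent ttwu() may have
         * already set to TASK_WAKING.
         */
        unsigned int idx = __task_state_index(prev_state, prev->exit_state);

        pr_debug("%s[%d] switched out in state %c\n",
                 prev->comm, prev->pid, task_index_to_char(idx));
}

static int __init probe_init(void)
{
        return register_trace_sched_switch(my_sched_switch_probe, NULL);
}

static void __exit probe_exit(void)
{
        unregister_trace_sched_switch(my_sched_switch_probe, NULL);
        tracepoint_synchronize_unregister();
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");
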
@@ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 #define TASK_REPORT_IDLE        (TASK_REPORT + 1)
 #define TASK_REPORT_MAX         (TASK_REPORT_IDLE << 1)
 
-static inline unsigned int task_state_index(struct task_struct *tsk)
+static inline unsigned int __task_state_index(unsigned int tsk_state,
+                                              unsigned int tsk_exit_state)
 {
-        unsigned int tsk_state = READ_ONCE(tsk->__state);
-        unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
+        unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;
 
         BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
 
@@ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
         return fls(state);
 }
 
+static inline unsigned int task_state_index(struct task_struct *tsk)
+{
+        return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
+}
+
 static inline char task_index_to_char(unsigned int state)
 {
         static const char state_char[] = "RSDTtXZPI";
...
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
              TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt,
+                                              unsigned int prev_state,
+                                              struct task_struct *p)
 {
         unsigned int state;
 
@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
          * it for left shift operation to get the correct task->state
          * mapping.
          */
-        state = task_state_index(p);
+        state = __task_state_index(prev_state, p->exit_state);
 
         return state ? (1 << (state - 1)) : state;
 }
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
 TRACE_EVENT(sched_switch,
 
         TP_PROTO(bool preempt,
+                 unsigned int prev_state,
                  struct task_struct *prev,
                  struct task_struct *next),
 
-        TP_ARGS(preempt, prev, next),
+        TP_ARGS(preempt, prev_state, prev, next),
 
         TP_STRUCT__entry(
                 __array( char,  prev_comm,      TASK_COMM_LEN   )
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
                 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                 __entry->prev_pid       = prev->pid;
                 __entry->prev_prio      = prev->prio;
-                __entry->prev_state     = __trace_sched_switch_state(preempt, prev);
+                __entry->prev_state     = __trace_sched_switch_state(preempt, prev_state, prev);
                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                 __entry->next_pid       = next->pid;
                 __entry->next_prio      = next->prio;
...
@@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 {
         struct rq *rq = this_rq();
         struct mm_struct *mm = rq->prev_mm;
-        long prev_state;
+        unsigned int prev_state;
 
         /*
          * The previous task will have left us with a preempt_count of 2
@@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                 migrate_disable_switch(rq, prev);
                 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-                trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+                trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
 
                 /* Also unlocks the rq: */
                 rq = context_switch(rq, prev, next, &rf);
...
@@ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-                        struct task_struct *prev, struct task_struct *next)
+                                unsigned int prev_state,
+                                struct task_struct *prev,
+                                struct task_struct *next)
 {
         unsigned long long timestamp;
         int index;
...
@@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-                    struct task_struct *prev, struct task_struct *next)
+                                     unsigned int prev_state,
+                                     struct task_struct *prev,
+                                     struct task_struct *next)
 {
         struct trace_array *tr = data;
         struct trace_pid_list *pid_list;
...
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-                    struct task_struct *prev, struct task_struct *next)
+                                        unsigned int prev_state,
+                                        struct task_struct *prev,
+                                        struct task_struct *next)
 {
         struct trace_array *tr = data;
         struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-                    struct task_struct *prev, struct task_struct *next)
+                                         unsigned int prev_state,
+                                         struct task_struct *prev,
+                                         struct task_struct *next)
 {
         struct trace_array *tr = data;
         struct trace_pid_list *no_pid_list;
...
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  * used to record the beginning and to report the end of a thread noise window.
  */
 static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+                            unsigned int prev_state,
+                            struct task_struct *p,
                             struct task_struct *n)
 {
         struct osnoise_variables *osn_var = this_cpu_osn_var();
...
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
+                   unsigned int prev_state,
                    struct task_struct *prev, struct task_struct *next)
 {
         int flags;
...
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
+                          unsigned int prev_state,
                           struct task_struct *prev, struct task_struct *next)
 {
         struct trace_array_cpu *data;
...