Commit 3249fe45 authored by Linus Torvalds

Merge tag 'trace-v5.10-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace

Pull tracing fixes from Steven Rostedt:

 - Fix off-by-one error in retrieving the context buffer for
   trace_printk()

 - Fix off-by-one error in stack nesting limit

 - Fix the recursion check so that NMI code is not falsely flagged as
   recursing

 - Stop losing events in function tracing when transitioning between
   interrupt contexts

 - Stop losing events in the ring buffer when transitioning between
   interrupt contexts

 - Fix return code of error pointer in parse_synth_field() to prevent
   NULL pointer dereference.

 - Fix false positive of NMI recursion in kprobe event handling

* tag 'trace-v5.10-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace:
  kprobes: Tell lockdep about kprobe nesting
  tracing: Make -ENOMEM the default error for parse_synth_field()
  ring-buffer: Fix recursion protection transitions between interrupt context
  tracing: Fix the checking of stackidx in __ftrace_trace_stack
  ftrace: Handle tracing when switching between context
  ftrace: Fix recursion check for NMI test
  tracing: Fix out of bounds write in get_trace_buf
parents 6732b354 645f224e
@@ -1249,7 +1249,13 @@ __acquires(hlist_lock)
         *head = &kretprobe_inst_table[hash];
         hlist_lock = kretprobe_table_lock_ptr(hash);
-        raw_spin_lock_irqsave(hlist_lock, *flags);
+        /*
+         * Nested is a workaround that will soon not be needed.
+         * There's other protections that make sure the same lock
+         * is not taken on the same CPU that lockdep is unaware of.
+         * Differentiate when it is taken in NMI context.
+         */
+        raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
 }
 NOKPROBE_SYMBOL(kretprobe_hash_lock);
@@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash,
 __acquires(hlist_lock)
 {
         raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
-        raw_spin_lock_irqsave(hlist_lock, *flags);
+        /*
+         * Nested is a workaround that will soon not be needed.
+         * There's other protections that make sure the same lock
+         * is not taken on the same CPU that lockdep is unaware of.
+         * Differentiate when it is taken in NMI context.
+         */
+        raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
 }
 NOKPROBE_SYMBOL(kretprobe_table_lock);
@@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
         /* TODO: consider to only swap the RA after the last pre_handler fired */
         hash = hash_ptr(current, KPROBE_HASH_BITS);
-        raw_spin_lock_irqsave(&rp->lock, flags);
+        /*
+         * Nested is a workaround that will soon not be needed.
+         * There's other protections that make sure the same lock
+         * is not taken on the same CPU that lockdep is unaware of.
+         */
+        raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
         if (!hlist_empty(&rp->free_instances)) {
                 ri = hlist_entry(rp->free_instances.first,
                                 struct kretprobe_instance, hlist);
@@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
                 ri->task = current;
                 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-                        raw_spin_lock_irqsave(&rp->lock, flags);
+                        raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
                         hlist_add_head(&ri->hlist, &rp->free_instances);
                         raw_spin_unlock_irqrestore(&rp->lock, flags);
                         return 0;
...
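The comments added above cover a lockdep false positive: kprobe handlers can run in NMI context and take a kretprobe lock of the same lock class that is already held, and while other protections guarantee that the same lock instance is never taken twice on one CPU, lockdep tracks lock classes rather than instances and would report a possible deadlock. Passing a non-zero subclass to raw_spin_lock_irqsave_nested() (here !!in_nmi(), or a constant 1) files the nested acquisition separately. Below is a toy, standalone sketch of the class-versus-subclass idea; it is purely illustrative and bears no resemblance to the real lockdep implementation or API.

/* Toy model of why a lockdep subclass silences a same-class nesting report. */
/* Nothing here resembles the real lockdep internals; it only shows the idea
 * of tracking lock *classes*, with the subclass splitting a class in two. */
#include <stdio.h>

#define MAX_HELD        8

static struct { int class_id; int subclass; } held[MAX_HELD];
static int nr_held;

static void toy_lock(int class_id, int subclass)
{
        for (int i = 0; i < nr_held; i++) {
                if (held[i].class_id == class_id && held[i].subclass == subclass)
                        printf("WARNING: class %d/%d already held (possible deadlock)\n",
                               class_id, subclass);
        }
        held[nr_held].class_id = class_id;
        held[nr_held].subclass = subclass;
        nr_held++;
}

static void toy_unlock(void)
{
        nr_held--;
}

int main(void)
{
        enum { KRETPROBE_TABLE_CLASS = 1 };
        int in_nmi = 1;         /* pretend an NMI hit while the lock class is held */

        toy_lock(KRETPROBE_TABLE_CLASS, 0);             /* normal acquisition */
        toy_lock(KRETPROBE_TABLE_CLASS, !!in_nmi);      /* subclass 1: no warning */
        toy_unlock();
        toy_unlock();
        return 0;
}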
@@ -438,14 +438,16 @@ enum {
 };

 /*
  * Used for which event context the event is in.
- *  NMI     = 0
- *  IRQ     = 1
- *  SOFTIRQ = 2
- *  NORMAL  = 3
+ *  TRANSITION = 0
+ *  NMI     = 1
+ *  IRQ     = 2
+ *  SOFTIRQ = 3
+ *  NORMAL  = 4
  *
  * See trace_recursive_lock() comment below for more details.
  */
 enum {
+        RB_CTX_TRANSITION,
         RB_CTX_NMI,
         RB_CTX_IRQ,
         RB_CTX_SOFTIRQ,
@@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * a bit of overhead in something as critical as function tracing,
  * we use a bitmask trick.
  *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
+ *  bit 1 =  NMI context
+ *  bit 2 =  IRQ context
+ *  bit 3 =  SoftIRQ context
+ *  bit 4 =  normal context.
  *
  * This works because this is the order of contexts that can
  * preempt other contexts.  A SoftIRQ never preempts an IRQ
@@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The least significant bit can be cleared this way, and it
  * just so happens that it is the same bit corresponding to
  * the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
  */
 static __always_inline int
@@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
                 bit = pc & NMI_MASK ? RB_CTX_NMI :
                         pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;

-        if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
-                return 1;
+        if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
+                /*
+                 * It is possible that this was called by transitioning
+                 * between interrupt context, and preempt_count() has not
+                 * been updated yet. In this case, use the TRANSITION bit.
+                 */
+                bit = RB_CTX_TRANSITION;
+                if (val & (1 << (bit + cpu_buffer->nest)))
+                        return 1;
+        }

         val |= (1 << (bit + cpu_buffer->nest));
         cpu_buffer->current_context = val;
@@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
                 cpu_buffer->current_context - (1 << cpu_buffer->nest);
 }

-/* The recursive locking above uses 4 bits */
-#define NESTED_BITS 4
+/* The recursive locking above uses 5 bits */
+#define NESTED_BITS 5

 /**
  * ring_buffer_nest_start - Allow to trace while nested
...
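The ring-buffer changes above are easier to follow outside the kernel. Below is a minimal, standalone userspace sketch of the same bitmask scheme, with hypothetical names and without the cpu_buffer->nest offset used by ring_buffer_nest_start(): one bit per context, plus a TRANSITION bit that absorbs the single false positive that can occur while preempt_count() lags behind an interrupt.

/* Standalone userspace model of the ring-buffer recursion bitmask. */
/* Not kernel code: names and the missing nest offset are simplifications. */
#include <stdio.h>

enum { CTX_TRANSITION, CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

static unsigned int current_context;    /* models cpu_buffer->current_context */

/* Returns the bit that was taken, or -1 on a genuine recursion. */
static int recursive_lock(int ctx_bit)
{
        unsigned int val = current_context;
        int bit = ctx_bit;

        if (val & (1u << bit)) {
                /*
                 * The context bit is already set: either a real recursion, or
                 * preempt_count() has not caught up with a context switch yet.
                 * Allow exactly one extra level via the TRANSITION bit.
                 */
                bit = CTX_TRANSITION;
                if (val & (1u << bit))
                        return -1;      /* second hit: drop the event */
        }

        current_context = val | (1u << bit);
        return bit;
}

static void recursive_unlock(void)
{
        /* Clears the lowest set bit, so TRANSITION is always released first. */
        current_context &= current_context - 1;
}

int main(void)
{
        /* Normal context enters; an interrupt fires before preempt_count()
         * is updated, so the nested caller still looks like CTX_NORMAL. */
        printf("first:  %d\n", recursive_lock(CTX_NORMAL));     /* CTX_NORMAL */
        printf("second: %d\n", recursive_lock(CTX_NORMAL));     /* CTX_TRANSITION */
        printf("third:  %d\n", recursive_lock(CTX_NORMAL));     /* -1, rejected */
        recursive_unlock();     /* releases TRANSITION */
        recursive_unlock();     /* releases NORMAL */
        return 0;
}

Unlocking always clears the lowest set bit, which is why the TRANSITION bit is released first even when a different context set it, matching the behaviour described in the comment above.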
@@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
         /*
          * If tracing is off, but we have triggers enabled
          * we still need to look at the event data. Use the temp_buffer
-         * to store the trace event for the tigger to use. It's recusive
+         * to store the trace event for the trigger to use. It's recursive
          * safe and will not be recorded anywhere.
          */
         if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
@@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
         stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;

         /* This should never happen. If it does, yell once and skip */
-        if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+        if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
                 goto out;

         /*
@@ -3132,7 +3132,7 @@ static char *get_trace_buf(void)
         /* Interrupts must see nesting incremented before we use the buffer */
         barrier();

-        return &buffer->buffer[buffer->nesting][0];
+        return &buffer->buffer[buffer->nesting - 1][0];
 }

 static void put_trace_buf(void)
...
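Both trace.c fixes above are fence-post corrections. A small standalone sketch (hypothetical names) of the get_trace_buf() case: the nesting counter is incremented before the buffer is handed out, so the slot owned by the current level is nesting - 1, and the same reasoning is why __ftrace_trace_stack() must reject stackidx with >= FTRACE_KSTACK_NESTING rather than >.

/* Standalone model of the per-context trace_printk() buffers (not kernel code). */
#include <stdio.h>

#define NESTING_LEVELS  4       /* normal, softirq, irq, NMI */
#define BUF_SIZE        128

static struct {
        char buffer[NESTING_LEVELS][BUF_SIZE];
        int nesting;
} buf;  /* one of these exists per CPU in the kernel */

static char *get_buf(void)
{
        /* Valid levels are 0 .. NESTING_LEVELS - 1, so "==" is already too deep;
         * the same fence-post argument gives the ">=" check on stackidx. */
        if (buf.nesting >= NESTING_LEVELS)
                return NULL;

        buf.nesting++;  /* interrupts must see the claim before the buffer is used */

        /* The slot owned by this level is nesting - 1, not nesting. */
        return buf.buffer[buf.nesting - 1];
}

static void put_buf(void)
{
        buf.nesting--;
}

int main(void)
{
        char *b = get_buf();

        if (b) {
                snprintf(b, BUF_SIZE, "nesting level %d", buf.nesting);
                printf("%s\n", b);
                put_buf();
        }
        return 0;
}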
@@ -637,6 +637,12 @@ enum {
          * function is called to clear it.
          */
         TRACE_GRAPH_NOTRACE_BIT,
+
+        /*
+         * When transitioning between context, the preempt_count() may
+         * not be correct. Allow for a single recursion to cover this case.
+         */
+        TRACE_TRANSITION_BIT,
 };

 #define trace_recursion_set(bit)        do { (current)->trace_recursion |= (1<<(bit)); } while (0)
@@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
                 return 0;

         bit = trace_get_context_bit() + start;
-        if (unlikely(val & (1 << bit)))
-                return -1;
+        if (unlikely(val & (1 << bit))) {
+                /*
+                 * It could be that preempt_count has not been updated during
+                 * a switch between contexts. Allow for a single recursion.
+                 */
+                bit = TRACE_TRANSITION_BIT;
+                if (trace_recursion_test(bit))
+                        return -1;
+                trace_recursion_set(bit);
+                barrier();
+                return bit + 1;
+        }
+
+        /* Normal check passed, clear the transition to allow it again */
+        trace_recursion_clear(TRACE_TRANSITION_BIT);

         val |= 1 << bit;
         current->trace_recursion = val;
         barrier();

-        return bit;
+        return bit + 1;
 }

 static __always_inline void trace_clear_recursion(int bit)
@@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit)
         if (!bit)
                 return;

+        bit--;
         bit = 1 << bit;
         val &= ~bit;
...
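The trace.h hunks change the return convention of trace_test_and_set_recursion(): success now returns bit + 1 so that zero remains available as a "nothing was taken" value, and trace_clear_recursion() undoes the shift with bit--. Below is a standalone sketch of that convention together with the transition-bit handling, using hypothetical names and a simplified single-task model that omits the start/max window of the real helper.

/* Standalone model of the per-task ftrace recursion bits (not kernel code). */
#include <stdio.h>

enum {
        NORMAL_BIT,             /* stand-ins for the per-context bits */
        IRQ_BIT,
        NMI_BIT,
        TRANSITION_BIT,         /* one extra level while preempt_count() lags */
};

static unsigned long trace_recursion;   /* models current->trace_recursion */

/* < 0: recursion detected, drop.  > 0: bit + 1, hand back to clear_recursion(). */
static int test_and_set_recursion(int ctx_bit)
{
        int bit = ctx_bit;

        if (trace_recursion & (1UL << bit)) {
                /* Context bit already held: allow a single transition level. */
                bit = TRANSITION_BIT;
                if (trace_recursion & (1UL << bit))
                        return -1;
        } else {
                /* A clean check re-arms the transition allowance. */
                trace_recursion &= ~(1UL << TRANSITION_BIT);
        }

        trace_recursion |= 1UL << bit;
        return bit + 1;         /* shifted so that 0 can mean "nothing to clear" */
}

static void clear_recursion(int bit)
{
        if (!bit)
                return;
        bit--;                  /* undo the + 1 from test_and_set_recursion() */
        trace_recursion &= ~(1UL << bit);
}

int main(void)
{
        int a = test_and_set_recursion(NORMAL_BIT);     /* 1: NORMAL taken */
        int b = test_and_set_recursion(NORMAL_BIT);     /* 4: TRANSITION taken */
        int c = test_and_set_recursion(NORMAL_BIT);     /* -1: rejected */

        printf("%d %d %d\n", a, b, c);
        clear_recursion(b);
        clear_recursion(a);
        return 0;
}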
@@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 {
         struct synth_field *field;
         const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
-        int len, ret = 0;
+        int len, ret = -ENOMEM;
         struct seq_buf s;
         ssize_t size;
@@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
                 len--;

         field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
-        if (!field->name) {
-                ret = -ENOMEM;
+        if (!field->name)
                 goto free;
-        }
+
         if (!is_good_name(field->name)) {
                 synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name));
                 ret = -EINVAL;
@@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
                 len += strlen(prefix);

         field->type = kzalloc(len, GFP_KERNEL);
-        if (!field->type) {
-                ret = -ENOMEM;
+        if (!field->type)
                 goto free;
-        }
+
         seq_buf_init(&s, field->type, len);
         if (prefix)
                 seq_buf_puts(&s, prefix);
@@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
         }
         if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
                 goto free;
+
         s.buffer[s.len] = '\0';

         size = synth_field_size(field->type);
@@ -666,10 +665,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
                 len = sizeof("__data_loc ") + strlen(field->type) + 1;
                 type = kzalloc(len, GFP_KERNEL);
-                if (!type) {
-                        ret = -ENOMEM;
+                if (!type)
                         goto free;
-                }

                 seq_buf_init(&s, type, len);
                 seq_buf_puts(&s, "__data_loc ");
...
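The parse_synth_field() change works because the function reports failure through ERR_PTR(ret): with ret initialized to 0, any error path that forgot to set it returned ERR_PTR(0), which is NULL and sails past the caller's IS_ERR() check into a NULL pointer dereference. Defaulting ret to -ENOMEM lets the allocation-failure paths use a bare goto free. A minimal userspace model of the pattern, with ERR_PTR()/IS_ERR() re-implemented here only for the sketch:

/* Userspace model of why ret must default to -ENOMEM (not kernel code). */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-ins for the kernel's ERR_PTR()/IS_ERR() helpers. */
#define ERR_PTR(err)    ((void *)(long)(err))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-4095)

struct field {
        char *name;
};

static struct field *parse_field(const char *name, int simulate_oom)
{
        struct field *f;
        int ret = -ENOMEM;      /* default: a bare "goto free" reports a real error */

        f = calloc(1, sizeof(*f));
        if (!f)
                goto out;

        f->name = simulate_oom ? NULL : strdup(name);
        if (!f->name)
                goto free;      /* no per-site "ret = -ENOMEM;" needed anymore */

        return f;
 free:
        free(f);
 out:
        return ERR_PTR(ret);    /* with ret == 0 this would have been NULL */
}

int main(void)
{
        struct field *f = parse_field("pid", 1);

        if (IS_ERR(f))
                printf("parse failed: %ld\n", (long)f);
        else
                printf("parsed field %s\n", f->name);
        return 0;
}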
@@ -492,8 +492,13 @@ trace_selftest_function_recursion(void)
         unregister_ftrace_function(&test_rec_probe);

         ret = -1;
-        if (trace_selftest_recursion_cnt != 1) {
-                pr_cont("*callback not called once (%d)* ",
+        /*
+         * Recursion allows for transitions between context,
+         * and may call the callback twice.
+         */
+        if (trace_selftest_recursion_cnt != 1 &&
+            trace_selftest_recursion_cnt != 2) {
+                pr_cont("*callback not called once (or twice) (%d)* ",
                         trace_selftest_recursion_cnt);
                 goto out;
         }
...