mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-10 06:49:29 -04:00
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
"Bigger kernel side changes:
- Add backwards writing capability to the perf ring-buffer code,
which is preparation for future advanced features like robust
'overwrite support' and snapshot mode. (Wang Nan)
- Add pause and resume ioctls for the perf ringbuffer (Wang Nan)
- x86 Intel cstate code cleanups and reorgnization (Thomas Gleixner)
- x86 Intel uncore and CPU PMU driver updates (Kan Liang, Peter
Zijlstra)
- x86 AUX (Intel PT) related enhancements and updates (Alexander
Shishkin)
- x86 MSR PMU driver enhancements and updates (Huang Rui)
- ... and lots of other changes spread out over 40+ commits.
Biggest tooling side changes:
- 'perf trace' features and enhancements. (Arnaldo Carvalho de Melo)
- BPF tooling updates (Wang Nan)
- 'perf sched' updates (Jiri Olsa)
- 'perf probe' updates (Masami Hiramatsu)
- ... plus 200+ other enhancements, fixes and cleanups to tools/
The merge commits, the shortlog and the changelogs contain a lot more
details"
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (249 commits)
perf/core: Disable the event on a truncated AUX record
perf/x86/intel/pt: Generate PMI in the STOP region as well
perf buildid-cache: Use lsdir() for looking up buildid caches
perf symbols: Use lsdir() for the search in kcore cache directory
perf tools: Use SBUILD_ID_SIZE where applicable
perf tools: Fix lsdir to set errno correctly
perf trace: Move seccomp args beautifiers to tools/perf/trace/beauty/
perf trace: Move flock op beautifier to tools/perf/trace/beauty/
perf build: Add build-test for debug-frame on arm/arm64
perf build: Add build-test for libunwind cross-platforms support
perf script: Fix export of callchains with recursion in db-export
perf script: Fix callchain addresses in db-export
perf script: Fix symbol insertion behavior in db-export
perf symbols: Add dso__insert_symbol function
perf scripting python: Use Py_FatalError instead of die()
perf tools: Remove xrealloc and ALLOC_GROW
perf help: Do not use ALLOC_GROW in add_cmd_list
perf pmu: Make pmu_formats_string to check return value of strbuf
perf header: Make topology checkers to check return value of strbuf
perf tools: Make alias handler to check return value of strbuf
...
This commit is contained in:
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
|
||||
/* check sanity of attributes */
|
||||
if (attr->max_entries == 0 || attr->key_size != 4 ||
|
||||
value_size < 8 || value_size % 8 ||
|
||||
value_size / 8 > PERF_MAX_STACK_DEPTH)
|
||||
value_size / 8 > sysctl_perf_event_max_stack)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
/* hash table size must be power of 2 */
|
||||
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
|
||||
struct perf_callchain_entry *trace;
|
||||
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
|
||||
u32 max_depth = map->value_size / 8;
|
||||
/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
|
||||
u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
|
||||
/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
|
||||
u32 init_nr = sysctl_perf_event_max_stack - max_depth;
|
||||
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
|
||||
u32 hash, id, trace_nr, trace_len;
|
||||
bool user = flags & BPF_F_USER_STACK;
|
||||
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
|
||||
return -EFAULT;
|
||||
|
||||
/* get_perf_callchain() guarantees that trace->nr >= init_nr
|
||||
* and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
|
||||
* and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
|
||||
*/
|
||||
trace_nr = trace->nr - init_nr;
|
||||
|
||||
|
||||
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
|
||||
struct perf_callchain_entry *cpu_entries[0];
|
||||
};
|
||||
|
||||
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
|
||||
|
||||
static inline size_t perf_callchain_entry__sizeof(void)
|
||||
{
|
||||
return (sizeof(struct perf_callchain_entry) +
|
||||
sizeof(__u64) * sysctl_perf_event_max_stack);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
|
||||
static atomic_t nr_callchain_events;
|
||||
static DEFINE_MUTEX(callchain_mutex);
|
||||
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
|
||||
if (!entries)
|
||||
return -ENOMEM;
|
||||
|
||||
size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
|
||||
size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
|
||||
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
|
||||
|
||||
cpu = smp_processor_id();
|
||||
|
||||
return &entries->cpu_entries[cpu][*rctx];
|
||||
return (((void *)entries->cpu_entries[cpu]) +
|
||||
(*rctx * perf_callchain_entry__sizeof()));
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -215,3 +224,25 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
int perf_event_max_stack_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int new_value = sysctl_perf_event_max_stack, ret;
|
||||
struct ctl_table new_table = *table;
|
||||
|
||||
new_table.data = &new_value;
|
||||
ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
|
||||
if (ret || !write)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&callchain_mutex);
|
||||
if (atomic_read(&nr_callchain_events))
|
||||
ret = -EBUSY;
|
||||
else
|
||||
sysctl_perf_event_max_stack = new_value;
|
||||
|
||||
mutex_unlock(&callchain_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -11,13 +11,13 @@
|
||||
struct ring_buffer {
|
||||
atomic_t refcount;
|
||||
struct rcu_head rcu_head;
|
||||
struct irq_work irq_work;
|
||||
#ifdef CONFIG_PERF_USE_VMALLOC
|
||||
struct work_struct work;
|
||||
int page_order; /* allocation order */
|
||||
#endif
|
||||
int nr_pages; /* nr of data pages */
|
||||
int overwrite; /* can overwrite itself */
|
||||
int paused; /* can write into ring buffer */
|
||||
|
||||
atomic_t poll; /* POLL_ for wakeups */
|
||||
|
||||
@@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
|
||||
rb_free(rb);
|
||||
}
|
||||
|
||||
static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
|
||||
{
|
||||
if (!pause && rb->nr_pages)
|
||||
rb->paused = 0;
|
||||
else
|
||||
rb->paused = 1;
|
||||
}
|
||||
|
||||
extern struct ring_buffer *
|
||||
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
|
||||
extern void perf_event_wakeup(struct perf_event *event);
|
||||
|
||||
@@ -102,8 +102,21 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
int perf_output_begin(struct perf_output_handle *handle,
|
||||
struct perf_event *event, unsigned int size)
|
||||
static bool __always_inline
|
||||
ring_buffer_has_space(unsigned long head, unsigned long tail,
|
||||
unsigned long data_size, unsigned int size,
|
||||
bool backward)
|
||||
{
|
||||
if (!backward)
|
||||
return CIRC_SPACE(head, tail, data_size) >= size;
|
||||
else
|
||||
return CIRC_SPACE(tail, head, data_size) >= size;
|
||||
}
|
||||
|
||||
static int __always_inline
|
||||
__perf_output_begin(struct perf_output_handle *handle,
|
||||
struct perf_event *event, unsigned int size,
|
||||
bool backward)
|
||||
{
|
||||
struct ring_buffer *rb;
|
||||
unsigned long tail, offset, head;
|
||||
@@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle,
|
||||
if (unlikely(!rb))
|
||||
goto out;
|
||||
|
||||
if (unlikely(!rb->nr_pages))
|
||||
if (unlikely(rb->paused)) {
|
||||
if (rb->nr_pages)
|
||||
local_inc(&rb->lost);
|
||||
goto out;
|
||||
}
|
||||
|
||||
handle->rb = rb;
|
||||
handle->event = event;
|
||||
@@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle,
|
||||
do {
|
||||
tail = READ_ONCE(rb->user_page->data_tail);
|
||||
offset = head = local_read(&rb->head);
|
||||
if (!rb->overwrite &&
|
||||
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
|
||||
goto fail;
|
||||
if (!rb->overwrite) {
|
||||
if (unlikely(!ring_buffer_has_space(head, tail,
|
||||
perf_data_size(rb),
|
||||
size, backward)))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/*
|
||||
* The above forms a control dependency barrier separating the
|
||||
@@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle,
|
||||
* See perf_output_put_handle().
|
||||
*/
|
||||
|
||||
head += size;
|
||||
if (!backward)
|
||||
head += size;
|
||||
else
|
||||
head -= size;
|
||||
} while (local_cmpxchg(&rb->head, offset, head) != offset);
|
||||
|
||||
if (backward) {
|
||||
offset = head;
|
||||
head = (u64)(-head);
|
||||
}
|
||||
|
||||
/*
|
||||
* We rely on the implied barrier() by local_cmpxchg() to ensure
|
||||
* none of the data stores below can be lifted up by the compiler.
|
||||
@@ -203,6 +230,26 @@ int perf_output_begin(struct perf_output_handle *handle,
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
int perf_output_begin_forward(struct perf_output_handle *handle,
|
||||
struct perf_event *event, unsigned int size)
|
||||
{
|
||||
return __perf_output_begin(handle, event, size, false);
|
||||
}
|
||||
|
||||
int perf_output_begin_backward(struct perf_output_handle *handle,
|
||||
struct perf_event *event, unsigned int size)
|
||||
{
|
||||
return __perf_output_begin(handle, event, size, true);
|
||||
}
|
||||
|
||||
int perf_output_begin(struct perf_output_handle *handle,
|
||||
struct perf_event *event, unsigned int size)
|
||||
{
|
||||
|
||||
return __perf_output_begin(handle, event, size,
|
||||
unlikely(is_write_backward(event)));
|
||||
}
|
||||
|
||||
unsigned int perf_output_copy(struct perf_output_handle *handle,
|
||||
const void *buf, unsigned int len)
|
||||
{
|
||||
@@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void rb_irq_work(struct irq_work *work);
|
||||
|
||||
static void
|
||||
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
|
||||
{
|
||||
@@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
|
||||
|
||||
INIT_LIST_HEAD(&rb->event_list);
|
||||
spin_lock_init(&rb->event_lock);
|
||||
init_irq_work(&rb->irq_work, rb_irq_work);
|
||||
}
|
||||
|
||||
static void ring_buffer_put_async(struct ring_buffer *rb)
|
||||
{
|
||||
if (!atomic_dec_and_test(&rb->refcount))
|
||||
return;
|
||||
|
||||
rb->rcu_head.next = (void *)rb;
|
||||
irq_work_queue(&rb->irq_work);
|
||||
/*
|
||||
* perf_output_begin() only checks rb->paused, therefore
|
||||
* rb->paused must be true if we have no pages for output.
|
||||
*/
|
||||
if (!rb->nr_pages)
|
||||
rb->paused = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb)
|
||||
* The ordering is similar to that of perf_output_{begin,end}, with
|
||||
* the exception of (B), which should be taken care of by the pmu
|
||||
* driver, since ordering rules will differ depending on hardware.
|
||||
*
|
||||
* Call this from pmu::start(); see the comment in perf_aux_output_end()
|
||||
* about its use in pmu callbacks. Both can also be called from the PMI
|
||||
* handler if needed.
|
||||
*/
|
||||
void *perf_aux_output_begin(struct perf_output_handle *handle,
|
||||
struct perf_event *event)
|
||||
@@ -287,6 +333,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
|
||||
if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
|
||||
* the aux buffer is in perf_mmap_close(), about to get freed.
|
||||
*/
|
||||
if (!atomic_read(&rb->aux_mmap_count))
|
||||
goto err_put;
|
||||
|
||||
/*
|
||||
* Nesting is not supported for AUX area, make sure nested
|
||||
* writers are caught early
|
||||
@@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
|
||||
return handle->rb->aux_priv;
|
||||
|
||||
err_put:
|
||||
/* can't be last */
|
||||
rb_free_aux(rb);
|
||||
|
||||
err:
|
||||
ring_buffer_put_async(rb);
|
||||
ring_buffer_put(rb);
|
||||
handle->event = NULL;
|
||||
|
||||
return NULL;
|
||||
@@ -342,6 +396,10 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
|
||||
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
|
||||
* pmu driver's responsibility to observe ordering rules of the hardware,
|
||||
* so that all the data is externally visible before this is called.
|
||||
*
|
||||
* Note: this has to be called from pmu::stop() callback, as the assumption
|
||||
* of the AUX buffer management code is that after pmu::stop(), the AUX
|
||||
* transaction must be stopped and therefore drop the AUX reference count.
|
||||
*/
|
||||
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
|
||||
bool truncated)
|
||||
@@ -389,8 +447,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
|
||||
handle->event = NULL;
|
||||
|
||||
local_set(&rb->aux_nest, 0);
|
||||
/* can't be last */
|
||||
rb_free_aux(rb);
|
||||
ring_buffer_put_async(rb);
|
||||
ring_buffer_put(rb);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -471,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb)
|
||||
{
|
||||
int pg;
|
||||
|
||||
/*
|
||||
* Should never happen, the last reference should be dropped from
|
||||
* perf_mmap_close() path, which first stops aux transactions (which
|
||||
* in turn are the atomic holders of aux_refcount) and then does the
|
||||
* last rb_free_aux().
|
||||
*/
|
||||
WARN_ON_ONCE(in_atomic());
|
||||
|
||||
if (rb->aux_priv) {
|
||||
rb->free_aux(rb->aux_priv);
|
||||
rb->free_aux = NULL;
|
||||
@@ -582,18 +649,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
|
||||
void rb_free_aux(struct ring_buffer *rb)
|
||||
{
|
||||
if (atomic_dec_and_test(&rb->aux_refcount))
|
||||
irq_work_queue(&rb->irq_work);
|
||||
}
|
||||
|
||||
static void rb_irq_work(struct irq_work *work)
|
||||
{
|
||||
struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
|
||||
|
||||
if (!atomic_read(&rb->aux_refcount))
|
||||
__rb_free_aux(rb);
|
||||
|
||||
if (rb->rcu_head.next == (void *)rb)
|
||||
call_rcu(&rb->rcu_head, rb_free_rcu);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_PERF_USE_VMALLOC
|
||||
|
||||
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
|
||||
#ifdef CONFIG_PRINTK
|
||||
static int ten_thousand = 10000;
|
||||
#endif
|
||||
#ifdef CONFIG_PERF_EVENTS
|
||||
static int six_hundred_forty_kb = 640 * 1024;
|
||||
#endif
|
||||
|
||||
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
|
||||
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
|
||||
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one_hundred,
|
||||
},
|
||||
{
|
||||
.procname = "perf_event_max_stack",
|
||||
.data = NULL, /* filled in by handler */
|
||||
.maxlen = sizeof(sysctl_perf_event_max_stack),
|
||||
.mode = 0644,
|
||||
.proc_handler = perf_event_max_stack_handler,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &six_hundred_forty_kb,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_KMEMCHECK
|
||||
{
|
||||
|
||||
@@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
|
||||
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
if (!is_sampling_event(p_event))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* We don't allow user space callchains for function trace
|
||||
* event, due to issues with page faults while tracing page
|
||||
|
||||
Reference in New Issue
Block a user