Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf updates from Ingo Molnar:
 "Bigger kernel side changes:

   - Add backwards writing capability to the perf ring-buffer code,
     which is preparation for future advanced features like robust
     'overwrite support' and snapshot mode.  (Wang Nan)

   - Add pause and resume ioctls for the perf ringbuffer (Wang Nan)

   - x86 Intel cstate code cleanups and reorgnization (Thomas Gleixner)

   - x86 Intel uncore and CPU PMU driver updates (Kan Liang, Peter
     Zijlstra)

   - x86 AUX (Intel PT) related enhancements and updates (Alexander
     Shishkin)

   - x86 MSR PMU driver enhancements and updates (Huang Rui)

   - ... and lots of other changes spread out over 40+ commits.

  Biggest tooling side changes:

   - 'perf trace' features and enhancements.  (Arnaldo Carvalho de Melo)

   - BPF tooling updates (Wang Nan)

   - 'perf sched' updates (Jiri Olsa)

   - 'perf probe' updates (Masami Hiramatsu)

   - ... plus 200+ other enhancements, fixes and cleanups to tools/

  The merge commits, the shortlog and the changelogs contain a lot more
  details"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (249 commits)
  perf/core: Disable the event on a truncated AUX record
  perf/x86/intel/pt: Generate PMI in the STOP region as well
  perf buildid-cache: Use lsdir() for looking up buildid caches
  perf symbols: Use lsdir() for the search in kcore cache directory
  perf tools: Use SBUILD_ID_SIZE where applicable
  perf tools: Fix lsdir to set errno correctly
  perf trace: Move seccomp args beautifiers to tools/perf/trace/beauty/
  perf trace: Move flock op beautifier to tools/perf/trace/beauty/
  perf build: Add build-test for debug-frame on arm/arm64
  perf build: Add build-test for libunwind cross-platforms support
  perf script: Fix export of callchains with recursion in db-export
  perf script: Fix callchain addresses in db-export
  perf script: Fix symbol insertion behavior in db-export
  perf symbols: Add dso__insert_symbol function
  perf scripting python: Use Py_FatalError instead of die()
  perf tools: Remove xrealloc and ALLOC_GROW
  perf help: Do not use ALLOC_GROW in add_cmd_list
  perf pmu: Make pmu_formats_string to check return value of strbuf
  perf header: Make topology checkers to check return value of strbuf
  perf tools: Make alias handler to check return value of strbuf
  ...
This commit is contained in:
Linus Torvalds
2016-05-16 14:08:43 -07:00
217 changed files with 8344 additions and 2747 deletions

View File

@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
value_size / 8 > PERF_MAX_STACK_DEPTH)
value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
* and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
* and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;

View File

@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
static inline size_t perf_callchain_entry__sizeof(void)
{
return (sizeof(struct perf_callchain_entry) +
sizeof(__u64) * sysctl_perf_event_max_stack);
}
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
return &entries->cpu_entries[cpu][*rctx];
return (((void *)entries->cpu_entries[cpu]) +
(*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -215,3 +224,25 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
return entry;
}
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int new_value = sysctl_perf_event_max_stack, ret;
struct ctl_table new_table = *table;
new_table.data = &new_value;
ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
mutex_lock(&callchain_mutex);
if (atomic_read(&nr_callchain_events))
ret = -EBUSY;
else
sysctl_perf_event_max_stack = new_value;
mutex_unlock(&callchain_mutex);
return ret;
}

File diff suppressed because it is too large Load Diff

View File

@@ -11,13 +11,13 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
int paused; /* can write into ring buffer */
atomic_t poll; /* POLL_ for wakeups */
@@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
rb_free(rb);
}
static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
{
if (!pause && rb->nr_pages)
rb->paused = 0;
else
rb->paused = 1;
}
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);

View File

@@ -102,8 +102,21 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
preempt_enable();
}
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
static bool __always_inline
ring_buffer_has_space(unsigned long head, unsigned long tail,
unsigned long data_size, unsigned int size,
bool backward)
{
if (!backward)
return CIRC_SPACE(head, tail, data_size) >= size;
else
return CIRC_SPACE(tail, head, data_size) >= size;
}
static int __always_inline
__perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
bool backward)
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
@@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle,
if (unlikely(!rb))
goto out;
if (unlikely(!rb->nr_pages))
if (unlikely(rb->paused)) {
if (rb->nr_pages)
local_inc(&rb->lost);
goto out;
}
handle->rb = rb;
handle->event = event;
@@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle,
do {
tail = READ_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
size, backward)))
goto fail;
}
/*
* The above forms a control dependency barrier separating the
@@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle,
* See perf_output_put_handle().
*/
head += size;
if (!backward)
head += size;
else
head -= size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
if (backward) {
offset = head;
head = (u64)(-head);
}
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
@@ -203,6 +230,26 @@ int perf_output_begin(struct perf_output_handle *handle,
return -ENOSPC;
}
int perf_output_begin_forward(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, event, size, false);
}
int perf_output_begin_backward(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, event, size, true);
}
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, event, size,
unlikely(is_write_backward(event)));
}
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
@@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
static void rb_irq_work(struct irq_work *work);
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
init_irq_work(&rb->irq_work, rb_irq_work);
}
static void ring_buffer_put_async(struct ring_buffer *rb)
{
if (!atomic_dec_and_test(&rb->refcount))
return;
rb->rcu_head.next = (void *)rb;
irq_work_queue(&rb->irq_work);
/*
* perf_output_begin() only checks rb->paused, therefore
* rb->paused must be true if we have no pages for output.
*/
if (!rb->nr_pages)
rb->paused = 1;
}
/*
@@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb)
* The ordering is similar to that of perf_output_{begin,end}, with
* the exception of (B), which should be taken care of by the pmu
* driver, since ordering rules will differ depending on hardware.
*
* Call this from pmu::start(); see the comment in perf_aux_output_end()
* about its use in pmu callbacks. Both can also be called from the PMI
* handler if needed.
*/
void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event)
@@ -287,6 +333,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
goto err;
/*
* If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
* the aux buffer is in perf_mmap_close(), about to get freed.
*/
if (!atomic_read(&rb->aux_mmap_count))
goto err_put;
/*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
@@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
return handle->rb->aux_priv;
err_put:
/* can't be last */
rb_free_aux(rb);
err:
ring_buffer_put_async(rb);
ring_buffer_put(rb);
handle->event = NULL;
return NULL;
@@ -342,6 +396,10 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
* pmu driver's responsibility to observe ordering rules of the hardware,
* so that all the data is externally visible before this is called.
*
* Note: this has to be called from pmu::stop() callback, as the assumption
* of the AUX buffer management code is that after pmu::stop(), the AUX
* transaction must be stopped and therefore drop the AUX reference count.
*/
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
bool truncated)
@@ -389,8 +447,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
handle->event = NULL;
local_set(&rb->aux_nest, 0);
/* can't be last */
rb_free_aux(rb);
ring_buffer_put_async(rb);
ring_buffer_put(rb);
}
/*
@@ -471,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb)
{
int pg;
/*
* Should never happen, the last reference should be dropped from
* perf_mmap_close() path, which first stops aux transactions (which
* in turn are the atomic holders of aux_refcount) and then does the
* last rb_free_aux().
*/
WARN_ON_ONCE(in_atomic());
if (rb->aux_priv) {
rb->free_aux(rb->aux_priv);
rb->free_aux = NULL;
@@ -582,18 +649,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
irq_work_queue(&rb->irq_work);
}
static void rb_irq_work(struct irq_work *work)
{
struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
if (!atomic_read(&rb->aux_refcount))
__rb_free_aux(rb);
if (rb->rcu_head.next == (void *)rb)
call_rcu(&rb->rcu_head, rb_free_rcu);
}
#ifndef CONFIG_PERF_USE_VMALLOC

View File

@@ -130,6 +130,9 @@ static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
#ifdef CONFIG_PERF_EVENTS
static int six_hundred_forty_kb = 640 * 1024;
#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
{
.procname = "perf_event_max_stack",
.data = NULL, /* filled in by handler */
.maxlen = sizeof(sysctl_perf_event_max_stack),
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
.extra1 = &zero,
.extra2 = &six_hundred_forty_kb,
},
#endif
#ifdef CONFIG_KMEMCHECK
{

View File

@@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (!is_sampling_event(p_event))
return 0;
/*
* We don't allow user space callchains for function trace
* event, due to issues with page faults while tracing page