/* Copyright (c) 2015 Google Inc
* Davide Libenzi <dlibenzi@google.com>
* See LICENSE for details.
*
* This controls the emitting, collecting, and exporting of samples for perf
* events. Examples of events are PMU counter overflows, mmaps, and process
* creation.
*
* Events are collected in a central qio queue. High-frequency events (e.g.
 * IRQ backtraces) are collected in per-core buffers, which are flushed to the
* central queue when they fill up or on command. Lower-frequency events (e.g.
* profiler_notify_mmap()) just go straight to the central queue.
*
* Currently there is one global profiler. Kprof is careful to only have one
* open profiler at a time. See profiler.h for more info. The only sync we do
* in this file is for the functions that are not called while holding the kprof
* mutex - specifically the RCU-protected backtrace sampling code.
*
* A few other notes:
* - profiler_control_trace() controls the per-core trace collection. When it
* is disabled, it also flushes the per-core blocks to the central queue.
* - The collection of mmap and comm samples is independent of trace collection.
* Those will occur whenever the profiler is open, even if it is not started.
* - Looks like we don't bother with munmap records. Not sure if perf can
* handle it or not. */
#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <kmalloc.h>
#include <pmap.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"
#define PROFILER_MAX_PRG_PATH 256
#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)
/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
struct profiler_cpu_context {
	struct block *block;		/* partially filled block of trace records */
	int cpu;			/* core this context belongs to */
	bool tracing;			/* per-core copy of prof->tracing */
	size_t dropped_data_cnt;	/* blocks dropped because the queue was full */
};
/* These are a little hokey, and are currently global vars */
static int profiler_queue_limit = 64 * 1024 * 1024;
static size_t profiler_cpu_buffer_size = 65536;
struct profiler {
struct profiler_cpu_context *pcpu_ctx;
struct queue *qio;
bool tracing;
};
static struct profiler __rcu *gbl_prof;
static struct profiler_cpu_context *profiler_get_cpu_ctx(struct profiler *prof,
int cpu)
{
return prof->pcpu_ctx + cpu;
}
static inline char *vb_encode_uint64(char *data, uint64_t n)
{
	/* Classical variable-byte encoding: 7 bits per byte, low-order group
	 * first, with bit 7 of each byte acting as a continuation flag
	 * (clear on the final byte). */
for (; n >= 0x80; n >>= 7)
*data++ = (char) (n | 0x80);
*data++ = (char) n;
return data;
}
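/* Illustrative only: a minimal decoder for the encoding above, assuming the
 * same 7-bits-per-byte, low-order-first layout.  For example, 300 (0x12c)
 * encodes as 0xac 0x02.  The kernel never decodes these records; the consumer
 * of the stream would.  This helper is a sketch and is not referenced
 * anywhere else in this file. */
static inline const char *vb_decode_uint64(const char *data, uint64_t *val)
{
	uint64_t n = 0;
	int shift = 0;
	/* Bit 7 set means another byte follows; each byte carries 7 bits. */
	while (*data & 0x80) {
		n |= (uint64_t) (*data++ & 0x7f) << shift;
		shift += 7;
	}
	n |= (uint64_t) (*data++ & 0x7f) << shift;
	*val = n;
	return data;
}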
static struct block *profiler_buffer_write(struct profiler *prof,
struct profiler_cpu_context *cpu_buf,
struct block *b)
{
/* qpass will drop b if the queue is over its limit. we're willing to
* lose traces, but we won't lose 'control' events, such as MMAP and
* PID. */
if (b) {
if (qpass(prof->qio, b) < 0)
cpu_buf->dropped_data_cnt++;
}
return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}
/* Helper, paired with profiler_cpu_buffer_write_commit. Ensures there is
* enough room in the pcpu block for our write. May alloc a new one.
*
* IRQs must be disabled before calling, until after write_commit. */
static char *profiler_cpu_buffer_write_reserve(struct profiler *prof,
struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
struct block *b = cpu_buf->block;
if (unlikely((!b) || (b->lim - b->wp) < size)) {
cpu_buf->block = b = profiler_buffer_write(prof, cpu_buf, b);
if (unlikely(!b))
return NULL;
}
*pb = b;
return (char *) b->wp;
}
/* Helper, paired with write_reserve. Finalizes the writing into the block's
* main body of @size bytes. IRQs must be disabled until after this is called.
*/
static inline void profiler_cpu_buffer_write_commit(
struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
b->wp += size;
}
static inline size_t profiler_max_envelope_size(void)
{
return 2 * VBE_MAX_SIZE(uint64_t);
}
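/* Every record placed on the qio queue is framed the same way:
 *
 *     [varint type] [varint size] [payload of 'size' bytes]
 *
 * 'size' counts only the payload (the fixed proftype_* struct plus any
 * trailing trace array); the two varints are the envelope, at most
 * VBE_MAX_SIZE(uint64_t) bytes each.  Writers reserve the payload size plus
 * profiler_max_envelope_size() so the worst-case framing always fits. */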
static void profiler_push_kernel_trace64(struct profiler *prof,
struct profiler_cpu_context *cpu_buf,
const uintptr_t *trace, size_t count,
uint64_t info)
{
struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
size_t size = sizeof(struct proftype_kern_trace64) +
count * sizeof(uint64_t);
struct block *b;
void *resptr, *ptr;
assert(!irq_is_enabled());
resptr = profiler_cpu_buffer_write_reserve(prof,
cpu_buf, size + profiler_max_envelope_size(), &b);
ptr = resptr;
if (likely(ptr)) {
struct proftype_kern_trace64 *record;
ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
ptr = vb_encode_uint64(ptr, size);
record = (struct proftype_kern_trace64 *) ptr;
ptr += size;
record->info = info;
record->tstamp = nsec();
if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
record->pid = -1;
else
record->pid = pcpui->cur_proc->pid;
record->cpu = cpu_buf->cpu;
record->num_traces = count;
for (size_t i = 0; i < count; i++)
record->trace[i] = (uint64_t) trace[i];
profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
}
}
static void profiler_push_user_trace64(struct profiler *prof,
struct profiler_cpu_context *cpu_buf,
struct proc *p, const uintptr_t *trace,
size_t count, uint64_t info)
{
size_t size = sizeof(struct proftype_user_trace64) +
count * sizeof(uint64_t);
struct block *b;
void *resptr, *ptr;
assert(!irq_is_enabled());
resptr = profiler_cpu_buffer_write_reserve(prof,
cpu_buf, size + profiler_max_envelope_size(), &b);
ptr = resptr;
if (likely(ptr)) {
struct proftype_user_trace64 *record;
ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
ptr = vb_encode_uint64(ptr, size);
record = (struct proftype_user_trace64 *) ptr;
ptr += size;
record->info = info;
record->tstamp = nsec();
record->pid = p->pid;
record->cpu = cpu_buf->cpu;
record->num_traces = count;
for (size_t i = 0; i < count; i++)
record->trace[i] = (uint64_t) trace[i];
profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
}
}
static void profiler_push_pid_mmap(struct profiler *prof, struct proc *p,
uintptr_t addr, size_t msize, size_t offset,
const char *path)
{
size_t plen = strlen(path) + 1;
size_t size = sizeof(struct proftype_pid_mmap64) + plen;
void *resptr = kmalloc(size + profiler_max_envelope_size(), MEM_ATOMIC);
if (likely(resptr)) {
void *ptr = resptr;
struct proftype_pid_mmap64 *record;
ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
ptr = vb_encode_uint64(ptr, size);
record = (struct proftype_pid_mmap64 *) ptr;
ptr += size;
record->tstamp = nsec();
record->pid = p->pid;
record->addr = addr;
record->size = msize;
record->offset = offset;
memcpy(record->path, path, plen);
qiwrite(prof->qio, resptr, (int) (ptr - resptr));
kfree(resptr);
}
}
static void profiler_push_new_process(struct profiler *prof, struct proc *p)
{
size_t plen = strlen(p->binary_path) + 1;
size_t size = sizeof(struct proftype_new_process) + plen;
void *resptr = kmalloc(size + profiler_max_envelope_size(), MEM_ATOMIC);
if (likely(resptr)) {
void *ptr = resptr;
struct proftype_new_process *record;
ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
ptr = vb_encode_uint64(ptr, size);
record = (struct proftype_new_process *) ptr;
ptr += size;
record->tstamp = nsec();
record->pid = p->pid;
memcpy(record->path, p->binary_path, plen);
qiwrite(prof->qio, resptr, (int) (ptr - resptr));
kfree(resptr);
}
}
static void profiler_emit_current_system_status(void)
{
void enum_proc(struct vm_region *vmr, void *opaque)
{
struct proc *p = (struct proc *) opaque;
profiler_notify_mmap(p, vmr->vm_base,
vmr->vm_end - vmr->vm_base,
vmr->vm_prot, vmr->vm_flags, vmr->__vm_foc,
vmr->vm_foff);
}
struct process_set pset;
proc_get_set(&pset);
for (size_t i = 0; i < pset.num_processes; i++) {
profiler_notify_new_process(pset.procs[i]);
enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
}
proc_free_set(&pset);
}
static long profiler_get_checked_value(const char *value, long k, long minval,
long maxval)
{
long lvalue = strtol(value, NULL, 0) * k;
if (lvalue < minval)
error(EFAIL, "Value should be greater than %ld", minval);
if (lvalue > maxval)
error(EFAIL, "Value should be lower than %ld", maxval);
return lvalue;
}
/* TODO: This configure stuff is a little hokey. You have to configure before
* it's been opened, meaning before you have the kprofctlqid, but you can't
* configure until you have the chan. To use this, you'd need to open, then
* config, then close, then hope that the global settings stick around, then
* open and run it.
*
* Also note that no one uses this. */
int profiler_configure(struct cmdbuf *cb)
{
if (!strcmp(cb->f[0], "prof_qlimit")) {
if (cb->nf < 2)
error(EFAIL, "prof_qlimit KB");
/* If the profiler is already running, this won't take effect
* until the next open. Feel free to change this. */
WRITE_ONCE(profiler_queue_limit,
profiler_get_checked_value(cb->f[1], 1024,
1024 * 1024,
max_pmem / 32));
return 1;
}
if (!strcmp(cb->f[0], "prof_cpubufsz")) {
if (cb->nf < 2)
error(EFAIL, "prof_cpubufsz KB");
WRITE_ONCE(profiler_cpu_buffer_size,
profiler_get_checked_value(cb->f[1], 1024,
16 * 1024,
1024 * 1024));
return 1;
}
return 0;
}
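/* Usage sketch (hedged): these commands would be written, space-separated, to
 * kprof's control file, with values in KB.  The mount path shown here is
 * illustrative and depends on where the kprof device is bound:
 *
 *     echo prof_qlimit 65536 > /prof/kpctl      (64 MB central queue limit)
 *     echo prof_cpubufsz 64 > /prof/kpctl       (64 KB per-core buffers)
 */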
void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
const char * const cmds[] = {
"prof_qlimit",
"prof_cpubufsz",
};
for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
strlcat(msgbuf, "|", buflen);
strlcat(msgbuf, cmds[i], buflen);
}
}
int profiler_setup(void)
{
struct profiler *prof;
assert(!rcu_dereference_check(gbl_prof, true));
prof = kzmalloc(sizeof(struct profiler), MEM_WAIT);
/* It is very important that we enqueue and dequeue entire records at
* once. If we leave partial records, the entire stream will be
* corrupt. Our reader does its best to make sure it has room for
* complete records (checks qlen()).
*
* If we ever get corrupt streams, try making this a Qmsg. Though it
* doesn't help every situation - we have issues with writes greater
* than Maxatomic regardless. */
prof->qio = qopen(profiler_queue_limit, 0, NULL, NULL);
if (!prof->qio) {
kfree(prof);
return -1;
}
prof->pcpu_ctx = kzmalloc(sizeof(struct profiler_cpu_context)
* num_cores, MEM_WAIT);
for (int i = 0; i < num_cores; i++) {
struct profiler_cpu_context *b = &prof->pcpu_ctx[i];
b->cpu = i;
}
rcu_assign_pointer(gbl_prof, prof);
profiler_emit_current_system_status();
return 0;
}
void profiler_cleanup(void)
{
struct profiler *prof = rcu_dereference_protected(gbl_prof, true);
RCU_INIT_POINTER(gbl_prof, NULL);
synchronize_rcu();
kfree(prof->pcpu_ctx);
qfree(prof->qio);
kfree(prof);
}
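/* Typical lifecycle, as driven by kprof (a sketch; the ctl command names that
 * trigger each call live in kprof.c and are assumptions here):
 *
 *     profiler_setup();             on first open of the ctl file
 *     profiler_start();             "start": enable per-core tracing
 *     ... cores emit samples via profiler_push_*_backtrace() ...
 *     profiler_stop();              "stop": disable tracing, hang up the queue
 *     profiler_trace_data_flush();  push any remaining per-core blocks
 *     profiler_read()/profiler_size() drain the central queue
 *     profiler_cleanup();           tear down once the ctl FD is released
 */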
static void profiler_cpu_flush(struct profiler *prof,
struct profiler_cpu_context *cpu_buf)
{
int8_t irq_state = 0;
disable_irqsave(&irq_state);
if (cpu_buf->block) {
qibwrite(prof->qio, cpu_buf->block);
cpu_buf->block = NULL;
}
enable_irqsave(&irq_state);
}
static void __profiler_core_trace_enable(void *opaque)
{
struct profiler *prof = opaque;
struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(prof,
core_id());
cpu_buf->tracing = prof->tracing;
if (!cpu_buf->tracing)
profiler_cpu_flush(prof, cpu_buf);
}
static void profiler_control_trace(struct profiler *prof, int onoff)
{
struct core_set cset;
assert(prof);
core_set_init(&cset);
core_set_fill_available(&cset);
prof->tracing = onoff;
/* Note this blocks until all cores have run the function. */
smp_do_in_cores(&cset, __profiler_core_trace_enable, prof);
}
/* This must only be called by the Kprofctlqid FD holder, ensuring that the
* profiler exists. Not thread-safe. */
void profiler_start(void)
{
struct profiler *prof = rcu_dereference_protected(gbl_prof, true);
profiler_control_trace(prof, 1);
qreopen(prof->qio);
}
/* This must only be called by the Kprofctlqid FD holder, ensuring that the
* profiler exists. Not thread-safe. */
void profiler_stop(void)
{
struct profiler *prof = rcu_dereference_protected(gbl_prof, true);
profiler_control_trace(prof, 0);
qhangup(prof->qio, 0);
}
static void __profiler_core_flush(void *opaque)
{
struct profiler *prof = opaque;
struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(prof,
core_id());
profiler_cpu_flush(prof, cpu_buf);
}
/* This must only be called by the Kprofctlqid FD holder, ensuring that the
* profiler exists. */
void profiler_trace_data_flush(void)
{
	struct profiler *prof = rcu_dereference_protected(gbl_prof, true);
	struct core_set cset;
	core_set_init(&cset);
	core_set_fill_available(&cset);
	/* __profiler_core_flush() expects the profiler as its opaque arg. */
	smp_do_in_cores(&cset, __profiler_core_flush, prof);
}
void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
uint64_t info)
{
struct profiler *prof;
rcu_read_lock();
prof = rcu_dereference(gbl_prof);
if (prof) {
struct profiler_cpu_context *cpu_buf =
profiler_get_cpu_ctx(prof, core_id());
if (cpu_buf->tracing)
profiler_push_kernel_trace64(prof, cpu_buf, pc_list,
nr_pcs, info);
}
rcu_read_unlock();
}
void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
uint64_t info)
{
struct profiler *prof;
rcu_read_lock();
prof = rcu_dereference(gbl_prof);
if (prof) {
struct profiler_cpu_context *cpu_buf =
profiler_get_cpu_ctx(prof, core_id());
if (cpu_buf->tracing)
profiler_push_user_trace64(prof, cpu_buf, current,
pc_list, nr_pcs, info);
}
rcu_read_unlock();
}
size_t profiler_size(void)
{
struct profiler *prof;
size_t ret;
rcu_read_lock();
prof = rcu_dereference(gbl_prof);
ret = prof ? qlen(prof->qio) : 0;
rcu_read_unlock();
return ret;
}
size_t profiler_read(void *va, size_t n)
{
struct profiler *prof;
size_t ret;
rcu_read_lock();
prof = rcu_dereference(gbl_prof);
ret = prof ? qread(prof->qio, va, n) : 0;
rcu_read_unlock();
return ret;
}
void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
int flags, struct file_or_chan *foc, size_t offset)
{
struct profiler *prof;
rcu_read_lock();
prof = rcu_dereference(gbl_prof);
if (prof && foc && (prot & PROT_EXEC))
profiler_push_pid_mmap(prof, p, addr, size, offset,
foc_abs_path(foc));
rcu_read_unlock();
}
void profiler_notify_new_process(struct proc *p)
{
struct profiler *prof;
rcu_read_lock();
prof = rcu_dereference(gbl_prof);
if (prof && p->binary_path)
profiler_push_new_process(prof, p);
rcu_read_unlock();
}