/**
* @file cpu_buffer.c
*
* @remark Copyright 2002-2009 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
* @author Barry Kasindorf <barry.kasindorf@amd.com>
* @author Robert Richter <robert.richter@amd.com>
*
* Each CPU has a local buffer that stores PC value/event
* pairs. We also log context switches when we notice them.
* Eventually each CPU's buffer is processed into the global
* event buffer by sync_buffer().
*
* We use a local buffer for two reasons: an NMI or similar
* interrupt cannot synchronise, and high sampling rates
* would lead to catastrophic global synchronisation if
* a global buffer was used.
*/
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"
#define OP_BUFFER_FLAGS 0
/* we allocate an array of these and set the pointer in pcpui */
struct oprofile_cpu_buffer *op_cpu_buffer;
/* this one queue is used by #K to get all events. */
static struct queue *opq;
/* this is run from core 0 for all cpu buffers. */
static void wq_sync_buffer(void);
unsigned long oprofile_cpu_buffer_size = 65536;
unsigned long oprofile_backtrace_depth = 16;
#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;
/*
* Resets the cpu buffer to a sane state.
*
* Reset these to invalid values; the next sample collected will
* populate the buffer with proper values to initialize it.
*/
static inline void op_cpu_buffer_reset(int cpu)
{
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[cpu];
cpu_buf->last_is_kernel = -1;
cpu_buf->last_proc = NULL;
//print_func_exit();
}
/* returns the remaining free size of data in the entry */
static inline
int op_cpu_buffer_add_data(struct op_entry *entry, unsigned long val)
{
//print_func_entry();
assert(entry->size >= 0);
if (!entry->size) {
//print_func_exit();
return 0;
}
*entry->data = val;
entry->size--;
entry->data++;
//print_func_exit();
return entry->size;
}
/* returns the size of data in the entry */
static inline int op_cpu_buffer_get_size(struct op_entry *entry)
{
//print_func_entry();
//print_func_exit();
return entry->size;
}
/* returns 0 if empty or the size of data including the current value */
static inline
int op_cpu_buffer_get_data(struct op_entry *entry, unsigned long *val)
{
//print_func_entry();
int size = entry->size;
if (!size) {
//print_func_exit();
return 0;
}
*val = *entry->data;
entry->size--;
entry->data++;
//print_func_exit();
return size;
}
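/*
* Typical use of the entry helpers above (a sketch, not a spec): a producer
* sizes an entry with op_cpu_buffer_write_reserve() below, then calls
* op_cpu_buffer_add_data() once per word until it returns 0 (entry full);
* a consumer walks the same words back out with op_cpu_buffer_get_data(),
* which returns 0 once the entry is drained.
*/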
unsigned long oprofile_get_cpu_buffer_size(void)
{
//print_func_entry();
//print_func_exit();
return oprofile_cpu_buffer_size;
}
void oprofile_cpu_buffer_inc_smpl_lost(void)
{
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
cpu_buf->sample_lost_overflow++;
//print_func_exit();
}
void free_cpu_buffers(void)
{
//print_func_entry();
kfree(op_cpu_buffer);
/* clear the pointer so a later alloc_cpu_buffers() reallocates it */
op_cpu_buffer = NULL;
/* we can just leave the queue set up; it will then always return EOF */
//print_func_exit();
}
#define RB_EVENT_HDR_SIZE 4
int alloc_cpu_buffers(void)
{
//print_func_entry();
/* should probably start using waserror() here. The fail stuff just gets
* ugly.
*/
int i;
unsigned long buffer_size = oprofile_cpu_buffer_size;
unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
RB_EVENT_HDR_SIZE);
/* this can get called lots of times. Things might have been freed.
* So be careful.
*/
/* what limit? No idea. */
if (!opq)
opq = qopen(1024, 0, NULL, NULL);
if (!opq)
goto fail;
/* we *really* don't want to block. Losing data is better. */
qnoblock(opq, 1);
if (!op_cpu_buffer) {
op_cpu_buffer =
kzmalloc(sizeof(*op_cpu_buffer) * num_cpus, KMALLOC_WAIT);
if (!op_cpu_buffer)
goto fail;
for (i = 0; i < num_cpus; i++) {
struct oprofile_cpu_buffer *b = &op_cpu_buffer[i];
b->last_proc = NULL;
b->last_is_kernel = -1;
b->tracing = 0;
b->buffer_size = buffer_size;
b->sample_received = 0;
b->sample_lost_overflow = 0;
b->backtrace_aborted = 0;
b->sample_invalid_eip = 0;
b->cpu = i;
b->fullqueue = qopen(1024, Qmsg, NULL, NULL);
b->emptyqueue = qopen(1024, Qmsg, NULL, NULL);
spinlock_init_irqsave(&b->lock);
}
}
//print_func_exit();
return 0;
fail:
free_cpu_buffers();
//print_func_exit();
return -ENOMEM;
}
void start_cpu_work(void)
{
//print_func_entry();
int i;
work_enabled = 1;
/* task starts here.
schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
*/
//print_func_exit();
}
void end_cpu_work(void)
{
//print_func_entry();
work_enabled = 0;
//print_func_exit();
}
/* placeholder. Not used yet.
*/
void flush_cpu_work(void)
{
//print_func_entry();
int i;
struct oprofile_cpu_buffer *b = &op_cpu_buffer[core_id()];
//print_func_exit();
}
/* Not used since we're not doing per-cpu buffering yet.
*/
struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
//print_func_entry();
//print_func_exit();
return NULL;
}
static struct block *op_cpu_buffer_write_reserve(struct oprofile_cpu_buffer *cpu_buf,
struct op_entry *entry, int size)
{
//print_func_entry();
struct block *b;
int totalsize = sizeof(struct op_sample) +
size * sizeof(entry->sample->data[0]);
b = cpu_buf->block;
/* we might have run out of room; we need totalsize bytes, not just size. */
if (!b || (b->lim - b->wp) < totalsize) {
if (b) {
qibwrite(opq, b);
}
/* For now. Later, we will grab a block off the
* emptyblock queue.
*/
cpu_buf->block = b = iallocb(oprofile_cpu_buffer_size);
if (!b) {
printk("%s: fail\n", __func__);
//print_func_exit();
return NULL;
}
}
entry->sample = (void *)b->wp;
entry->size = size;
entry->data = entry->sample->data;
b->wp += totalsize;
//print_func_exit();
return b;
}
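/*
* Layout note (descriptive only): each reservation packs a struct op_sample
* header plus 'size' unsigned long data words contiguously at b->wp, so a
* block on opq holds a sequence of variable-length samples. When the current
* block cannot hold another reservation, it is pushed to opq and a fresh one
* is allocated.
*/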
static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
int is_kernel, struct proc *proc)
{
//print_func_entry();
struct block *b;
struct op_entry entry;
struct op_sample *sample;
unsigned long flags;
int size;
ERRSTACK(1);
flags = 0;
if (waserror()) {
poperror();
printk("%s: failed\n", __func__);
//print_func_exit();
return 1;
}
if (backtrace)
flags |= TRACE_BEGIN;
/* notice a switch from user->kernel or vice versa */
is_kernel = !!is_kernel;
if (cpu_buf->last_is_kernel != is_kernel) {
cpu_buf->last_is_kernel = is_kernel;
flags |= KERNEL_CTX_SWITCH;
if (is_kernel)
flags |= IS_KERNEL;
}
/* notice a proc switch */
if (cpu_buf->last_proc != proc) {
cpu_buf->last_proc = proc;
flags |= USER_CTX_SWITCH;
}
if (!flags) {
poperror();
/* nothing to do */
//print_func_exit();
return 0;
}
if (flags & USER_CTX_SWITCH)
size = 1;
else
size = 0;
b = op_cpu_buffer_write_reserve(cpu_buf, &entry, size);
if (!b) {
poperror();
return 1;
}
entry.sample->eip = ESCAPE_CODE;
entry.sample->event = flags;
if (size)
op_cpu_buffer_add_data(&entry, (unsigned long)proc);
poperror();
//print_func_exit();
return 0;
}
static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
unsigned long pc, unsigned long event)
{
//print_func_entry();
ERRSTACK(1);
struct op_entry entry;
struct op_sample *sample;
struct block *b;
if (waserror()) {
poperror();
printk("%s: failed\n", __func__);
//print_func_exit();
return 1;
}
b = op_cpu_buffer_write_reserve(cpu_buf, &entry, 0);
if (!b) {
poperror();
return 1;
}
sample = entry.sample;
sample->eip = pc;
sample->event = event;
poperror();
//print_func_exit();
return 0;
}
/*
* This must be safe from any context.
*
* is_kernel is needed because on some architectures you cannot
* tell if you are in kernel or user space simply by looking at
* pc. We tag this in the buffer by generating kernel enter/exit
* events whenever is_kernel changes
*/
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
unsigned long backtrace, int is_kernel, unsigned long event,
struct proc *proc)
{
//print_func_entry();
struct proc *tsk = proc ? proc : current;
cpu_buf->sample_received++;
if (pc == ESCAPE_CODE) {
cpu_buf->sample_invalid_eip++;
//print_func_exit();
return 0;
}
/* ah, so great. op_add* return 1 in event of failure.
* this function returns 0 in event of failure.
* what a cluster.
*/
spin_lock_irqsave(&cpu_buf->lock);
if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
goto fail;
if (op_add_sample(cpu_buf, pc, event))
goto fail;
spin_unlock_irqsave(&cpu_buf->lock);
//print_func_exit();
return 1;
fail:
spin_unlock_irqsave(&cpu_buf->lock);
cpu_buf->sample_lost_overflow++;
//print_func_exit();
return 0;
}
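/*
* For reference, a log_sample() that notices a user->kernel switch emits
* records like the following into the current block (a sketch; values are
* illustrative):
*
*	{ .eip = ESCAPE_CODE, .event = KERNEL_CTX_SWITCH | IS_KERNEL }
*	{ .eip = pc,          .event = event }
*
* A proc switch additionally sets USER_CTX_SWITCH and appends the new
* struct proc pointer as a single data word on the escape record.
*/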
static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
//print_func_entry();
cpu_buf->tracing = 1;
//print_func_exit();
}
static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
//print_func_entry();
cpu_buf->tracing = 0;
//print_func_exit();
}
void oprofile_cpubuf_flushone(int core, int newbuf)
{
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf;
cpu_buf = &op_cpu_buffer[core];
spin_lock_irqsave(&cpu_buf->lock);
if (cpu_buf->block) {
printk("Core %d has data\n", core);
qibwrite(opq, cpu_buf->block);
printk("After qibwrite in %s, opq len %d\n", __func__, qlen(opq));
}
if (newbuf)
cpu_buf->block = iallocb(oprofile_cpu_buffer_size);
else
cpu_buf->block = NULL;
spin_unlock_irqsave(&cpu_buf->lock);
//print_func_exit();
}
void oprofile_cpubuf_flushall(int alloc)
{
//print_func_entry();
int core;
for(core = 0; core < num_cpus; core++) {
oprofile_cpubuf_flushone(core, alloc);
}
//print_func_exit();
}
void oprofile_control_trace(int onoff)
{
//print_func_entry();
int core;
struct oprofile_cpu_buffer *cpu_buf;
for(core = 0; core < num_cpus; core++) {
cpu_buf = &op_cpu_buffer[core];
cpu_buf->tracing = onoff;
if (onoff) {
printk("Enable tracing on %d\n", core);
continue;
}
/* halting. Force out all buffers. */
oprofile_cpubuf_flushone(core, 0);
}
//print_func_exit();
}
static inline void
__oprofile_add_ext_sample(unsigned long pc,
void /*struct pt_regs */ *const regs,
unsigned long event, int is_kernel, struct proc *proc)
{
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
unsigned long backtrace = oprofile_backtrace_depth;
/*
* if log_sample() fails we can't backtrace since we lost the
* source of this event
*/
if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, proc)) {
/* failed */
//print_func_exit();
return;
}
if (!backtrace) {
//print_func_exit();
return;
}
#if 0
oprofile_begin_trace(cpu_buf);
oprofile_ops.backtrace(regs, backtrace);
oprofile_end_trace(cpu_buf);
#endif
//print_func_exit();
}
void oprofile_add_ext_hw_sample(unsigned long pc,
void /*struct pt_regs */ *const regs,
unsigned long event, int is_kernel,
struct proc *proc)
{
//print_func_entry();
__oprofile_add_ext_sample(pc, regs, event, is_kernel, proc);
//print_func_exit();
}
void oprofile_add_ext_sample(unsigned long pc,
void /*struct pt_regs */ *const regs,
unsigned long event, int is_kernel)
{
//print_func_entry();
__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
//print_func_exit();
}
void oprofile_add_sample(void /*struct pt_regs */ *const regs,
unsigned long event)
{
//print_func_entry();
int is_kernel;
unsigned long pc;
if (regs) {
is_kernel = 0; // FIXME!user_mode(regs);
pc = 0; // FIXME profile_pc(regs);
} else {
is_kernel = 0; /* This value will not be used */
pc = ESCAPE_CODE; /* as this causes an early return. */
}
__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
//print_func_exit();
}
/*
* Add samples with data to the ring buffer.
*
* Use oprofile_add_data(&entry, val) to add data and
* oprofile_write_commit(&entry) to commit the sample.
*/
void
oprofile_write_reserve(struct op_entry *entry,
void /*struct pt_regs */ *const regs,
unsigned long pc, int code, int size)
{
//print_func_entry();
ERRSTACK(1);
struct op_sample *sample;
struct block *b;
int is_kernel = 0; // FIXME!user_mode(regs);
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
if (waserror()) {
printk("%s: failed\n", __func__);
poperror();
goto fail;
}
cpu_buf->sample_received++;
/* no backtraces for samples with data */
if (op_add_code(cpu_buf, 0, is_kernel, current)) {
poperror();
goto fail;
}
b = op_cpu_buffer_write_reserve(cpu_buf, entry, size + 2);
if (!b) {
poperror();
goto fail;
}
entry->event = (void *)b;	/* non-NULL marks the entry usable by oprofile_add_data() */
sample = entry->sample;
sample->eip = ESCAPE_CODE;
sample->event = 0;	/* no flags */
op_cpu_buffer_add_data(entry, code);
op_cpu_buffer_add_data(entry, pc);
poperror();
//print_func_exit();
return;
fail:
entry->event = NULL;
cpu_buf->sample_lost_overflow++;
//print_func_exit();
}
int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
//print_func_entry();
if (!entry->event) {
//print_func_exit();
return 0;
}
//print_func_exit();
return op_cpu_buffer_add_data(entry, val);
}
int oprofile_add_data64(struct op_entry *entry, uint64_t val)
{
//print_func_entry();
if (!entry->event) {
//print_func_exit();
return 0;
}
if (op_cpu_buffer_get_size(entry) < 2)
/*
* the function returns 0 to indicate a too small
* buffer, even if there is some space left
*/
{
//print_func_exit();
return 0;
}
if (!op_cpu_buffer_add_data(entry, (uint32_t) val)) {
//print_func_exit();
return 0;
}
//print_func_exit();
return op_cpu_buffer_add_data(entry, (uint32_t) (val >> 32));
}
int oprofile_write_commit(struct op_entry *entry)
{
//print_func_entry();
/* not much to do at present. In future, we might write the Block
* to opq.
*/
//print_func_exit();
return 0;
}
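/*
* A minimal usage sketch of the reserve/add/commit API described above
* oprofile_write_reserve(); illustrative only, with caller-supplied
* regs, pc, code, and data words:
*/
#if 0
struct op_entry entry;
oprofile_write_reserve(&entry, regs, pc, code, 2 /* two data words */);
if (entry.event) {
oprofile_add_data(&entry, data0);
oprofile_add_data(&entry, data1);
oprofile_write_commit(&entry);
}
#endif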
void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
//print_func_exit();
}
void oprofile_add_trace(unsigned long pc)
{
if (! op_cpu_buffer)
return;
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
if (!cpu_buf->tracing) {
//print_func_exit();
return;
}
/*
* broken frame can give an eip with the same value as an
* escape code, abort the trace if we get it
*/
if (pc == ESCAPE_CODE)
goto fail;
if (op_add_sample(cpu_buf, pc, nsec()&~0xf))
goto fail;
//print_func_exit();
return;
fail:
printk("%s: fail. Turning of tracing on cpu %d\n", core_id());
cpu_buf->tracing = 0;
cpu_buf->backtrace_aborted++;
//print_func_exit();
return;
}
/* Format for samples:
* first word:
* high 8 bits are 0xee, which is an invalid address on amd64.
* next 8 bits are the protocol version
* next 16 bits are unused, MBZ. Later, we can make it a packet type.
* next 16 bits are the core id
* next 8 bits are unused
* next 8 bits are the # of PCs following. This should be at least 1, for one EIP.
*
* second word is the time in ns.
*
* Third and following words are PCs; there must be at least one of them.
*/
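/*
* A sketch of how a consumer could unpack the descriptor word above
* (variable names are ours, not part of this file):
*/
#if 0
uint32_t version = (descriptor >> 48) & 0xff;	/* 0x01 today */
uint32_t coreid = (descriptor >> 16) & 0xffff;
uint32_t nr_pcs = descriptor & 0xff;	/* at least 1, for one EIP */
#endif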
void oprofile_add_backtrace(uintptr_t pc, uintptr_t fp)
{
/* version 1. */
uint64_t descriptor = 0xee01ULL<<48;
if (! op_cpu_buffer)
return;
//print_func_entry();
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
if (!cpu_buf->tracing) {
//print_func_exit();
return;
}
struct op_entry entry;
struct op_sample *sample;
struct block *b;
uint64_t event = nsec();
uintptr_t bt_pcs[oprofile_backtrace_depth];
int nr_pcs;
nr_pcs = backtrace_list(pc, fp, bt_pcs, oprofile_backtrace_depth);
/* write_reserve always assumes passed-in-size + 2.
* backtrace_depth should always be > 0.
*/
b = op_cpu_buffer_write_reserve(cpu_buf, &entry, nr_pcs);
if (! b)
return;
/* we are changing the sample format, but not the struct
* member names yet. Later, assuming this works out.
*/
descriptor |= (core_id() << 16) | nr_pcs;
sample = entry.sample;
sample->eip = descriptor;
sample->event = event;
memcpy(sample->data, bt_pcs, sizeof(uintptr_t) * nr_pcs);
//print_func_exit();
return;
fail:
printk("%s: fail. Turning of tracing on cpu %d\n", core_id());
cpu_buf->tracing = 0;
cpu_buf->backtrace_aborted++;
//print_func_exit();
return;
}
void oprofile_add_userpc(uintptr_t pc)
{
struct oprofile_cpu_buffer *cpu_buf;
uint32_t pcoreid = core_id();
struct op_entry entry;
struct block *b;
uint64_t descriptor = (0xee01ULL << 48) | (pcoreid << 16) | 1;
if (!op_cpu_buffer)
return;
cpu_buf = &op_cpu_buffer[pcoreid];
if (!cpu_buf->tracing)
return;
/* write_reserve always assumes passed-in-size + 2. need room for 1 PC. */
b = op_cpu_buffer_write_reserve(cpu_buf, &entry, 1);
if (!b)
return;
entry.sample->eip = descriptor;
entry.sample->event = nsec();
/* entry.sample->data == entry.data */
assert(entry.sample->data == entry.data);
*entry.sample->data = pc;
}
int
oproflen(void)
{
return qlen(opq);
}
/* return # bytes read, or 0 if profiling is off, or block if profiling on and no data.
*/
int
oprofread(void *va, int n)
{
int len = qlen(opq);
struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
if (len == 0) {
if (cpu_buf->tracing == 0)
return 0;
}
len = qread(opq, va, n);
return len;
}