/*
* This file is part of the UCB release of Plan 9. It is subject to the license
* terms in the LICENSE file found in the top-level directory of this
* distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
* part of the UCB release of Plan 9, including this file, may be copied,
* modified, propagated, or distributed except according to the terms contained
* in the LICENSE file.
*/
#include <ros/profiler_records.h>
#include <arch/time.h>
#include <slab.h>
#include <kmalloc.h>
#include <kref.h>
#include <atomic.h>
#include <kthread.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <smp.h>
#include <time.h>
#include <circular_buffer.h>
#include <umem.h>
#include <profiler.h>
#include <kprof.h>
#include <ros/procinfo.h>
#include <init.h>
#define KTRACE_BUFFER_SIZE (128 * 1024)
#define TRACE_PRINTK_BUFFER_SIZE (8 * 1024)
enum {
Kprofdirqid = 0,
Kprofdataqid,
Kprofctlqid,
Kptracectlqid,
Kptraceqid,
Kprintxqid,
Kmpstatqid,
Kmpstatrawqid,
};
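/* Scratch buffer for trace_printk(): one per core once booted (see
* kprof_get_printk_buffer()), used by trace_vprintk() to format a message
* before it is appended to the kernel trace buffer. */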
struct trace_printk_buffer {
atomic_t in_use;
char buffer[TRACE_PRINTK_BUFFER_SIZE];
};
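/* State for the kprof device: whether the single global profiler is open and
* running, and whether mpstat reads poke all cores with an IPI before
* sampling their state ticks. */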
struct kprof {
qlock_t lock;
bool mpstat_ipi;
bool profiling;
bool opened;
};
struct dev kprofdevtab;
struct dirtab kproftab[] = {
{".", {Kprofdirqid, 0, QTDIR}, 0, DMDIR|0550},
{"kpdata", {Kprofdataqid}, 0, 0600},
{"kpctl", {Kprofctlqid}, 0, 0600},
{"kptrace_ctl", {Kptracectlqid}, 0, 0660},
{"kptrace", {Kptraceqid}, 0, 0600},
{"kprintx", {Kprintxqid}, 0, 0600},
{"mpstat", {Kmpstatqid}, 0, 0600},
{"mpstat-raw", {Kmpstatrawqid}, 0, 0600},
};
static struct kprof kprof;
static bool ktrace_init_done = FALSE;
static spinlock_t ktrace_lock = SPINLOCK_INITIALIZER_IRQSAVE;
static struct circular_buffer ktrace_data;
static char ktrace_buffer[KTRACE_BUFFER_SIZE];
static char kprof_control_usage[128];
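/* Worst-case size of the human-readable mpstat output: a header row plus one
* row per core, including the terminating NUL. */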
static size_t mpstat_len(void)
{
size_t each_row = 7 + NR_CPU_STATES * 26;
return each_row * (num_cores + 1) + 1;
}
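/* Worst-case size of the raw mpstat output: a header line plus one
* fixed-width line per core, including the terminating NUL. */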
static size_t mpstatraw_len(void)
{
size_t header_row = 27 + NR_CPU_STATES * 7 + 1;
size_t cpu_row = 7 + NR_CPU_STATES * 17;
return header_row + cpu_row * num_cores + 1;
}
static char *devname(void)
{
return kprofdevtab.name;
}
static struct chan *kprof_attach(char *spec)
{
return devattach(devname(), spec);
}
/* Start collecting samples from perf events into the profiler.
*
* This command only runs if the user successfully opened kpctl, which gives
* them a profiler (the global profiler, for now). */
static void kprof_start_profiler(void)
{
ERRSTACK(1);
qlock(&kprof.lock);
if (waserror()) {
qunlock(&kprof.lock);
nexterror();
}
if (!kprof.profiling) {
profiler_start();
kprof.profiling = TRUE;
}
poperror();
qunlock(&kprof.lock);
}
/* Stops collecting samples from perf events.
*
* This command only runs if the user successfully opened kpctl, which gives
* them a profiler (the global profiler, for now). */
static void kprof_stop_profiler(void)
{
ERRSTACK(1);
qlock(&kprof.lock);
if (waserror()) {
qunlock(&kprof.lock);
nexterror();
}
if (kprof.profiling) {
profiler_stop();
kprof.profiling = FALSE;
}
poperror();
qunlock(&kprof.lock);
}
/* Makes each core flush its results into the profiler queue. You can do this
* while the profiler is still running. However, this does not hang up the
* queue, so reads on kpdata will block. */
static void kprof_flush_profiler(void)
{
ERRSTACK(1);
qlock(&kprof.lock);
if (waserror()) {
qunlock(&kprof.lock);
nexterror();
}
if (kprof.profiling)
profiler_trace_data_flush();
poperror();
qunlock(&kprof.lock);
}
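/* Device init hook: sets up the profiler, resets kprof's state, and fills in
* the static file lengths and the kpctl usage string. */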
static void kprof_init(void)
{
profiler_init();
qlock_init(&kprof.lock);
kprof.profiling = FALSE;
kprof.opened = FALSE;
for (int i = 0; i < ARRAY_SIZE(kproftab); i++)
kproftab[i].length = 0;
kprof.mpstat_ipi = TRUE;
kproftab[Kmpstatqid].length = mpstat_len();
kproftab[Kmpstatrawqid].length = mpstatraw_len();
strlcpy(kprof_control_usage, "start|stop|flush",
sizeof(kprof_control_usage));
profiler_append_configure_usage(kprof_control_usage,
sizeof(kprof_control_usage));
}
static void kprof_shutdown(void)
{
}
static struct walkqid *kprof_walk(struct chan *c, struct chan *nc, char **name,
unsigned int nname)
{
return devwalk(c, nc, name, nname, kproftab, ARRAY_SIZE(kproftab),
devgen);
}
static size_t kprof_profdata_size(void)
{
return profiler_size();
}
static long kprof_profdata_read(void *dest, long size, int64_t off)
{
return profiler_read(dest, size);
}
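/* Refresh the advertised lengths of kpdata and kptrace before answering the
* stat, since both change as samples and trace lines accumulate. */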
static size_t kprof_stat(struct chan *c, uint8_t *db, size_t n)
{
kproftab[Kprofdataqid].length = kprof_profdata_size();
kproftab[Kptraceqid].length = kprof_tracedata_size();
return devstat(c, db, n, kproftab, ARRAY_SIZE(kproftab), devgen);
}
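/* Open handler: the directory may only be opened for reading; opening kpctl
* claims the single global profiler. */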
static struct chan *kprof_open(struct chan *c, int omode)
{
if (c->qid.type & QTDIR) {
if (openmode(omode) != O_READ)
error(EPERM, ERROR_FIXME);
}
switch ((int) c->qid.path) {
case Kprofctlqid:
/* We have one global profiler. Only one FD may be opened at a
* time for it. If we ever have separate profilers, we can
* create the profiler here, and every open would get a separate
* instance. */
qlock(&kprof.lock);
if (kprof.opened) {
qunlock(&kprof.lock);
error(EBUSY, "Global profiler is already open");
}
kprof.opened = TRUE;
/* TODO: have a creation function for a non-global profiler */
profiler_setup();
qunlock(&kprof.lock);
break;
}
c->mode = openmode(omode);
c->flag |= COPEN;
c->offset = 0;
return c;
}
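/* Close handler: releasing kpctl stops the profiler and tears it down. */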
static void kprof_close(struct chan *c)
{
if (c->flag & COPEN) {
switch ((int) c->qid.path) {
case Kprofctlqid:
kprof_stop_profiler();
qlock(&kprof.lock);
profiler_cleanup();
kprof.opened = FALSE;
qunlock(&kprof.lock);
break;
}
}
}
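/* Render per-core time-in-state accounting in human-readable form: one column
* per CPU state, printed as seconds.microseconds plus a percentage of that
* core's total. */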
static long mpstat_read(void *va, long n, int64_t off)
{
size_t bufsz = mpstat_len();
char *buf = kmalloc(bufsz, MEM_WAIT);
int len = 0;
struct per_cpu_info *pcpui;
uint64_t cpu_total;
struct timespec ts;
/* The IPI nudges every core so its state-tick accounting is up to date, but
* it also interferes with them; 'ipi off' written to this file disables it. */
if (kprof.mpstat_ipi)
send_broadcast_ipi(I_POKE_CORE);
len += snprintf(buf + len, bufsz - len, " CPU: ");
for (int j = 0; j < NR_CPU_STATES; j++)
len += snprintf(buf + len, bufsz - len, "%23s%s",
cpu_state_names[j],
j != NR_CPU_STATES - 1 ? " " : " \n");
for (int i = 0; i < num_cores; i++) {
pcpui = &per_cpu_info[i];
cpu_total = 0;
len += snprintf(buf + len, bufsz - len, "%5d: ", i);
for (int j = 0; j < NR_CPU_STATES; j++)
cpu_total += pcpui->state_ticks[j];
cpu_total = MAX(cpu_total, 1); /* for the divide later */
for (int j = 0; j < NR_CPU_STATES; j++) {
ts = tsc2timespec(pcpui->state_ticks[j]);
len += snprintf(buf + len, bufsz - len,
"%10d.%06d (%3d%%)%s",
ts.tv_sec, ts.tv_nsec / 1000,
MIN((pcpui->state_ticks[j] * 100) /
cpu_total, 100),
j != NR_CPU_STATES - 1 ? ", " : " \n");
}
}
n = readstr(off, va, n, buf);
kfree(buf);
return n;
}
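/* Render the same per-core state ticks in machine-readable form: a header
* with the format version, core count, TSC frequency, and state names,
* followed by one fixed-width line of hex tick counts per core. */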
static long mpstatraw_read(void *va, long n, int64_t off)
{
size_t bufsz = mpstatraw_len();
char *buf = kmalloc(bufsz, MEM_WAIT);
int len = 0;
struct per_cpu_info *pcpui;
/* could spit it all out in binary, though then it'd be harder to
* process the data across a mnt (if we export #K). probably not a big
* deal. */
/* header line: version, num_cores, tsc freq, state names */
len += snprintf(buf + len, bufsz - len, "v%03d %5d %16llu", 1,
num_cores, __proc_global_info.tsc_freq);
for (int j = 0; j < NR_CPU_STATES; j++)
len += snprintf(buf + len, bufsz - len, " %6s",
cpu_state_names[j]);
len += snprintf(buf + len, bufsz - len, "\n");
for (int i = 0; i < num_cores; i++) {
pcpui = &per_cpu_info[i];
len += snprintf(buf + len, bufsz - len, "%5d: ", i);
for (int j = 0; j < NR_CPU_STATES; j++) {
len += snprintf(buf + len, bufsz - len, "%16llx%s",
pcpui->state_ticks[j],
j != NR_CPU_STATES - 1 ? " " : "\n");
}
}
n = readstr(off, va, n, buf);
kfree(buf);
return n;
}
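/* Read handler: dispatches on the qid to the directory listing, profiler
* data, kernel trace, printx state, or the mpstat outputs. */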
static size_t kprof_read(struct chan *c, void *va, size_t n, off64_t off)
{
uintptr_t offset = off;
switch ((int) c->qid.path) {
case Kprofdirqid:
return devdirread(c, va, n, kproftab, ARRAY_SIZE(kproftab),
devgen);
case Kprofdataqid:
n = kprof_profdata_read(va, n, off);
break;
case Kptraceqid:
n = kprof_tracedata_read(va, n, off);
break;
case Kprintxqid:
n = readstr(offset, va, n, printx_on ? "on" : "off");
break;
case Kmpstatqid:
n = mpstat_read(va, n, offset);
break;
case Kmpstatrawqid:
n = mpstatraw_read(va, n, offset);
break;
default:
n = 0;
break;
}
return n;
}
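/* Write handler: parses the written string as a command for the given qid
* (profiler control, trace control and injection, printx, or mpstat
* control). */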
static size_t kprof_write(struct chan *c, void *a, size_t n, off64_t unused)
{
ERRSTACK(1);
struct cmdbuf *cb = parsecmd(a, n);
if (waserror()) {
kfree(cb);
nexterror();
}
switch ((int) c->qid.path) {
case Kprofctlqid:
if (cb->nf < 1)
error(EFAIL, kprof_control_usage);
if (profiler_configure(cb))
break;
if (!strcmp(cb->f[0], "start")) {
kprof_start_profiler();
} else if (!strcmp(cb->f[0], "flush")) {
kprof_flush_profiler();
} else if (!strcmp(cb->f[0], "stop")) {
kprof_stop_profiler();
} else {
error(EFAIL, kprof_control_usage);
}
break;
case Kptracectlqid:
if (cb->nf < 1)
error(EFAIL, "Bad kptrace_ctl option (clear)");
if (!strcmp(cb->f[0], "clear")) {
spin_lock_irqsave(&ktrace_lock);
circular_buffer_clear(&ktrace_data);
spin_unlock_irqsave(&ktrace_lock);
}
break;
case Kptraceqid:
if (a && (n > 0)) {
char *uptr = user_strdup_errno(current, a, n);
if (uptr) {
trace_printk("%s", uptr);
user_memdup_free(current, uptr);
} else {
n = -1;
}
}
break;
case Kprintxqid:
if (!strncmp(a, "on", 2))
set_printx(1);
else if (!strncmp(a, "off", 3))
set_printx(0);
else if (!strncmp(a, "toggle", 6))
set_printx(2);
else
error(EFAIL, "Invalid option to Kprintx %s\n", a);
break;
case Kmpstatqid:
case Kmpstatrawqid:
if (cb->nf < 1)
error(EFAIL, "Bad mpstat option (reset|ipi|on|off)");
if (!strcmp(cb->f[0], "reset")) {
for (int i = 0; i < num_cores; i++)
reset_cpu_state_ticks(i);
} else if (!strcmp(cb->f[0], "on")) {
/* TODO: enable the ticks */ ;
} else if (!strcmp(cb->f[0], "off")) {
/* TODO: disable the ticks */ ;
} else if (!strcmp(cb->f[0], "ipi")) {
if (cb->nf < 2)
error(EFAIL, "Need another arg: ipi [on|off]");
if (!strcmp(cb->f[1], "on"))
kprof.mpstat_ipi = TRUE;
else if (!strcmp(cb->f[1], "off"))
kprof.mpstat_ipi = FALSE;
else
error(EFAIL, "ipi [on|off]");
} else {
error(EFAIL, "Bad mpstat option (reset|ipi|on|off)");
}
break;
default:
error(EBADFD, ERROR_FIXME);
}
kfree(cb);
poperror();
return n;
}
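/* Number of bytes currently held in the kernel trace buffer; used by
* kprof_stat() and kprof_dump_data(). */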
size_t kprof_tracedata_size(void)
{
return circular_buffer_size(&ktrace_data);
}
size_t kprof_tracedata_read(void *data, size_t size, size_t offset)
{
spin_lock_irqsave(&ktrace_lock);
if (likely(ktrace_init_done))
size = circular_buffer_read(&ktrace_data, data, size, offset);
else
size = 0;
spin_unlock_irqsave(&ktrace_lock);
return size;
}
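/* Dump the entire kernel trace buffer to the console via printk(). */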
void kprof_dump_data(void)
{
void *buf;
size_t len = kprof_tracedata_size();
buf = kmalloc(len + 1, MEM_WAIT);
len = kprof_tracedata_read(buf, len, 0);
/* The trace buffer holds raw text; NUL terminate before printing it. */
((char *) buf)[len] = 0;
printk("%s", buf);
kfree(buf);
}
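/* Append formatted trace text to the kernel trace buffer, initializing the
* buffer lazily since trace_printk() can run before this device is set up. */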
void kprof_tracedata_write(const char *pretty_buf, size_t len)
{
spin_lock_irqsave(&ktrace_lock);
if (unlikely(!ktrace_init_done)) {
circular_buffer_init(&ktrace_data, sizeof(ktrace_buffer),
ktrace_buffer);
ktrace_init_done = TRUE;
}
circular_buffer_write(&ktrace_data, pretty_buf, len);
spin_unlock_irqsave(&ktrace_lock);
}
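/* Return this core's trace_printk scratch buffer, falling back to a single
* boot-time buffer while booting or until the per-core array is allocated. */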
static struct trace_printk_buffer *kprof_get_printk_buffer(void)
{
static struct trace_printk_buffer boot_tpb;
static struct trace_printk_buffer *cpu_tpbs;
static atomic_t alloc_done;
if (unlikely(booting))
return &boot_tpb;
if (unlikely(!cpu_tpbs)) {
/* Poor man's per-CPU data structure. I really do not like
* littering global data structures with module-specific data.
* We cannot take the ktrace_lock to protect the kzmalloc()
* call, as that might trigger printk()s, and we would reenter
* here. Let only one core into the kzmalloc() path, and let
* the others get the boot_tpb until finished. */
if (!atomic_cas(&alloc_done, 0, 1))
return &boot_tpb;
cpu_tpbs = kzmalloc(num_cores *
sizeof(struct trace_printk_buffer), 0);
}
return cpu_tpbs + core_id_early();
}
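/* Format the message into the lower part of this core's scratch buffer, then
* rewrite it line by line into the upper part with a "[sec.nsec]:cpuN: "
* header prepended, and append the result to the kernel trace buffer. */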
void trace_vprintk(const char *fmt, va_list args)
{
struct print_buf {
char *ptr;
char *top;
};
void emit_print_buf_str(struct print_buf *pb, const char *str,
ssize_t size)
{
if (size < 0) {
for (; *str && (pb->ptr < pb->top); str++)
*(pb->ptr++) = *str;
} else {
for (; (size > 0) && (pb->ptr < pb->top); str++, size--)
*(pb->ptr++) = *str;
}
}
static const size_t bufsz = TRACE_PRINTK_BUFFER_SIZE;
static const size_t usr_bufsz = (3 * bufsz) / 8;
static const size_t kp_bufsz = bufsz - usr_bufsz;
struct trace_printk_buffer *tpb = kprof_get_printk_buffer();
struct timespec ts_now = { 0, 0 };
struct print_buf pb;
char *usrbuf = tpb->buffer, *kpbuf = tpb->buffer + usr_bufsz;
const char *utop, *uptr;
char hdr[64];
if (!atomic_cas(&tpb->in_use, 0, 1))
return;
if (likely(__proc_global_info.tsc_freq))
ts_now = tsc2timespec(read_tsc());
snprintf(hdr, sizeof(hdr), "[%lu.%09lu]:cpu%d: ", ts_now.tv_sec,
ts_now.tv_nsec, core_id_early());
pb.ptr = usrbuf + vsnprintf(usrbuf, usr_bufsz, fmt, args);
pb.top = usrbuf + usr_bufsz;
if (pb.ptr == usrbuf || pb.ptr[-1] != '\n')
emit_print_buf_str(&pb, "\n", 1);
/* snprintf null terminates the buffer, and does not count that as part
* of the len. If we maxed out the buffer, let's make sure it has a \n.
*/
if (pb.ptr == pb.top)
pb.ptr[-1] = '\n';
utop = pb.ptr;
pb.ptr = kpbuf;
pb.top = kpbuf + kp_bufsz;
for (uptr = usrbuf; uptr < utop;) {
const char *nlptr = memchr(uptr, '\n', utop - uptr);
if (nlptr == NULL)
nlptr = utop;
emit_print_buf_str(&pb, hdr, -1);
emit_print_buf_str(&pb, uptr, (nlptr - uptr) + 1);
uptr = nlptr + 1;
}
kprof_tracedata_write(kpbuf, pb.ptr - kpbuf);
atomic_set(&tpb->in_use, 0);
}
void trace_printk(const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
trace_vprintk(fmt, args);
va_end(args);
}
struct dev kprofdevtab __devtab = {
.name = "kprof",
.reset = devreset,
.init = kprof_init,
.shutdown = kprof_shutdown,
.attach = kprof_attach,
.walk = kprof_walk,
.stat = kprof_stat,
.open = kprof_open,
.create = devcreate,
.close = kprof_close,
.read = kprof_read,
.bread = devbread,
.write = kprof_write,
.bwrite = devbwrite,
.remove = devremove,
.wstat = devwstat,
};