/*
* This file is part of the UCB release of Plan 9. It is subject to the license
* terms in the LICENSE file found in the top-level directory of this
* distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
* part of the UCB release of Plan 9, including this file, may be copied,
* modified, propagated, or distributed except according to the terms contained
* in the LICENSE file.
*/
// get_fn_name is slowing down the kprocread
// have an array of translated fns
// or a "next" iterator, since we're walking in order
//
// irqsave locks
//
// kprof struct should be a ptr, have them per core
// we'll probably need to track the length still, so userspace knows how
// big it is
//
// will also want more files in the kprof dir for each cpu or something
//
// maybe don't use slot 0 and 1 as total and 'not kernel' ticks
//
// fix the failed assert XXX
#include <vfs.h>
#include <kfs.h>
#include <slab.h>
#include <kmalloc.h>
#include <kref.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <cpio.h>
#include <pmap.h>
#include <smp.h>
#include <ip.h>
#include <oprofile.h>
#define LRES 3 /* log of PC resolution */
#define CELLSIZE 8 /* size of a count cell */
struct kprof
{
uintptr_t minpc;
uintptr_t maxpc;
int nbuf;
int time;
uint64_t *buf; /* keep in sync with cellsize */
size_t buf_sz;
spinlock_t lock;
struct queue *systrace;
bool mpstat_ipi;
};
struct kprof kprof;
/* Output format.  Fixed size, which makes the file seekable.
 * One subtle bit: offsets must be converted FROM FORMATSIZE units TO CELLSIZE
 * units in a few places.
 */
char *outformat = "%016llx %29.29s %016llx\n";
#define FORMATSIZE 64
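/* For example, a read at byte offset 640 into kpdata refers to entry
 * 640 / FORMATSIZE = 10, i.e. kprof.buf[10], which covers PCs starting at
 * kprof.minpc + (10 << LRES). */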
enum{
Kprofdirqid = 0,
Kprofdataqid,
Kprofctlqid,
Kprofoprofileqid,
Kptraceqid,
Kprintxqid,
Kmpstatqid,
Kmpstatrawqid,
};
struct dirtab kproftab[]={
{".", {Kprofdirqid, 0, QTDIR},0, DMDIR|0550},
{"kpdata", {Kprofdataqid}, 0, 0600},
{"kpctl", {Kprofctlqid}, 0, 0600},
{"kpoprofile", {Kprofoprofileqid}, 0, 0600},
{"kptrace", {Kptraceqid}, 0, 0600},
{"kprintx", {Kprintxqid}, 0, 0600},
{"mpstat", {Kmpstatqid}, 0, 0600},
{"mpstat-raw", {Kmpstatrawqid}, 0, 0600},
};
static size_t mpstatraw_len(void);
static size_t mpstat_len(void);
static struct alarm_waiter *oprof_alarms;
static unsigned int oprof_timer_period = 1000;
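/* oprof_timer_period is the interval, in microseconds, between oprofile timer
 * samples on a core; adjustable at runtime via "optimer period USEC" on kpctl
 * (see kprofwrite). */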
static void oprof_alarm_handler(struct alarm_waiter *waiter,
struct hw_trapframe *hw_tf)
{
int coreid = core_id();
struct timer_chain *tchain = &per_cpu_info[coreid].tchain;
if (in_kernel(hw_tf))
oprofile_add_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf));
else
oprofile_add_userpc(get_hwtf_pc(hw_tf));
reset_alarm_rel(tchain, waiter, oprof_timer_period);
}
static struct chan*
kprofattach(char *spec)
{
// Did we initialise completely?
	if (!(oprof_alarms && kprof.buf && kprof.systrace))
error(Enomem);
return devattach('K', spec);
}
static void
kproftimer(uintptr_t pc)
{
if(kprof.time == 0)
return;
/*
* if the pc corresponds to the idle loop, don't consider it.
if(m->inidle)
return;
*/
/*
* if the pc is coming out of spllo or splx,
* use the pc saved when we went splhi.
if(pc>=PTR2UINT(spllo) && pc<=PTR2UINT(spldone))
pc = m->splpc;
*/
// ilock(&kprof);
	/* This is a bit crude: we assume all the time since the last measurement
	 * went into this PC.  It's the best we can do, and we're sampling at 1 ms
	 * for now.  Better ideas welcome.
	 */
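	/* Each cell covers 2^LRES (8) bytes of kernel text: the index is
	 * (pc - minpc) >> LRES.  Slot 0 doubles as the total tick count and
	 * slot 1 as the 'not in kernel text' count (see the TODO up top). */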
kprof.buf[0] += 1; //Total count of ticks.
if(kprof.minpc<=pc && pc<kprof.maxpc){
pc -= kprof.minpc;
pc >>= LRES;
kprof.buf[pc] += 1;
}else
		kprof.buf[1] += 1; /* PC outside [minpc, maxpc): reported as "User" */
// iunlock(&kprof);
}
static void setup_timers(void)
{
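	/* kprof_alarm is a nested function (gcc extension) serving as the IRQ
	 * alarm handler: it samples the interrupted PC and re-arms itself 1000
	 * usec (1 ms) later on the same core's timer chain. */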
void kprof_alarm(struct alarm_waiter *waiter, struct hw_trapframe *hw_tf)
{
struct timer_chain *tchain = &per_cpu_info[core_id()].tchain;
kproftimer(get_hwtf_pc(hw_tf));
set_awaiter_rel(waiter, 1000);
set_alarm(tchain, waiter);
}
struct timer_chain *tchain = &per_cpu_info[core_id()].tchain;
struct alarm_waiter *waiter = kmalloc(sizeof(struct alarm_waiter), 0);
init_awaiter_irq(waiter, kprof_alarm);
set_awaiter_rel(waiter, 1000);
set_alarm(tchain, waiter);
}
static void kprofinit(void)
{
uint32_t n;
	static_assert(CELLSIZE == sizeof kprof.buf[0]); /* keep CELLSIZE in sync with buf's cell type */
/* allocate when first used */
kprof.minpc = KERN_LOAD_ADDR;
kprof.maxpc = (uintptr_t) &etext;
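	/* The profile buffer holds one CELLSIZE counter per 2^LRES bytes of
	 * kernel text between KERN_LOAD_ADDR and etext; slots 0 and 1 are
	 * overloaded as the 'total' and 'not kernel' counts (see kproftimer). */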
kprof.nbuf = (kprof.maxpc-kprof.minpc) >> LRES;
n = kprof.nbuf*CELLSIZE;
kprof.buf = kzmalloc(n, KMALLOC_WAIT);
if (kprof.buf)
kprof.buf_sz = n;
/* no, i'm not sure how we should do this yet. */
int alloc_cpu_buffers(void);
alloc_cpu_buffers();
oprof_alarms = kzmalloc(sizeof(struct alarm_waiter) * num_cpus,
KMALLOC_WAIT);
if (!oprof_alarms)
error(Enomem);
for (int i = 0; i < num_cpus; i++)
init_awaiter_irq(&oprof_alarms[i], oprof_alarm_handler);
kprof.systrace = qopen(2 << 20, 0, 0, 0);
if (!kprof.systrace) {
printk("systrace allocate failed. No system call tracing\n");
}
kprof.mpstat_ipi = TRUE;
kproftab[Kprofdataqid].length = kprof.nbuf * FORMATSIZE;
kproftab[Kmpstatqid].length = mpstat_len();
kproftab[Kmpstatrawqid].length = mpstatraw_len();
}
static void kprofshutdown(void)
{
kfree(oprof_alarms); oprof_alarms = NULL;
kfree(kprof.buf); kprof.buf = NULL;
qfree(kprof.systrace); kprof.systrace = NULL;
void free_cpu_buffers(void);
free_cpu_buffers();
}
static struct walkqid*
kprofwalk(struct chan *c, struct chan *nc, char **name, int nname)
{
return devwalk(c, nc, name, nname, kproftab, ARRAY_SIZE(kproftab), devgen);
}
static int
kprofstat(struct chan *c, uint8_t *db, int n)
{
kproftab[Kprofoprofileqid].length = oproflen();
if (kprof.systrace)
kproftab[Kptraceqid].length = qlen(kprof.systrace);
else
kproftab[Kptraceqid].length = 0;
return devstat(c, db, n, kproftab, ARRAY_SIZE(kproftab), devgen);
}
static struct chan*
kprofopen(struct chan *c, int omode)
{
if(c->qid.type & QTDIR){
if(openmode(omode) != OREAD)
error(Eperm);
}
c->mode = openmode(omode);
c->flag |= COPEN;
c->offset = 0;
return c;
}
static void
kprofclose(struct chan *unused)
{
}
static size_t mpstat_len(void)
{
	/* Sized to match the snprintf calls in mpstat_read(): every row (the
	 * header and one per core) is a 7-char label plus 26 chars per CPU state,
	 * with one extra byte for the trailing NUL. */
	size_t each_row = 7 + NR_CPU_STATES * 26;
	return each_row * (num_cpus + 1) + 1;
}
static long mpstat_read(void *va, long n, int64_t off)
{
size_t bufsz = mpstat_len();
char *buf = kmalloc(bufsz, KMALLOC_WAIT);
int len = 0;
struct per_cpu_info *pcpui;
uint64_t cpu_total;
struct timespec ts;
/* the IPI interferes with other cores, might want to disable that. */
if (kprof.mpstat_ipi)
send_broadcast_ipi(I_POKE_CORE);
len += snprintf(buf + len, bufsz - len, " CPU: ");
for (int j = 0; j < NR_CPU_STATES; j++)
len += snprintf(buf + len, bufsz - len, "%23s%s", cpu_state_names[j],
j != NR_CPU_STATES - 1 ? " " : " \n");
for (int i = 0; i < num_cpus; i++) {
pcpui = &per_cpu_info[i];
cpu_total = 0;
len += snprintf(buf + len, bufsz - len, "%5d: ", i);
for (int j = 0; j < NR_CPU_STATES; j++)
cpu_total += pcpui->state_ticks[j];
cpu_total = MAX(cpu_total, 1); /* for the divide later */
for (int j = 0; j < NR_CPU_STATES; j++) {
tsc2timespec(pcpui->state_ticks[j], &ts);
len += snprintf(buf + len, bufsz - len, "%10d.%06d (%3d%%)%s",
ts.tv_sec, ts.tv_nsec / 1000,
MIN((pcpui->state_ticks[j] * 100) / cpu_total, 100),
j != NR_CPU_STATES - 1 ? ", " : " \n");
}
}
n = readstr(off, va, n, buf);
kfree(buf);
return n;
}
static size_t mpstatraw_len(void)
{
	/* Sized to match mpstatraw_read(): the header line is 27 chars (version,
	 * num_cpus, tsc freq) plus 7 chars per state name and a newline; each CPU
	 * row is a 7-char label plus 17 chars per state (16 hex digits and a
	 * separator).  One extra byte for the trailing NUL. */
	size_t header_row = 27 + NR_CPU_STATES * 7 + 1;
	size_t cpu_row = 7 + NR_CPU_STATES * 17;
	return header_row + cpu_row * num_cpus + 1;
}
static long mpstatraw_read(void *va, long n, int64_t off)
{
size_t bufsz = mpstatraw_len();
char *buf = kmalloc(bufsz, KMALLOC_WAIT);
int len = 0;
struct per_cpu_info *pcpui;
/* could spit it all out in binary, though then it'd be harder to process
* the data across a mnt (if we export #K). probably not a big deal. */
/* header line: version, num_cpus, tsc freq, state names */
len += snprintf(buf + len, bufsz - len, "v%03d %5d %16llu", 1, num_cpus,
system_timing.tsc_freq);
for (int j = 0; j < NR_CPU_STATES; j++)
len += snprintf(buf + len, bufsz - len, " %6s", cpu_state_names[j]);
len += snprintf(buf + len, bufsz - len, "\n");
for (int i = 0; i < num_cpus; i++) {
pcpui = &per_cpu_info[i];
len += snprintf(buf + len, bufsz - len, "%5d: ", i);
for (int j = 0; j < NR_CPU_STATES; j++) {
len += snprintf(buf + len, bufsz - len, "%16llx%s",
pcpui->state_ticks[j],
j != NR_CPU_STATES - 1 ? " " : "\n");
}
}
n = readstr(off, va, n, buf);
kfree(buf);
return n;
}
static long
kprofread(struct chan *c, void *va, long n, int64_t off)
{
uint64_t w, *bp;
char *a, *ea;
uintptr_t offset = off;
uint64_t pc;
int snp_ret, ret = 0;
switch((int)c->qid.path){
case Kprofdirqid:
return devdirread(c, va, n, kproftab, ARRAY_SIZE(kproftab), devgen);
case Kprofdataqid:
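		/* Each bucket is emitted as one fixed-width FORMATSIZE-byte text line,
		 * "pc name count" (see outformat), so a byte offset into the file maps
		 * directly to a bucket index. */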
if (n < FORMATSIZE){
n = 0;
break;
}
a = va;
ea = a + n;
/* we check offset later before deref bp. offset / FORMATSIZE is how
* many entries we're skipping/offsetting. */
bp = kprof.buf + offset/FORMATSIZE;
pc = kprof.minpc + ((offset/FORMATSIZE)<<LRES);
while((a < ea) && (n >= FORMATSIZE)){
/* what a pain. We need to manage the
* fact that the *prints all make room for
* \0
*/
char print[FORMATSIZE+1];
char *name;
int amt_read;
if (pc >= kprof.maxpc)
break;
/* pc is also our exit for bp. should be in lockstep */
// XXX this assert fails, fix it!
//assert(bp < kprof.buf + kprof.nbuf);
/* do not attempt to filter these results based on w < threshold.
* earlier, we computed bp/pc based on assuming a full-sized file,
* and skipping entries will result in read() calls thinking they
* received earlier entries when they really received later ones.
* imagine a case where there are 1000 skipped items, and read()
* asks for chunks of 32. it'll get chunks of the next 32 valid
* items, over and over (1000/32 times). */
w = *bp++;
if (pc == kprof.minpc)
name = "Total";
else if (pc == kprof.minpc + 8)
name = "User";
else
name = get_fn_name(pc);
snp_ret = snprintf(print, sizeof(print), outformat, pc, name, w);
assert(snp_ret == FORMATSIZE);
if ((pc != kprof.minpc) && (pc != kprof.minpc + 8))
kfree(name);
amt_read = readmem(offset % FORMATSIZE, a, n, print, FORMATSIZE);
offset = 0; /* future loops have no offset */
a += amt_read;
n -= amt_read;
ret += amt_read;
pc += (1 << LRES);
}
n = ret;
break;
case Kprofoprofileqid:
n = oprofread(va,n);
break;
case Kptraceqid:
if (kprof.systrace) {
printd("Kptraceqid: kprof.systrace %p len %p\n", kprof.systrace, qlen(kprof.systrace));
if (qlen(kprof.systrace) > 0)
n = qread(kprof.systrace, va, n);
else
n = 0;
} else
error("no systrace queue");
break;
case Kprintxqid:
n = readstr(offset, va, n, printx_on ? "on" : "off");
break;
case Kmpstatqid:
n = mpstat_read(va, n, offset);
break;
case Kmpstatrawqid:
n = mpstatraw_read(va, n, offset);
break;
default:
n = 0;
break;
}
return n;
}
static void kprof_clear(struct kprof *kp)
{
spin_lock(&kp->lock);
memset(kp->buf, 0, kp->buf_sz);
spin_unlock(&kp->lock);
}
static void manage_oprof_timer(int coreid, struct cmdbuf *cb)
{
struct timer_chain *tchain = &per_cpu_info[coreid].tchain;
struct alarm_waiter *waiter = &oprof_alarms[coreid];
if (!strcmp(cb->f[2], "on")) {
/* pcpu waiters already inited. will set/reset each time (1 ms
* default). */
reset_alarm_rel(tchain, waiter, oprof_timer_period);
} else if (!strcmp(cb->f[2], "off")) {
/* since the alarm handler runs and gets reset within IRQ context, then
* we should never fail to cancel the alarm if it was already running
* (tchain locks synchronize us). but it might not be set at all, which
* is fine. */
unset_alarm(tchain, waiter);
} else {
error("optimer needs on|off");
}
}
static long
kprofwrite(struct chan *c, void *a, long n, int64_t unused)
{
ERRSTACK(1);
uintptr_t pc;
struct cmdbuf *cb;
char *ctlstring = "startclr|start|stop|clear|opstart|opstop|optimer";
cb = parsecmd(a, n);
if (waserror()) {
kfree(cb);
nexterror();
}
switch((int)(c->qid.path)){
case Kprofctlqid:
if (cb->nf < 1)
error(ctlstring);
/* Kprof: a "which kaddr are we at when the timer goes off". not used
* much anymore */
if (!strcmp(cb->f[0], "startclr")) {
kprof_clear(&kprof);
kprof.time = 1;
} else if (!strcmp(cb->f[0], "start")) {
kprof.time = 1;
/* this sets up the timer on the *calling* core! */
setup_timers();
} else if (!strcmp(cb->f[0], "stop")) {
/* TODO: stop the timers! */
kprof.time = 0;
} else if (!strcmp(cb->f[0], "clear")) {
kprof_clear(&kprof);
/* oprof: samples and traces using oprofile */
} else if (!strcmp(cb->f[0], "optimer")) {
if (cb->nf < 3)
error("optimer [<0|1|..|n|all> <on|off>] [period USEC]");
if (!strcmp(cb->f[1], "period")) {
oprof_timer_period = strtoul(cb->f[2], 0, 10);
} else if (!strcmp(cb->f[1], "all")) {
for (int i = 0; i < num_cpus; i++)
manage_oprof_timer(i, cb);
} else {
int pcoreid = strtoul(cb->f[1], 0, 10);
if (pcoreid >= num_cpus)
error("no such coreid %d", pcoreid);
manage_oprof_timer(pcoreid, cb);
}
} else if (!strcmp(cb->f[0], "opstart")) {
oprofile_control_trace(1);
} else if (!strcmp(cb->f[0], "opstop")) {
oprofile_control_trace(0);
} else {
error(ctlstring);
}
break;
	/* The input is a PC written as a long in text form.  We strtoul it and
	 * jam it into the trace buffer.
	 */
case Kprofoprofileqid:
pc = strtoul(a, 0, 0);
oprofile_add_trace(pc);
break;
case Kprintxqid:
if (!strncmp(a, "on", 2))
set_printx(1);
else if (!strncmp(a, "off", 3))
set_printx(0);
else if (!strncmp(a, "toggle", 6)) /* why not. */
set_printx(2);
else
error("invalid option to Kprintx %s\n", a);
break;
case Kmpstatqid:
case Kmpstatrawqid:
if (cb->nf < 1)
error("mpstat bad option (reset|ipi|on|off)");
if (!strcmp(cb->f[0], "reset")) {
for (int i = 0; i < num_cpus; i++)
reset_cpu_state_ticks(i);
} else if (!strcmp(cb->f[0], "on")) {
/* TODO: enable the ticks */ ;
} else if (!strcmp(cb->f[0], "off")) {
/* TODO: disable the ticks */ ;
} else if (!strcmp(cb->f[0], "ipi")) {
if (cb->nf < 2)
error("need another arg: ipi [on|off]");
if (!strcmp(cb->f[1], "on"))
kprof.mpstat_ipi = TRUE;
else if (!strcmp(cb->f[1], "off"))
kprof.mpstat_ipi = FALSE;
else
error("ipi [on|off]");
} else {
error("mpstat bad option (reset|ipi|on|off)");
}
break;
default:
error(Ebadusefd);
}
kfree(cb);
poperror();
return n;
}
void kprof_write_sysrecord(char *pretty_buf, size_t len)
{
int wrote;
if (kprof.systrace) {
/* TODO: need qio work so we can simply add the buf as extra data */
wrote = qiwrite(kprof.systrace, pretty_buf, len);
/* based on the current queue settings, we only drop when we're running
* out of memory. odds are, we won't make it this far. */
if (wrote != len)
printk("DROPPED %s", pretty_buf);
}
}
void trace_printk(const char *fmt, ...)
{
va_list ap;
struct timespec ts_now;
size_t bufsz = 160; /* 2x terminal width */
size_t len = 0;
char *buf = kmalloc(bufsz, 0);
if (!buf)
return;
tsc2timespec(read_tsc(), &ts_now);
len += snprintf(buf + len, bufsz - len, "[%7d.%09d] /* ", ts_now.tv_sec,
ts_now.tv_nsec);
	va_start(ap, fmt);
	len += vsnprintf(buf + len, bufsz - len, fmt, ap);
	va_end(ap);
	len += snprintf(buf + len, bufsz - len, " */\n");
/* snprintf null terminates the buffer, and does not count that as part of
* the len. if we maxed out the buffer, let's make sure it has a \n */
if (len == bufsz - 1) {
assert(buf[bufsz - 1] == '\0');
buf[bufsz - 2] = '\n';
}
kprof_write_sysrecord(buf, len);
kfree(buf);
}
struct dev kprofdevtab __devtab = {
	'K',
	"kprof",
	devreset,	/* reset */
	kprofinit,	/* init */
	kprofshutdown,	/* shutdown */
	kprofattach,	/* attach */
	kprofwalk,	/* walk */
	kprofstat,	/* stat */
	kprofopen,	/* open */
	devcreate,	/* create */
	kprofclose,	/* close */
	kprofread,	/* read */
	devbread,	/* bread */
	kprofwrite,	/* write */
	devbwrite,	/* bwrite */
	devremove,	/* remove */
	devwstat,	/* wstat */
};