/*
* This file is part of the UCB release of Plan 9. It is subject to the license
* terms in the LICENSE file found in the top-level directory of this
* distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
* part of the UCB release of Plan 9, including this file, may be copied,
* modified, propagated, or distributed except according to the terms contained
* in the LICENSE file.
*/
/*
* This file implements the #T device and was based upon the UCB Plan 9 kprof.c
*/
#include <assert.h>
#include <atomic.h>
#include <kmalloc.h>
#include <ns.h>
#include <smallidpool.h>
#include <smp.h>
#include <stdio.h>
#include <string.h>
#include <trace.h>
#include <ttrace.h>
#include <umem.h>
#include <ros/fs.h>
/*
* ttrace macros and constant data
*/
#ifndef min
#define min(a, b) ({ \
typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
#endif
#define TTRACE_CTL_LEN 46
#define TTRACE_MAX_TSID min(MAX_U16_POOL_SZ, (1 << Logtsid))
#define TTRACE_NUM_OPENERS 8
#define TT_SAFE_GENBUF_SZ (GENBUF_SZ-1) // Leave room for newline
/* TODO(gvdl): I don't get plan 9's permissions. Why do directories get group
 * rx permissions, and what's with the DMDIR? Some devices use it and others
 * don't. In theory the DMDIR is copied over by a higher layer, but I have no
 * idea why two copies seem necessary. */
#define TTPERMDIR (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|DMDIR)
#define TTPERMRWFILE (S_IRUSR|S_IWUSR)
#define TTPERMROFILE (S_IRUSR)
enum {
Ttdevdirqid = 0,
Ttdevbase,
Ttdevctlqid = Ttdevbase, // 1
Ttdevauxqid, // 2
Ttdevcpudataqid, // 3
Logtype = 4, // Enough for 16 unique qid types
Masktype = (1 << Logtype) - 1,
Shifttype = 0,
Logcpu = 12, // Up to 4096 CPUs can be time traced
Maskcpu = (1 << Logcpu) - 1,
Shiftcpu = Shifttype + Logtype,
/* ttrace timestamp id, used by data file readers */
Logtsid = 12 + 3, // 4096 CPUs x 8 simultaneous opens
Masktsid = (1 << Logtsid) - 1,
Shifttsid = Shiftcpu + Logcpu,
};
#define TTYPE(x) ( (int) ((uint32_t)(x).path) & Masktype )
#define TTCPU(q) ( ((q).path >> Shiftcpu) & Maskcpu )
#define TTCPUQID(c, t) ( ((c) << Shiftcpu) | (t))
#define TTTSID(q) ( ((q).path >> Shifttsid) & Masktsid )
#define TTTSIDQID(q, i) ( ((i) << Shifttsid) | (q).path )
/*
* ttrace timestamp pool and accessor
*/
static uintptr_t *ttdevtimestamp; // array of open file timestamps
static struct u16_pool *ttdevtspool; // pool of timestamp indices
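/* Each writable open of a cpu data file owns a tsid, an index into
 * ttdevtimestamp. tsid 0 is reserved at init as the read-only/NULL sentinel
 * with a fixed timestamp of 1; a slot holding (uintptr_t) -1 is free. */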
static inline int get_tsid(void) {
return (ttdevtspool)? get_u16(ttdevtspool) : -1;
}
static inline void put_tsid(int tsid) {
dassert(tsid >= 1 && ttdevtspool);
put_u16(ttdevtspool, tsid);
}
/*
* ttrace device gen implementation
*
* #T directory layout
* [-1] {".", {Ttdevdirqid, 0, QTDIR}, 0, TTPERMDIR},
* [0..ncpu-1] {"cpunnn", {Ttdevcpudataqid|coreid}, 0, TTPERMRWFILE},
* [ncpu] {"ctl", {Ttdevctlqid}, TTRACE_CTL_LEN, TTPERMROFILE},
* [ncpu+1] {"aux", {Ttdevauxqid}, 0, TTPERMRWFILE},
*/
/* Generate qids for the top level directory */
static inline int ttdev1gen(const struct chan *c, int s, struct qid *qp)
{
int ret = 1;
int path = -1;
/* Must only be called to decode top level dir channel */
dassert(TTYPE(c->qid) == Ttdevdirqid);
if (s < num_cpus) // "cpunnn" data files
path = TTCPUQID(s, Ttdevcpudataqid);
else {
switch (s - num_cpus) {
case 0: path = Ttdevctlqid; break; // "ctl"
case 1: path = Ttdevauxqid; break; // "aux"
default: return -1;
}
}
dassert(path > 0);
mkqid(qp, path, 0, QTFILE);
return ret;
}
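/* Devgen callback used by devwalk, devstat and devdirread: maps the directory
 * index 's' to a child qid (via ttdev1gen) and fills in the dir entry. */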
static int ttdevgen(struct chan *c, char *unused_name,
struct dirtab *unused_tab, int ntab, int s, struct dir *dp)
{
/* Always return the top ttrace dir for '..'. DEVDOTDOT is negative, so
 * handle it before asserting that s is a valid directory index. */
if (s == DEVDOTDOT) {
static const struct qid topqid = {Ttdevdirqid, 0, QTDIR};
devdir(c, topqid, "#T", 0, eve, TTPERMDIR, dp);
return 1;
}
dassert(s >= 0);
struct qid q = c->qid;
if (Ttdevdirqid == TTYPE(q) && ttdev1gen(c, s, &q) < 0)
return -1;
const char *name = NULL;
long perm = TTPERMRWFILE;
switch (TTYPE(q)) {
case Ttdevctlqid: name = "ctl"; break;
case Ttdevauxqid: name = "aux"; perm = TTPERMROFILE; break;
case Ttdevcpudataqid:
snprintf(get_cur_genbuf(), GENBUF_SZ, "cpu%03d", (int) TTCPU(q));
name = get_cur_genbuf();
break;
default:
panic("devttrace: Where did bad qid come from?\n");
case Ttdevdirqid:
panic("devttrace: What happened to ttdev1gen decode?\n");
}
dassert(name);
devdir(c, q, (char *) name, 0, eve, perm, dp);
return 1;
}
/*
* ttrace read implementation
*/
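/* Copy len bytes of buf to the reader's buffer at offset, via memcpy_to_user
 * when called in process context. Raises Eshort if the copy would pass the
 * end of the n-byte destination buffer. */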
static size_t ttdevcopyout(char *va, long n, size_t offset,
const char *buf, long len)
{
if (offset + len > n)
error(Eshort);
if (!current)
memcpy(&va[offset], buf, len);
else if (ESUCCESS != memcpy_to_user(current, &va[offset], buf, len)) {
/* UMEM */
// TODO(gvdl): No p9 equivalent to EFAULT, determine causes of failure.
error(Enovmem);
}
return len;
}
/* Context for trace_ring_foreach call of ttdevread_cpu_entry() */
#define TTRACE_ENTRY_QUADS (sizeof(struct ttrace_entry) / sizeof(uint64_t))
/* #quads * (whitespace + len(hex(quad))) */
#define CTXT_GENBUF_SZ (TTRACE_ENTRY_QUADS * (1 + 2 * sizeof(uint64_t)))
struct ttdevread_cpu_ctxt {
int64_t c;
uintptr_t min_timestamp;
char *va;
long n;
char genbuf[CTXT_GENBUF_SZ];
};
static inline int ttdevhexdigit(uint8_t x)
{
return "0123456789abcdef"[x];
}
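/* trace_ring_foreach callback: snapshots one ring entry, skips partial or
 * too-old entries, formats the rest as space-separated 16-digit hex quads
 * terminated by a newline, and copies the line out to the reader. */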
static void ttdevread_cpu_entry(void *ventry, void *vctxt)
{
struct ttdevread_cpu_ctxt *ctxt = (struct ttdevread_cpu_ctxt *) vctxt;
/* A cache line aligned copy of the input entry should make partial entries
 * less likely. Still, a partial entry is bracketed with timestamp == -1 */
uint8_t buf[2 * sizeof(struct ttrace_entry)]; // 128 byte buffer
const uintptr_t size_mask = sizeof(struct ttrace_entry) - 1;
struct ttrace_entry* entry = (struct ttrace_entry *)
(((uintptr_t) buf + size_mask) & ~size_mask); // align to cache line
*entry = *((struct ttrace_entry *) ventry); // Grab the entry
/* If time stamp == -1 (i.e. entry is a partial) or is less than
* the minimum then ignore this entry */
if (!(entry->timestamp + 1) || entry->timestamp < ctxt->min_timestamp)
return;
uint64_t *sqp = (uint64_t *) entry;
char *dcp = ctxt->genbuf;
for (int i = 0; i < TTRACE_ENTRY_QUADS; i++) {
uint64_t quad = sqp[i];
/* Emit all 16 hex digits of the quad, most significant nibble first,
 * so each quad takes 17 bytes including the trailing space; this is
 * what CTXT_GENBUF_SZ and the dassert below expect. */
for (int j = 0; j < 16; j++) {
*dcp++ = ttdevhexdigit((quad >> 60) & 0xf);
quad <<= 4;
}
*dcp++ = ' ';
}
dassert(&ctxt->genbuf[sizeof(ctxt->genbuf)] == dcp);
dcp[-1] = '\n'; // Replace trailing space with a newline
ctxt->c += ttdevcopyout(ctxt->va, ctxt->n, ctxt->c,
ctxt->genbuf, sizeof(ctxt->genbuf));
}
/* ttdevread_mintimestamp takes the open file's timestamp id and the I/O's
 * offset and returns the minimum timestamp last requested in a write. When
 * the channel has been opened read-only we complete the offset == 0 request
 * and return end of file for all subsequent (offset > 0) requests; this
 * allows cat to return one page of data. */
static inline uintptr_t ttdevread_mintimestamp(const int tsid, int64_t offset)
{
/* The ttdevread_cpu code cannot deal sensibly with nonzero offsets without
 * making the code much more complicated; probably not worth it. */
if (offset)
return 0;
const uintptr_t min_timestamp = ttdevtimestamp[tsid];
if (min_timestamp > read_tscp()) {
// no point in trying to read the future.
error(Ebadarg);
}
return min_timestamp;
}
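/* Dump a core's trace ring to the reader, one formatted entry per line.
 * Entries older than the open file's minimum timestamp are skipped. */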
static inline long ttdevread_cpu(const int tsid,
int coreid, void *va, long n, int64_t offset)
{
ERRSTACK(1);
const uintptr_t min_timestamp = ttdevread_mintimestamp(tsid, offset);
if (!min_timestamp)
return 0;
struct ttdevread_cpu_ctxt *ctxt = kzalloc(sizeof(*ctxt), KMALLOC_WAIT);
if (!ctxt)
error(Enomem);
else if (waserror()) {
kfree(ctxt);
nexterror();
}
ctxt->min_timestamp = min_timestamp;
ctxt->va = va;
ctxt->n = n;
struct trace_ring * const ring = get_ttrace_ring_for_core(coreid);
trace_ring_foreach(ring, &ttdevread_cpu_entry, ctxt);
const long count = ctxt->c; // Save the count before freeing the context
kfree(ctxt);
poperror();
return count;
}
static inline long ttdevread_ctl(void *va, long n, int64_t offset)
{
/* Read the ttrace_type_mask and render it as a 'setmask' ctl command:
 *
 * cmd ttrace_bits bit mask
 * setmask 0x0123456789abcdef 0x0123456789abcdef\n
 *
 * 45 characters plus the newline, TTRACE_CTL_LEN == 46 bytes in all.
 */
char * const buffer = get_cur_genbuf();
static_assert(TTRACE_CTL_LEN <= GENBUF_SZ);
int c = snprintf(buffer, GENBUF_SZ, "setmask 0x%016llx 0x%016llx\n",
ttrace_type_mask & TTRACE_TYPE_MASK, TTRACE_TYPE_MASK);
dassert(TTRACE_CTL_LEN == c);
return readstr(offset, va, n, buffer);
}
/* This code will be more efficient if the user data is page aligned, but
 * should work no matter which alignment the user gives us.
 * Output:
 * Page 0: struct ttrace_version
 * Page 1-n: Auxiliary buffer.
 */
static inline long ttdevread_aux(uint8_t *va, long n, int64_t offset)
{
ptrdiff_t dummy_offset;
struct ttrace_version vers;
fill_ttrace_version(&vers);
const long buffer_length = vers.buffer_mask + 1;
if (offset)
return 0; // Only allow single reads at offset 0, all others are empty
else if (n < PGSIZE + buffer_length)
error(Etoosmall);
size_t c = PGSIZE; // Advance count to second page
/* Implements reader side of the auxiliary buffer protocol, see
* _ttrace_point_string comment in ttrace.c
*
* Touch memory to get any page faults out of the way now, hopefully we
* will not be under paging pressure. Note that I'm accumulating into the
* vers.last_offset so that the compiler doesn't throw out the memory touch
* loop, the vers.last_offset is reset when we take a buffer snapshot.
*
* TODO(gvdl): formalise memory pinning for later I/O.
*/
vers.last_offset = 0;
size_t t;
for (t = 0; t < n; t += PGSIZE)
vers.last_offset += atomic_read((atomic_t *) &va[t]);
get_ttrace_aux_buffer_snapshot(&dummy_offset, &vers.last_offset);
const uint8_t * const aux_buffer = get_ttrace_aux_buffer();
c += ttdevcopyout((char *) va, n, c, (const char *) aux_buffer,
buffer_length);
get_ttrace_aux_buffer_snapshot(&vers.first_offset, &dummy_offset);
/* Output version with buffer offsets last */
ttdevcopyout((char *) va, n, 0, (const char *) &vers, sizeof(vers));
return c;
}
/*
* ttrace write utility routines and macros
*/
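/* strtoul wrapper that raises Ebadarg when no digits were consumed. */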
static uint64_t parseul(const char * const num_str, int base)
{
char *end_num = NULL;
uint64_t ret = strtoul(num_str, &end_num, base);
if (num_str == end_num)
error(Ebadarg);
return ret;
}
/*
* ttrace devtab entry points
*/
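/* Allocate the timestamp array and id pool, sized for TTRACE_NUM_OPENERS
 * simultaneous opens per cpu, and reserve tsid 0 as the read-only sentinel. */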
static void ttdevinit(void)
{
static_assert(MAX_NUM_CPUS <= Maskcpu); // Assert encoding is still good
/* Support up to 8 simultaneous opens on the ttrace/cpunnn files. */
const int pool_size
= min(TTRACE_MAX_TSID, TTRACE_NUM_OPENERS * num_cpus);
/* Test for too many cpus for our tsid mechanism, re-implement. */
dassert(num_cpus <= pool_size);
if (num_cpus > pool_size) {
printk("Insufficient ids for ttrace timestamp pool");
return;
}
const size_t ts_size = pool_size * sizeof(*ttdevtimestamp);
ttdevtimestamp = kmalloc(ts_size, KMALLOC_WAIT);
memset(ttdevtimestamp, 0xff, ts_size);
ttdevtspool = create_u16_pool(pool_size);
/* Always allocate 0 as a unused/NULL sentinel */
int tsidnull = get_tsid();
assert(!tsidnull);
ttdevtimestamp[tsidnull] = 1; // tsid[0] is set to timestamp of 1.
}
#define KFREE_AND_NULL(x) do { kfree(x); x = NULL; } while(false)
static void ttdevshutdown(void)
{
KFREE_AND_NULL(ttdevtspool);
KFREE_AND_NULL(ttdevtimestamp);
}
static struct chan *ttdevattach(char *spec)
{
return devattach('T', spec);
}
static struct walkqid *ttdevwalk(struct chan *c, struct chan *nc,
char **name, int nname)
{
return devwalk(c, nc, name, nname, NULL, 0, ttdevgen);
}
static int ttdevstat(struct chan *c, uint8_t *db, int n)
{
int ret = devstat(c, db, n, NULL, 0, ttdevgen);
return ret;
}
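/* Opening a cpu data file O_RDWR allocates a tsid and records it in the qid
 * path so later reads and writes can find their timestamp slot; read-only
 * opens share the sentinel tsid 0 and its fixed timestamp of 1. */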
static struct chan *ttdevopen(struct chan *c, int omode)
{
const int o = openmode(omode);
int tsid = TTTSID(c->qid);
switch(TTYPE(c->qid)) {
default:
assert(false); // How did a bad chan get to us?
case Ttdevdirqid:
dassert(c->qid.type & QTDIR);
if (o != OREAD)
error(Eperm);
break;
case Ttdevcpudataqid:
if (tsid)
break; // Already allocated, reopen
if (o == O_RDWR) {
tsid = get_tsid();
if (tsid < 0)
error(Enoenv);
else
dassert(tsid);
ttdevtimestamp[tsid] = 1;
c->qid.path = TTTSIDQID(c->qid, tsid); // Record tsid
} else if (o == O_RDONLY) {
// Nothing to do for O_RDONLY
} else
error(Eperm);
break;
case Ttdevauxqid:
case Ttdevctlqid:
break;
}
c->mode = o;
c->flag |= COPEN;
c->offset = 0;
return c;
}
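/* Closing a cpu data file marks its timestamp slot free (-1) and returns the
 * tsid to the pool. */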
static void ttdevclose(struct chan *c)
{
if (!(c->flag & COPEN))
return;
const int tsid = TTTSID(c->qid);
switch (TTYPE(c->qid)) {
case Ttdevcpudataqid:
/* Release timestamp */
if (tsid) {
ttdevtimestamp[tsid] = -1;
put_tsid(tsid);
}
break;
case Ttdevauxqid:
case Ttdevctlqid:
case Ttdevdirqid:
break;
default:
assert(false);
}
}
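/* Dispatch reads by qid type: directory listings, the current ctl mask, the
 * auxiliary string buffer, or a per-core trace dump. */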
static long ttdevread(struct chan *c, void *va, long n, int64_t offset)
{
const unsigned tsid = TTTSID(c->qid);
assert(tsid < ttdevtspool->size);
switch (TTYPE(c->qid)) {
case Ttdevdirqid: // ttrace directory read
return devdirread(c, va, n, NULL, 0, ttdevgen);
case Ttdevctlqid:
return ttdevread_ctl(va, n, offset);
case Ttdevauxqid:
return ttdevread_aux(va, n, offset);
case Ttdevcpudataqid:
return ttdevread_cpu(tsid, TTCPU(c->qid), va, n, offset);
}
return 0; // Not us
}
#define CONST_STRNCMP(vp, conststr) strncmp((vp), conststr, sizeof(conststr)-1)
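/* Writes accept textual commands: "setmask <value> <mask>", "setbits <value>"
 * and "clrbits <mask>" on ctl, and "settimestamp <minimum timestamp>" on a
 * cpu data file. For example (assuming #T is bound into the namespace):
 *	echo 'settimestamp 12345' > '#T/cpu000'
 */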
static long ttdevwrite(struct chan *c, void *a, long n, int64_t unused_off)
{
ERRSTACK(1);
struct cmdbuf *cb;
static const char ctlstring[]
= "setmask <value> <mask>|setbits <value>|clrbits <mask>";
static const char tsstring[] = "settimestamp <minimum timestamp>";
cb = parsecmd(a, n);
if (waserror()) {
kfree(cb);
nexterror();
}
const unsigned tsid = TTTSID(c->qid);
assert(tsid < ttdevtspool->size);
uint64_t mask = TTRACE_TYPE_MASK;
uint64_t value = 0;
switch(TTYPE(c->qid)) {
default:
error(Ebadusefd);
case Ttdevctlqid:
if (cb->nf == 3 && !CONST_STRNCMP(cb->f[0], "setmask")) {
value = parseul(cb->f[1], 0);
mask &= parseul(cb->f[2], 0);
} else if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "setbits"))
value = parseul(cb->f[1], 0);
else if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "clrbits"))
mask &= parseul(cb->f[1], 0);
else
error((char *) ctlstring);
/* Thread safe, but... let's face it, if we have competing controllers
 * setting and clearing mask bits then the behaviour is going to be
 * unexpected. Perhaps we could enforce exclusive open of the ctl
 * channel. */
{
uint64_t cur_mask, new_mask;
do {
cur_mask = atomic_read((atomic_t *) &ttrace_type_mask);
new_mask = (cur_mask & ~mask) | (value & mask);
} while (!atomic_cas((atomic_t *) &ttrace_type_mask,
cur_mask, new_mask));
}
break;
case Ttdevcpudataqid:
if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "settimestamp")) {
if (!tsid)
error(Ebadfd);
const unsigned long ts = parseul(cb->f[1], /* base */ 0);
if (ts > read_tscp()) // Reject timestamps from the future
error(Ebadarg);
ttdevtimestamp[tsid] = ts;
} else
error((char *) tsstring);
break;
}
kfree(cb);
poperror();
return n;
}
struct dev ttdevdevtab __devtab = {
'T',
"ttrace",
devreset,
ttdevinit,
ttdevshutdown,
ttdevattach,
ttdevwalk,
ttdevstat,
ttdevopen,
devcreate,
ttdevclose,
ttdevread,
devbread,
ttdevwrite,
devbwrite,
devremove,
devwstat,
};