TTRACE: timed trace kernel profiler.

Initial ttrace development, checkpoint commit.
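
Sketch of the intended userland usage (illustrative only; the ctl and cpu
file command grammars are the ones defined in ttdevread_ctl/ttdevwrite
below, and buf is a caller-supplied buffer):

	/* Hypothetical reader: enable all trace types, then dump core 0 */
	int ctl = open("#T/ctl", O_RDWR);
	write(ctl, "setbits 0xfffffffffff00000", 26);
	int cpu0 = open("#T/cpu000", O_RDWR);
	write(cpu0, "settimestamp 1", 14);        /* everything since boot */
	ssize_t n = read(cpu0, buf, sizeof(buf)); /* one entry per line */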
diff --git a/Kconfig b/Kconfig
index b37e18d..fb30c90 100644
--- a/Kconfig
+++ b/Kconfig
@@ -131,14 +131,6 @@
 		(meaning it'll clobber older events).  Always enables scheduling timed
 		tracing.
 
-config TTRACE_SEMAPHORES
-	bool "Timed Trace Semaphores"
-	depends on TTRACE
-	default n
-	help
-		Turn on timestamp based thread tracing recording a timestamp, a
-		semaphore up/down event type thread_id(gvdl?) in the TTRACE buffer.
-
 config TTRACE_CV
 	bool "Timed Trace Condition Vars"
 	depends on TTRACE
@@ -148,14 +140,30 @@
 		condition waits and signals and underlying semaphore up/down event
 		types thread_id(gvdl?) in the TTRACE buffer.
 
+config TTRACE_PROCESS
+	bool "Timed Trace Process spawn, exec and death"
+	depends on TTRACE
+	default n
+	help
+		Turn on timestamp based process tracing, recording the timestamp
+		of interesting process related events such as proc creation,
+		destruction and exec.
+
 config TTRACE_RWLOCK
 	bool "Timed Trace Readers/Writers locking"
 	depends on TTRACE
 	default n
 	help
-		Turn on timestamp based thread tracing recording the timestamp,
-		reader/writer lock type, thread_id, number of readers waiting, number
-		of writes waiting.
+		Turn on timestamp based tracing, recording the timestamp and other
+		data for reader/writer lock interactions.
+
+config TTRACE_SEMAPHORES
+	bool "Timed Trace Semaphores"
+	depends on TTRACE
+	default n
+	help
+		Turn on timestamp based thread tracing recording a timestamp, a
+		semaphore up/down event type thread_id(gvdl?) in the TTRACE buffer.
 
 endmenu
 
diff --git a/kern/drivers/dev/devttrace.c b/kern/drivers/dev/devttrace.c
index 067282f..7ea62bd 100644
--- a/kern/drivers/dev/devttrace.c
+++ b/kern/drivers/dev/devttrace.c
@@ -7,9 +7,9 @@
  * in the LICENSE file.
  */
 
-//
-// This file implements the #T device and was based upon the UCB Plan 9 kprof.c
-//
+/*
+ * This file implements the #T device and was based upon the UCB Plan 9 kprof.c
+ */
 
 #include <assert.h>
 #include <atomic.h>
@@ -19,14 +19,11 @@
 #include <smp.h>
 #include <stdio.h>
 #include <string.h>
-#include <syscall.h>
+#include <trace.h>
+#include <ttrace.h>
 #include <umem.h>
 
 #include <ros/fs.h>
-#include <ros/ttrace.h>
-
-#warning Remove this scaffold when ttrace profiling is going
-uint64_t ttrace_type_mask;
 
 /*
  * ttrace macros and constant data
@@ -41,10 +38,10 @@
 #define TTRACE_NUM_OPENERS 8
 #define TT_SAFE_GENBUF_SZ  (GENBUF_SZ-1)  // Leave room for newline
 
-// TODO(gvdl): I don't get plan 9's permissions, why do directories get group
-// rx permissions, and what's with the DMDIR. Some devices use it and others
-// don't. In theory the DMDIR is copied over by a higher layer but I have no
-// idea why two copies seems necessary.
+/* TODO(gvdl): I don't get plan 9's permissions, why do directories get group
+ * rx permissions, and what's with the DMDIR. Some devices use it and others
+ * don't. In theory the DMDIR is copied over by a higher layer but I have no
+ * idea why two copies seems necessary. */
 #define TTPERMDIR    (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|DMDIR)
 #define TTPERMRWFILE (S_IRUSR|S_IWUSR)
 #define TTPERMROFILE (S_IRUSR)
@@ -53,7 +50,7 @@
 	Ttdevdirqid = 0,
 	Ttdevbase,
 	Ttdevctlqid = Ttdevbase,	// 1
-	Ttdevdataqid,				// 2
+	Ttdevauxqid,				// 2
 	Ttdevcpudataqid,			// 3
 
 	Logtype = 4,  // Enough for 16 unique qids types
@@ -64,8 +61,8 @@
 	Maskcpu = (1 << Logcpu) - 1,
 	Shiftcpu = Shifttype + Logtype,
 
-	// ttrace timestamp id, used by data file readers
-	Logtsid = 13 + 3, // 4096 ttrace/cpus + a ttrace/data file by 8 opened
+	/* ttrace timestamp id, used by data file readers */
+	Logtsid = 12 + 3, // 4096 cpus by 8 simultaneous opens
 	Masktsid = (1 << Logtsid) - 1,
 	Shifttsid = Shiftcpu + Logcpu,
 };
@@ -77,34 +74,42 @@
 #define TTTSIDQID(q, i)	( ((i) << Shifttsid) | (q).path )
 
 /*
- * ttrace static data
+ * ttrace timestamp pool and accessor
  */
 static uintptr_t *ttdevtimestamp;		// array of open file timestamps
-static struct u16_pool *ttdevtspool;		// pool of timestamp indices
+static struct u16_pool *ttdevtspool;	// pool of timestamp indices
+static inline int get_tsid(void)
+{
+	return ttdevtspool ? get_u16(ttdevtspool) : -1;
+}
+
+static inline void put_tsid(int tsid)
+{
+	dassert(tsid >= 1 && ttdevtspool);
+	put_u16(ttdevtspool, tsid);
+}
 
-//
-// ttrace device gen implementation
-//
-// #T directory layout
-// [-1]        {".",      {Ttdevdirqid, 0, QTDIR},    0, TTPERMDIR},
-// [0..ncpu-1] {"cpunnn", {Ttdevcpudataqid|core_id},  0, TTPERMRWFILE},
-// [ncpu]      {"ctl",    {Ttdevctlqid}, TTRACE_CTL_LEN, TTPERMRWFILE},
-// [ncpu+1]    {"data",   {Ttdevdataqid},             0, TTPERMRWFILE},
+/*
+ * ttrace device gen implementation
+ *
+ * #T directory layout
+ * [-1]        {".",      {Ttdevdirqid, 0, QTDIR},    0, TTPERMDIR},
+ * [0..ncpu-1] {"cpunnn", {Ttdevcpudataqid|coreid},   0, TTPERMRWFILE},
+ * [ncpu]      {"ctl",    {Ttdevctlqid}, TTRACE_CTL_LEN, TTPERMRWFILE},
+ * [ncpu+1]    {"aux",    {Ttdevauxqid},              0, TTPERMROFILE},
+ */
 
-// Generate qids for the top level directory
+/* Generate qids for the top level directory */
 static inline int ttdev1gen(const struct chan *c, int s, struct qid *qp)
 {
 	int ret = 1;
 	int path = -1;
-	// Must only be called to decode top level dir channel
+	/* Must only be called to decode top level dir channel */
 	dassert(TTYPE(c->qid) == Ttdevdirqid);
 
 	if (s < num_cpus) // "cpunnn" data files
 		path = TTCPUQID(s, Ttdevcpudataqid);
 	else {
 		switch (s - num_cpus) {
-		case 0:  path = Ttdevctlqid;  break; // "ctl"
-		case 1:  path = Ttdevdataqid; break; // "data"
+		case 0:  path = Ttdevctlqid; break;		// "ctl"
+		case 1:  path = Ttdevauxqid; break;		// "aux"
 		default: return -1;
 		}
 	}
@@ -130,9 +135,10 @@
 		return -1;
 
 	const char *name = NULL;
+	long perm = TTPERMRWFILE;
 	switch (TTYPE(q)) {
-	case Ttdevctlqid:  name = "ctl";  break;
-	case Ttdevdataqid: name = "data"; break;
+	case Ttdevctlqid: name = "ctl"; break;
+	case Ttdevauxqid: name = "aux"; perm = TTPERMROFILE; break;
 	case Ttdevcpudataqid:
 		snprintf(get_cur_genbuf(), GENBUF_SZ, "cpu%03d", TTCPU(q));
 		name = get_cur_genbuf();
@@ -144,7 +150,7 @@
 		panic("devttrace: What happened to ttdev1gen decode?\n");
 	}
 	dassert(name);
-	devdir(c, q, (char *) name, 0, eve, TTPERMRWFILE, dp);
+	devdir(c, q, (char *) name, 0, eve, perm, dp);
 	return 1;
 }
 
@@ -160,6 +166,7 @@
 	if (!current)
 		memcpy(&va[offset], buf, len);
 	else if (ESUCCESS != memcpy_to_user(current, &va[offset], buf, len)) {
+		/* UMEM */
 		// TODO(gvdl): No p9 equivalent to EFAULT, determine causes of failure.
 		error(Enovmem);
 	}
@@ -167,124 +174,72 @@
 	return len;
 }
 
-static inline int ttdevputchar(char *buf, int len, char c)
+/* Context for trace_ring_foreach call of ttdevread_cpu_entry() */
+#define TTRACE_ENTRY_QUADS (sizeof(struct ttrace_entry) / sizeof(uint64_t))
+/* #quads * (whitespace + len(hex(quad))) */
+#define CTXT_GENBUF_SZ     (TTRACE_ENTRY_QUADS * (1 + 2 * sizeof(uint64_t)))
+
+struct ttdevread_cpu_ctxt {
+	int64_t c;
+	uintptr_t min_timestamp;
+	char *va;
+	long n;
+	char genbuf[CTXT_GENBUF_SZ];
+};
+
+static inline int ttdevhexdigit(uint8_t x)
 {
-	dassert(len >= 1);
-	*buf = c;
-	return 1;
+	return "0123456789abcdef"[x];
 }
 
-static inline int ttdevputhex8(char* buf, int len, uint8_t x)
+static void ttdevread_cpu_entry(void *ventry, void *vctxt)
 {
-	static const char hex_digits[] = "0123456789abcdef";
-	int c = 0;
-	c += ttdevputchar(&buf[c], len-c, hex_digits[x >> 4]);
-	c += ttdevputchar(&buf[c], len-c, hex_digits[x & 0xf]);
-	return c;
-}
+	struct ttdevread_cpu_ctxt *ctxt = (struct ttdevread_cpu_ctxt *) vctxt;
+	/* A cache line aligned copy of the input entry should make partial
+	 * entries less likely. A partial entry is still bracketed with
+	 * timestamp == -1 */
+	uint8_t buf[2 * sizeof(struct ttrace_entry)];  // 128 byte buffer
+	const uintptr_t size_mask = sizeof(struct ttrace_entry) - 1;
+	struct ttrace_entry* entry = (struct ttrace_entry *)
+		(((uintptr_t) buf + size_mask) & ~size_mask); // align to cache line
+	*entry = *((struct ttrace_entry *) ventry);  // Grab the entry
 
-static inline int ttdevputhex16(char *buf, int len, uint16_t x)
-{
-	int c = ttdevputhex8(&buf[0], len, x >> 8);
-	ttdevputhex8(&buf[c], len-c, (uint8_t) x);
-	return sizeof(uint16_t) * 2;
-}
+	/* If time stamp == -1 (i.e. entry is a partial) or is less than
+	 * the minimum then ignore this entry */
+	if (!(entry->timestamp + 1) || entry->timestamp < ctxt->min_timestamp)
+		return;
 
-static int ttdevputhex32(char *buf, int len, uint32_t x)
-{
-	int c = ttdevputhex16(&buf[0], len, x >> 16);
-	ttdevputhex16(&buf[c], len-c, (uint16_t) x);
-	return sizeof(uint32_t) * 2;
-}
-
-static int ttdevputhex64(char *buf, int len, uint64_t x)
-{
-	int c = ttdevputhex32(&buf[0], len, x >> 32);
-	ttdevputhex32(&buf[c], len-c, (uint32_t) x);
-	return sizeof(uint64_t) * 2;
-}
-
-static inline int ttdevputmem(char *buf, int len, const void *mem, int mem_len)
-{
-	dassert(mem_len <= len);
-	memcpy(buf, mem, mem_len);
-	return mem_len;
-}
-
-static int ttdevputhdr(char *buf, int len,
-					   uint8_t rec_len, uint8_t tag, uintptr_t timestamp)
-{
-	int c = 0;
-	c += ttdevputhex8(&buf[c], len-c, rec_len);
-	c += ttdevputhex8(&buf[c], len-c, tag);
-	c += ttdevputhex64(&buf[c], len-c, timestamp);
-	return c;
-}
-
-static size_t ttdev_readnamerec(void *va, long n, uint8_t tag,
-								uintptr_t timestamp, uint64_t id,
-								const char *name)
-{
-	const int len = TT_SAFE_GENBUF_SZ; // Always leave room for newline
-	char * const buffer = get_cur_genbuf();
-	int remaining = strlen(name);
-	size_t out_len = 0;
-
-	/* Output first header */
-	int nc = min(len - TTRACEH_NAME_LEN, remaining);
-	int rec_len = TTRACEH_NAME_LEN + nc; dassert(rec_len <= len);
-	int c = 0;
-	c += ttdevputhdr(&buffer[c], len-c, rec_len, tag, timestamp);
-	c += ttdevputhex64(&buffer[c], len-c, id);
-	c += ttdevputmem(&buffer[c], len-c, name, nc);
-	c += ttdevputchar(&buffer[c], len+1-c, '\n');
-	out_len = ttdevcopyout(va, n, out_len, buffer, c);
-
-	tag |= TTRACEH_TAG_CONT;
-	const int buf_nm_len = len - TTRACEH_CONT_LEN;
-	while ((remaining -= nc) > 0) {
-		name += nc;
-
-		nc = min(buf_nm_len, remaining);
-		rec_len = TTRACEH_CONT_LEN + nc; dassert(rec_len <= len);
-		c = 0;
-		c += ttdevputhex8(&buffer[c], len-c, rec_len);
-		c += ttdevputhex8(&buffer[c], len-c, tag);
-		c += ttdevputmem(&buffer[c], len-c, name, nc);
-		c += ttdevputchar(&buffer[c], len+1-c, '\n');
-		out_len += ttdevcopyout(va, n, out_len, buffer, c);
+	uint64_t *sqp = (uint64_t *) entry;
+	char *dcp = ctxt->genbuf;
+	for (int i = 0; i < TTRACE_ENTRY_QUADS; i++) {
+		const uint64_t quad = sqp[i];
+		/* Emit all 16 nibbles of the quad, most significant first; each
+		 * quad takes 16 hex digits plus a space (see CTXT_GENBUF_SZ) */
+		for (int shift = 60; shift >= 0; shift -= 4)
+			*dcp++ = ttdevhexdigit((quad >> shift) & 0xf);
+		*dcp++ = ' ';
 	}
-	return out_len;
+	dassert(&ctxt->genbuf[sizeof(ctxt->genbuf)] == dcp);
+	dcp[-1] = '\n';  // Replace trailing space with a newline
+
+	ctxt->c += ttdevcopyout(ctxt->va, ctxt->n, ctxt->c,
+			                ctxt->genbuf, sizeof(ctxt->genbuf));
 }
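+/* For example, with TTRACE_ENTRY_QUADS == 8 each surviving entry is emitted
+ * as eight space-separated 16-digit hex quads, newline terminated
+ * (fabricated values):
+ * 00000000004c4b40 0000000000100020 ... 0000000000000000
+ */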
 
-static size_t ttdevread_info(void *va, long n, uintptr_t timestamp)
-{
-	char * const buffer = get_cur_genbuf();
-	const int len = TT_SAFE_GENBUF_SZ; /* Room for new line */
-	const int rec_len = TTRACEH_LEN + 12;  /* header + 3*h[4] versions */
-	dassert(rec_len <= len);
-	int c = 0;
-
-	c += ttdevputhdr(&buffer[c], len-c, rec_len, TTRACEH_TAG_INFO, timestamp);
-	c += ttdevputhex16(&buffer[c], len-c, TTRACEH_V1);
-	c += ttdevputhex16(&buffer[c], len-c, TTRACEE_V1);
-	c += ttdevputhex16(&buffer[c], len-c, num_cpus);
-	dassert(c == rec_len); // Don't count '\n' in rec_len
-	c += ttdevputchar(&buffer[c], len+1-c, '\n'); // Always have room for newline
-
-	return ttdevcopyout(va, n, /* offset */ 0, buffer, c);
-}
-
-// iotimestamp takes the timestamp pointer and the I/Os offset and returns the
-// minimum timestamp last requested in a write. In the case where the channel
-// has been opened readonly we will complete the offset == 0 request and return
-// end of file for all subsequent (offset > 0) requests; this allows cat to
-// return one page of data.
+/* iotimestamp takes the timestamp pointer and the I/Os offset and returns the
+ * minimum timestamp last requested in a write. In the case where the channel
+ * has been opened readonly we will complete the offset == 0 request and return
+ * end of file for all subsequent (offset > 0) requests; this allows cat to
+ * return one page of data. */
 static inline uintptr_t ttdevread_mintimestamp(const int tsid, int64_t offset)
 {
-	// If we don't have a timestamp, then only satisfy a single I/O, that is
-	// anything with an offset of 0
-	if (!tsid && offset)
+	/* ttdevread_cpu code can not deal sensibly with offsets without making the
+	 * code much more complicated, probably not worth it. */
+	if (offset)
 		return 0;
 
 	const uintptr_t min_timestamp = ttdevtimestamp[tsid];
@@ -296,13 +251,42 @@
 	return min_timestamp;
 }
 
+static inline long ttdevread_cpu(const int tsid,
+								 int coreid, void *va, long n, int64_t offset)
+{
+	ERRSTACK(1);
+	const uintptr_t min_timestamp = ttdevread_mintimestamp(tsid, offset);
+	if (!min_timestamp)
+		return 0;
+
+	struct ttdevread_cpu_ctxt *ctxt = kzalloc(sizeof(*ctxt), KMALLOC_WAIT);
+	if (!ctxt)
+		error(Enomem);
+	else if (waserror()) {
+		kfree(ctxt);
+		nexterror();
+	}
+
+	ctxt->min_timestamp = min_timestamp;
+	ctxt->va = va;
+	ctxt->n = n;
+
+	struct trace_ring * const ring = get_ttrace_ring_for_core(coreid);
+	trace_ring_foreach(ring, &ttdevread_cpu_entry, ctxt);
+
+	const long ret = ctxt->c;
+	kfree(ctxt);
+	poperror();
+	return ret;
+}
+
 static inline long ttdevread_ctl(void *va, long n, int64_t offset)
 {
-	// Read the ttrace_type_mask and create a 'setmask' ctl command
-	//
-	// cmd     ttrace_bits       bit mask
-	// setmask 0x0123456789abcdef 0x0123456789abcdef\n"
-	// 123456789012345678901234567890123456789012345 6  len 46 bytes
+	/* Read the ttrace_type_mask and create a 'setmask' ctl command
+	 *
+	 * cmd     ttrace_bits       bit mask
+	 * setmask 0x0123456789abcdef 0x0123456789abcdef\n"
+	 * 123456789012345678901234567890123456789012345 6  len 46 bytes
+	 */
 	char * const buffer = get_cur_genbuf();
 	static_assert(TTRACE_CTL_LEN <= GENBUF_SZ);
 
@@ -313,53 +297,52 @@
 	return readstr(offset, va, n, buffer);
 }
 
-static inline long ttdevread_data(const int tsid,
-								  uint8_t *va, long n, int64_t offset)
+/* This code will be more efficient if the user data is page aligned, but
+ * should work no matter which alignment the user gives.
+ * Output:
+ *   Page 0:   struct ttrace_version
+ *   Page 1-n: Auxiliary buffer.
+ */
+static inline long ttdevread_aux(uint8_t *va, long n, int64_t offset)
 {
-	const int min_timestamp = ttdevread_mintimestamp(tsid, offset);
-	if (!min_timestamp)
-		return 0;
+	ptrdiff_t dummy_offset;
+	struct ttrace_version vers;
+	fill_ttrace_version(&vers);
 
-	// All data requested, Copy out basic data: version ids and syscall table
-	size_t c = 0;  // Number of characters output
-	if (min_timestamp == 1) {
-		c += ttdevread_info(&va[c], n - c, min_timestamp);
-		for (size_t i = 0; i < max_syscall; i++) {
-			const char * const name = syscall_table[i].name;
-			if (!name) continue;
+	const long buffer_length = vers.buffer_mask + 1;
+	if (offset)
+		return 0; // Only allow single reads at offset 0, all others are empty
+	else if (n < PGSIZE + buffer_length)
+		error(Etoosmall);
 
-			c += ttdev_readnamerec(&va[c], n - c, TTRACEH_TAG_SYSC,
-								   min_timestamp, i, name);
-		}
-	}
+	size_t c = PGSIZE; // Advance count to second page
 
-	// Read ttrace data file, contains names and other data, this data is slow
-	// to store and generate and is expected to be done only rarely. Such as at
-	// process, semaphore, ktask creation.
+	/* Implements reader side of the auxiliary buffer protocol, see
+	 * _ttrace_point_string comment in ttrace.c
+	 *
+	 * Touch memory to get any page faults out of the way now, hopefully we
+	 * will not be under paging pressure. Note that I'm accumulating into the
+	 * vers.last_offset so that the compiler doesn't throw out the memory touch
+	 * loop, the vers.last_offset is reset when we take a buffer snapshot.
+	 *
+	 * TODO(gvdl): formalise memory pinning for later I/O.
+	 */
+	vers.last_offset = 0;
+	for (size_t t = 0; t < n; t += PGSIZE)
+		vers.last_offset += atomic_read((atomic_t *) &va[t]);
+
+	get_ttrace_aux_buffer_snapshot(&dummy_offset, &vers.last_offset);
+	const uint8_t * const aux_buffer = get_ttrace_aux_buffer();
+	c += ttdevcopyout(va, n, c, aux_buffer, buffer_length);
+	get_ttrace_aux_buffer_snapshot(&vers.first_offset, &dummy_offset);
+
+	/* Output version with buffer offsets last */
+	ttdevcopyout(va, n, 0, &vers, sizeof(vers));
+
 	return c;
 }
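+/* A minimal reader-side sketch of the layout above (hypothetical userland
+ * code; 'buf' is the page aligned destination of a read of the aux file):
+ *
+ *   struct ttrace_version *vers = (struct ttrace_version *) buf;
+ *   assert(vers->magic == TTRACEVH_MAGIC);  // else reader must byte-swap
+ *   uint8_t *aux = buf + PGSIZE;  // vers->buffer_mask + 1 bytes of entries
+ *   // walk struct ttrace_aux_entry records from vers->first_offset to
+ *   // vers->last_offset using ttrace_aux_entry_next()
+ */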
 
-static inline long ttdevread_cpu(const int tsid,
-								 int core_id, void *va, long n, int64_t offset)
-{
-	const int min_timestamp = ttdevread_mintimestamp(tsid, offset);
-	if (!min_timestamp)
-		return 0;
-
-#warning Scaffolding, test cpu timestamp reading
-	// Read the timestamp and output it in decimal (max 20 digits).
-	// cmd   timestamp
-	// setts 12345678901234567890\n"
-	// 12345678901234567890123456 7  len 27
-	char * const buffer = get_cur_genbuf();
-	static_assert(27 <= GENBUF_SZ);
-
-	int c = snprintf(buffer, GENBUF_SZ, "setts %lld\n", min_timestamp);
-	dassert(c <= 27);
-
-	return readstr(offset, va, n, buffer);
-}
-
 /*
  * ttrace write utility routines and macros
  */
@@ -372,8 +355,6 @@
 	return ret;
 }
 
-#define CONST_STRNCMP(vp, conststr) strncmp((vp), conststr, sizeof(conststr)-1)
-
 /*
  * ttrace devtab entry points
  */
@@ -381,29 +362,32 @@
 {
 	static_assert(MAX_NUM_CPUS <= Maskcpu);  // Assert encoding is still good
 
-	// Support upto 8 simultaneous opens on the ttrace/{data,cpu*} files.
+	/* Support up to 8 simultaneous opens on the ttrace/cpunnn files. */
 	const int pool_size
-		= min(TTRACE_MAX_TSID, TTRACE_NUM_OPENERS * (1 + num_cpus));
-	// Test for too many cpus for our tsid mechanism, re-implement.
-	dassert(1 + num_cpus <= pool_size);
-	if (1 + num_cpus > pool_size) return;
+		= min(TTRACE_MAX_TSID, TTRACE_NUM_OPENERS * num_cpus);
+	/* Test for too many cpus for our tsid mechanism, re-implement. */
+	dassert(num_cpus <= pool_size);
+	if (num_cpus > pool_size) {
+		printk("Insufficient ids for ttrace timestamp pool");
+		return;
+	}
 
 	const size_t ts_size = pool_size * sizeof(*ttdevtimestamp);
 	ttdevtimestamp = kmalloc(ts_size, KMALLOC_WAIT);
 	memset(ttdevtimestamp, 0xff, ts_size);
 	ttdevtspool = create_u16_pool(pool_size);
 
-	// Always allocate 0 as a unused/NULL sentinel
-	int tsidnull = get_u16(ttdevtspool);
+	/* Always allocate 0 as a unused/NULL sentinel */
+	int tsidnull = get_tsid();
 	assert(!tsidnull);
-	ttdevtimestamp[tsidnull] = 1;  // tsid==0 always has a 1 minimum timestamp.
+	ttdevtimestamp[tsidnull] = 1;  // tsid[0] is set to timestamp of 1.
 }
 
-#define kfree_and_null(x) do { kfree(x); x = NULL; } while(false)
+#define KFREE_AND_NULL(x) do { kfree(x); x = NULL; } while(false)
 static void ttdevshutdown(void)
 {
-	kfree_and_null(ttdevtspool);
-	kfree_and_null(ttdevtimestamp);
+	KFREE_AND_NULL(ttdevtspool);
+	KFREE_AND_NULL(ttdevtimestamp);
 }
 
 static struct chan *ttdevattach(char *spec)
@@ -437,14 +421,12 @@
 			error(Eperm);
 		break;
 
-	case Ttdevdataqid:
 	case Ttdevcpudataqid:
 		if (tsid)
 			break; // Already allocated, reopen
 
-		// Allocate a timestamp from the pool
 		if (o == O_RDWR) {
-			tsid = get_u16(ttdevtspool);
+			tsid = get_tsid();
 			if (tsid < 0)
 				error(Enoenv);
 			else
@@ -457,6 +439,7 @@
 			error(Eperm);
 		break;
 
+	case Ttdevauxqid:
 	case Ttdevctlqid:
 		break;
 	}
@@ -475,14 +458,14 @@
 
 	const int tsid = TTTSID(c->qid);
 	switch (TTYPE(c->qid)) {
-	case Ttdevdataqid:
 	case Ttdevcpudataqid:
-		// Release timestamp
+		/* Release timestamp */
 		if (tsid) {
 			ttdevtimestamp[tsid] = -1;
-			put_u16(ttdevtspool, tsid);
+			put_tsid(tsid);
 		}
 		break;
+	case Ttdevauxqid:
 	case Ttdevctlqid:
 	case Ttdevdirqid:
 		break;
@@ -501,14 +484,15 @@
 		return devdirread(c, va, n, NULL, 0, ttdevgen);
 	case Ttdevctlqid:
 		return ttdevread_ctl(va, n, offset);
-	case Ttdevdataqid:
-		return ttdevread_data(tsid, va, n, offset);
+	case Ttdevauxqid:
+		return ttdevread_aux(va, n, offset);
 	case Ttdevcpudataqid:
 		return ttdevread_cpu(tsid, TTCPU(c->qid), va, n, offset);
 	}
 	return 0; // Not us
 }
 
+#define CONST_STRNCMP(vp, conststr) strncmp((vp), conststr, sizeof(conststr)-1)
 static long ttdevwrite(struct chan *c, void *a, long n, int64_t unused_off)
 {
 	ERRSTACK(1);
@@ -516,7 +500,7 @@
 	struct cmdbuf *cb;
 	static const char ctlstring[]
 		= "setmask <value> <mask>|setbits <value>|clrbits <mask>";
-	static const char tsstring[] = "setts <minimum timestamp>";
+	static const char tsstring[] = "settimestamp <minimum timestamp>";
 
 	cb = parsecmd(a, n);
 	if (waserror()) {
@@ -545,10 +529,10 @@
 		else
 			error(ctlstring);
 
-		// Thread safe, but... lets face it if we have competing controllers
-		// setting anc clearing mask bits then the behaviour is going to be
-		// unexpected. Perhaps I should enforce exclusive open of the ctl
-		// channel.
+		/* Thread safe, but... lets face it if we have competing controllers
+		 * setting and clearing mask bits then the behaviour is going to be
+		 * unexpected. Perhaps we could enforce exclusive open of the ctl
+		 * channel. */
 		{
 			uint64_t cur_mask, new_mask;
 			do {
@@ -560,8 +544,7 @@
 		break;
 
 	case Ttdevcpudataqid:
-	case Ttdevdataqid:
-		if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "setts")) {
+		if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "settimestamp")) {
 			if (!tsid) error(Ebadfd);
 
 			const char *endptr = NULL;
diff --git a/kern/include/ros/ttrace.h b/kern/include/ros/ttrace.h
index a1fc87f..f4191ce 100644
--- a/kern/include/ros/ttrace.h
+++ b/kern/include/ros/ttrace.h
@@ -3,73 +3,146 @@
  * See LICENSE for details.
  *
  * Timed tracing.
- *
- * This is the first cut at a TTRACE header. At the moment only sufficient type
- * information for the initial implementation of kprof_ttrace logging output
- * subsystem to be prototyped. That is this file is currently part of the
- * scaffolding necessary to bring up a comlicated facility and break it up into
- * reasonably sized changes.
- */
+ * TODO(gvdl): Documentation goes here. */
 #ifndef INC_ROS_TTRACE_H
 #define INC_ROS_TTRACE_H
 
-#ifndef CONFIG_TTRACE
-
-// If the TTRACE is not configured then null out the initter
-#define ttrace_init()    do { } while(false)
-#define ttrace_cleanup() do { } while(false)
-
-#else /* CONFIG_TTRACE */
-
-extern uint64_t ttrace_type_mask;
+#define TTRACE_DEV        "#T"
+#define TTRACE_DEV_CTL    TTRACE_DEV "/ctl"
+#define TTRACE_DEV_AUX    TTRACE_DEV "/aux"
+#define TTRACE_DEV_CPUFMT TTRACE_DEV "/cpu%03d"
 
 enum {
+	TTRACE_ENTRY_MASK      = 0x0000000000000003ULL,  /* two spare bits */
+	/* Exact meaning varies depends on TYPE */
+	TTRACE_ENTRY_START     = 0x0000000000000000ULL,  /* Start/unblock */
+	TTRACE_ENTRY_STOP      = 0x0000000000000001ULL,  /* Stop/block */
+	TTRACE_ENTRY_ENTRY     = 0x0000000000000002ULL,  /* Entry */
+	TTRACE_ENTRY_EXIT      = 0x0000000000000003ULL,  /* Exit */
+
+	/* Tag mask for the ctx union */
+	TTRACE_CTX_MASK        = 0x0000000000000070ULL,  /* one spare bit */
+	TTRACE_CTX_KTASK       = 0x0000000000000000ULL,  /* ctx.kthread */
+	TTRACE_CTX_RKMSG       = 0x0000000000000010ULL,  /* ctx.kthread */
+	TTRACE_CTX_PIDSYSC     = 0x0000000000000020ULL,  /* ctx.pid_syscall */
+	TTRACE_CTX_PIDTRAP     = 0x0000000000000030ULL,  /* ctx.pid_syscall */
+	TTRACE_CTX_IRQ         = 0x0000000000000040ULL,  /* ctx.ctx_depth */
+	TTRACE_CTX_TRAP        = 0x0000000000000050ULL,  /* ctx.ctx_depth */
+
+	/* 12 bits to encode cpu_id */
+	TTRACE_TYPE_PCPU_MASK  = 0x00000000000fff00ULL,  /* Not set by kernel */
+
 	/* Block of scheduling tracking trace type bits */
 	TTRACE_TYPE_MASK       = 0xfffffffffff00000ULL,
+	TTRACE_TYPE_SCHED      = 0x0000000000100000ULL,
+	TTRACE_TYPE_SEMA       = 0x0000000000200000ULL,
+	TTRACE_TYPE_RWLOCK     = 0x0000000000400000ULL,
+	TTRACE_TYPE_SYSCALL    = 0x0000000000800000ULL,
+	TTRACE_TYPE_INTR       = 0x0000000001000000ULL,
+	TTRACE_TYPE_PROCESS    = 0x0000000002000000ULL,
 
-	/* ttrace/data tags and misc */
-	TTRACEH_V1             = 0x0100,  /* Version 1.0 ttrace/data format */
-	TTRACEE_V1             = 0x0090,  /* Version 0.9 ttrace/cpu* scaffold */
-	TTRACEH_TAG_CONT       = 0x80,    /* Continuation record top bit set */
-	TTRACEH_TAG_INFO       = 0x01,    /* Data file info version */
-	TTRACEH_TAG_SYSC       = 0x02,    /* Syscall entry */
-	TTRACEH_TAG_PROC       = 0x03,    /* Proc name */
-	TTRACEH_TAG_KTASK      = 0x04,    /* Ktask name */
-	TTRACEH_TAG_SEM        = 0x05,    /* Semaphore name */
-	TTRACEH_TAG_CV         = 0x06,    /* Condition name */
-	TTRACEH_TAG_QLOCK      = 0x07,    /* Qlock name */
+	/* ttrace_version */
+	TTRACEVH_MAGIC         = 0x0001020304050607ULL,
+	TTRACEVH_CIGAM         = 0x0706050403020100ULL,
+	TTRACEVH_H_V1_0        = 0x00010000,  /* Version 1.0 ttrace/data format */
+	TTRACEVH_S_V1_0        = 0x00010000,  /* Version 1.0 ttrace/data format */
+	TTRACEVH_C_V1_0        = 0x00010000,  /* Version 1.0 ttrace/cpu* format */
 
-	/* ttrace/data header lengths */
-	TTRACEH_CONT_LEN = 2 + 2, /* len, tag */
-	TTRACEH_LEN = TTRACEH_CONT_LEN + 16, /* len + tag + timestamp */
-	TTRACEH_NAME_LEN = TTRACEH_LEN + 16, /* header len + id len */
-};
+	/* ttrace/shared data tags */
+	TTRACES_TAG_CONT       = 0x80000000,  /* Tag continuation, top bit set */
+	TTRACES_TAG_INFO       = 0x00000001,  /* Data file specification */
+	TTRACES_TAG_SYSC       = 0x00000002,  /* Syscall entry */
+	TTRACES_TAG_PROC       = 0x00000003,  /* Proc name */
+	TTRACES_TAG_KTASK      = 0x00000004,  /* Ktask name */
+	TTRACES_TAG_SEM        = 0x00000005,  /* Semaphore name */
+	TTRACES_TAG_CV         = 0x00000006,  /* Condition name */
+	TTRACES_TAG_QLOCK      = 0x00000007,  /* Qlock name */
+};
 
-// Timed trace data version 1 record format,
-//   All lines are nl terminated.
-//   where h[n]-> n hex digits, s[n] -> n byte string
-//
-//   Header:
-//   h[2]: len  Max 127(0xff)
-//   h[2]: tag
-//   h[16]: timestamp
-//
-//   Info record:
-//   h[20]: header, timestamp is always 1 tag: TTRACEH_TAG_INFO
-//   h[4]: summary data version
-//   h[4]: cpu data version
-//   h[4]: num cpus
-//
-//   Name record:
-//   h[20]: header
-//   h[16]: id;  uintptr_t, such as kthread, pid or syscall number
-//   s[hdr.len-36]: string (not nul terminated)
-//
-//   Name continuation record(no timestamp), always follows a Name record:
-//   h[2]: len
-//   h[2]: tag (top bit set)
-//   s[hdr.len-4]: string (not nul terminated)
-//
+// Binary information is always in host endianness, the reader makes right
+// assert(((struct ttrace_version *) <hdrp>)->magic == TTRACEVH_MAGIC);
+// TODO: Endianness correcting inlines when required
 
-#endif /* CONFIG_TTRACE */
+struct ttrace_version {
+	uint64_t magic; // Endianness of ttrace binary data, reader makes right
+	uint32_t header_version; // Version of this header
+	uint32_t shared_data_version; // Version of shared data
+	uint32_t percpu_data_version; // Version of percpu entries
+	uint32_t num_cpus;
+	ptrdiff_t buffer_mask; // Mask for offsets into shared buffer
+	ptrdiff_t first_offset; // First valid data in auxiliary buffer
+	ptrdiff_t last_offset; // Last data entry in auxiliary buffer
+} __attribute__ ((packed));
+
+struct ttrace_type {
+	/* 0x00 */ uint64_t type_id;
+	/* 0x08 */ union {
+		uintptr_t kthread;
+		uintptr_t pid_syscall;
+		uint32_t ctx_depth;
+	} ctx;
+} __attribute__ ((packed));
+
+/* A ttrace entry must be 64 bytes (a power-of-two count of cache lines) */
+struct ttrace_entry {
+	/* 0x00 */ uint64_t timestamp;
+	/* 0x08 */ struct ttrace_type t;
+	/* 0x18 */ uintptr_t data0;
+	/* 0x20 */ uintptr_t data1;
+	/* 0x28 */ uintptr_t data2;
+	/* 0x30 */ uintptr_t data3;
+	/* 0x38 */ uintptr_t data4;
+} __attribute__ ((packed));
+
+/* Continuations have the same timestamp with 0x1 bit set */
+struct ttrace_entry_continuation {
+	/* 0x00 */ uint64_t timestamp;
+	/* 0x08 */ uintptr_t data0;
+	/* 0x10 */ uintptr_t data1;
+	/* 0x18 */ uintptr_t data2;
+	/* 0x20 */ uintptr_t data3;
+	/* 0x28 */ uintptr_t data4;
+	/* 0x30 */ uintptr_t data5;
+	/* 0x38 */ uintptr_t data6;
+} __attribute__ ((packed));
+
+// Lengths are in bytes, but records are laid down on 64-bit boundaries; the
+// actual in-memory length is ttrace_aux_align(TTRACE_AUX_LEN(path)).
+struct ttrace_aux_entry {
+	uint64_t path;     // See path decode macros below
+	uint64_t timestamp;
+	// Payload
+	uintptr_t ident;  // Optional: Identifier for record, eg. syscall number
+	uint8_t string[]; // String is not nul terminated
+} __attribute__ ((packed));
+
+#define TTRACE_AUX_PATH(tag, coreid, len, flags) ({							\
+	const uint64_t t = (uint32_t) tag;										\
+	const uint64_t c = (uint16_t) coreid;									\
+	const uint64_t paylen = (uint16_t) len - 16;							\
+	const uint64_t f = flags & 0xf;											\
+	(t << 32) | c << 16 | paylen << 4 | f;									\
+})
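+/* Worked example (hypothetical values): a TTRACES_TAG_PROC (0x3) record from
+ * core 2 that is 24 bytes long (16 byte header + 8 byte payload), no flags:
+ * TTRACE_AUX_PATH(3, 2, 24, 0) == (3ULL << 32) | (2 << 16) | (8 << 4) | 0
+ * and TTRACE_AUX_LEN() then recovers 8 + 16 == 24. */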
+#define TTRACE_AUX_TAG(path)    ((path >> 32))
+#define TTRACE_AUX_CPUID(path)  ((path >> 16) & 0xffff)
+#define TTRACE_AUX_LEN(path)   (((path >>  4) &  0xfff) + 16)
+#define TTRACE_AUX_FLAGS(path)   (path & 0xf)
+#define TTRACE_AUX_ENTRY_MAXLEN ((size_t) 4096)
+#define TTRACE_AUX_ENTRY_MAXPAYLEN (TTRACE_AUX_ENTRY_MAXLEN  - 16)
+#define TTRACE_AUX_ENTRY_MAXSTRLEN \
+	(TTRACE_AUX_ENTRY_MAXLEN - sizeof(struct ttrace_aux_entry))
+
+static inline size_t ttrace_aux_align(ptrdiff_t offset)
+{
+	const size_t align_mask = sizeof(uint64_t) - 1;  // 8 byte alignment
+	return (offset + align_mask) & ~align_mask;  // Round up to 8 bytes
+}
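+/* E.g. ttrace_aux_align(17) == 24 and ttrace_aux_align(24) == 24; offsets
+ * always advance in whole 64-bit units. */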
+
+static inline struct ttrace_aux_entry *ttrace_aux_entry_next(
+	   const struct ttrace_aux_entry * const entry)
+{
+	const size_t len = ttrace_aux_align(TTRACE_AUX_LEN(entry->path));
+	return (struct ttrace_aux_entry *) &((uint8_t*) entry)[len];
+}
+
 #endif /* INC_ROS_TTRACE_H */
diff --git a/kern/include/trace.h b/kern/include/trace.h
index 611eb11..b854f04 100644
--- a/kern/include/trace.h
+++ b/kern/include/trace.h
@@ -28,10 +28,11 @@
  * 		}
  * 		trace_ring_foreach(my_trace_ring_ptr, trace_handler, optional_blob);
  *
- * Rings can be racy or not, and can overwrite entries or not.  If you are not
- * overwriting, the ring will stop giving you slots.  You need to reset the ring
- * to get fresh slots again.  If you are overwriting, you don't need to check
- * the return value of get_trace_slot_overwrite().
+ * Rings can be smp safe, per cpu or racy, and can overwrite entries or not.
+ * If you are not overwriting, the ring will stop giving you slots.  You need
+ * to reset the ring to get fresh slots again.  If you are overwriting, you
+ * don't need to check the return value of get_trace_slot_overwrite().  Per cpu
+ * rings are interrupt safe.
  *
  * Given there is overwrite, tr_next doesn't really tell us which ones were
  * used.  So your handler should check for a flag or something.  Timestamps
@@ -41,10 +42,11 @@
 #define ROS_INC_TRACE_H
 
 #include <ros/common.h>
+#include <arch/arch.h>
 #include <assert.h>
 
 struct trace_ring {
-	unsigned char				*tr_buf;
+	uint8_t						*tr_buf;
 	size_t						tr_buf_sz;
 	unsigned int				tr_event_sz_shift;
 	unsigned int				tr_max;
@@ -85,6 +87,18 @@
 	return __get_tr_slot(tr, slot);
 }
 
+/* Interrupt safe modification of tr_next */
+static inline unsigned int
+__get_next_percpu_and_add(struct trace_ring *tr, int8_t n)
+{
+	int8_t irq_state = 0;
+	disable_irqsave(&irq_state);
+	unsigned int ret = tr->tr_next;
+	tr->tr_next += n;
+	enable_irqsave(&irq_state);
+	return ret;
+}
+
 static inline void *get_trace_slot(struct trace_ring *tr)
 {
 	/* Using syncs, instead of atomics, since we access tr_next as both atomic
@@ -104,6 +118,22 @@
 	return __get_tr_slot_overwrite(tr, __sync_fetch_and_add(&tr->tr_next, 1));
 }
 
+static inline void *get_trace_slot_percpu(struct trace_ring *tr)
+{
+	unsigned long my_slot = __get_next_percpu_and_add(tr, 1);
+	if (my_slot >= tr->tr_max) {
+		/* See comment in get_trace_slot. */
+		__get_next_percpu_and_add(tr, -1);
+		return 0;
+	}
+	return __get_tr_slot(tr, my_slot);
+}
+
+static inline void *get_trace_slot_overwrite_percpu(struct trace_ring *tr)
+{
+	return __get_tr_slot_overwrite(tr, __get_next_percpu_and_add(tr, 1));
+}
+
 static inline void *get_trace_slot_racy(struct trace_ring *tr)
 {
 	unsigned long my_slot = tr->tr_next;
diff --git a/kern/include/ttrace.h b/kern/include/ttrace.h
new file mode 100644
index 0000000..4b38c58
--- /dev/null
+++ b/kern/include/ttrace.h
@@ -0,0 +1,132 @@
+/* TODO(gvdl): Who holds the copyright?
+ * Godfrey van der Linden <gvdl@google.com>
+ * See LICENSE for details.
+ *
+ * Timed tracing.
+ * Kernel header
+ * TODO(gvdl): Documentation goes here. */
+#ifndef INC_TTRACE_H
+#define INC_TTRACE_H
+
+#define TTRACE_NULL_STATEMENT do { } while(0)
+
+#ifndef CONFIG_TTRACE
+
+// If TTRACE is not configured then null out the initters
+#define ttrace_init()    TTRACE_NULL_STATEMENT
+#define ttrace_cleanup() TTRACE_NULL_STATEMENT
+
+#else /* CONFIG_TTRACE */
+
+#include <atomic.h>
+#include <ros/ttrace.h>
+
+// Global data
+
+// Active tracepoint mask, see TTRACE_TYPE... enums in ros/ttrace.h
+extern uint64_t ttrace_type_mask;
+
+// Two phase initialisation. Call ttrace_init() as soon as page memory
+// allocation is possible so that we can trace very early in system boot.
+// However, that early in boot we do not yet know how many cores we will
+// manage; ttrace_cleanup() is called once we know and returns the memory
+// previously allocated for unused cores.
+extern void ttrace_init(void);
+extern void ttrace_cleanup(void);
+
+// devttrace accessor routines
+struct trace_ring* get_ttrace_ring_for_core(uint32_t coreid);
+void fill_ttrace_version(struct ttrace_version *versp);
+const uint8_t *get_ttrace_aux_buffer(void);
+void get_ttrace_aux_buffer_snapshot(ptrdiff_t *firstp, ptrdiff_t *lastp);
+
+// Low level data collection apis, do not use directly but rather use the macro
+// wrappers, such as TTRACE_PROC_ALLOC() below.
+extern uint64_t _ttrace_point(uint64_t type_start_stop,
+							  uintptr_t d0, uintptr_t d1, uintptr_t d2,
+							  uintptr_t d3, uintptr_t d4);
+extern void _ttrace_point_cont(uint64_t timestamp, uintptr_t d0, uintptr_t d1,
+							   uintptr_t d2, uintptr_t d3, uintptr_t d4,
+							   uintptr_t d5, uintptr_t d6);
+// Slow: takes the rwlock and updates the auxiliary buffer; call this rarely
+extern void _ttrace_point_string(uint8_t tag, uintptr_t ident, const char *str);
+
+static inline uintptr_t __attribute__((always_inline))
+take_tracepoint(uintptr_t type_start_stop)
+{
+	uintptr_t cur_mask = atomic_read((atomic_t *) &ttrace_type_mask);
+	return cur_mask & type_start_stop & TTRACE_TYPE_MASK;
+}
+
+static inline uint64_t _ttrace_point5(uint64_t type_start_stop,
+									  uintptr_t d0, uintptr_t d1, uintptr_t d2,
+									  uintptr_t d3, uintptr_t d4)
+{
+	if (take_tracepoint(type_start_stop))
+		return _ttrace_point(type_start_stop, d0, d1, d2, d3, d4);
+	return 0;
+}
+#define TTRACE_POINT5(type, start_stop, data...) do {						\
+	static_assert(type & TTRACE_TYPE_MASK);									\
+	_ttrace_point5(type | start_stop, data);								\
+} while(false)
+
+static inline uint64_t _ttrace_point12(uint64_t type_start_stop,
+									   uintptr_t d0, uintptr_t d1, uintptr_t d2,
+									   uintptr_t d3, uintptr_t d4, uintptr_t d5,
+									   uintptr_t d6, uintptr_t d7, uintptr_t d8,
+									   uintptr_t d9, uintptr_t da, uintptr_t db)
+{
+	if (take_tracepoint(type_start_stop)) {
+		const uint64_t tsc = _ttrace_point(type_start_stop, d0, d1, d2, d3, d4);
+		_ttrace_point_cont(tsc, d5, d6, d7, d8, d9, da, db);
+		return tsc;
+	} else
+		return 0;
+}
+#define TTRACE_POINT12(type, start_stop, data...) do {						\
+	static_assert(type & TTRACE_TYPE_MASK);									\
+	_ttrace_point12(type | start_stop, data);								\
+} while(false)
+
+static inline void _ttrace_string(uint64_t type, uint8_t tag,
+								  uintptr_t ident, const char* str)
+{
+	if (take_tracepoint(type))
+		_ttrace_point_string(tag, ident, str);
+}
+// string len < TTRACE_AUX_ENTRY_MAXSTRLEN
+#define TTRACE_STRING(type, tag, ident, str) do {							\
+	static_assert(type & TTRACE_TYPE_MASK);									\
+	static_assert(tag & TTRACEH_TAG_MASK);									\
+	_ttrace_string(type, tag, ident, str);									\
+} while(false)
+
+//
+// CONFIG_TTRACE macros
+//
+// These macros are the data collection side of ttrace. Each CONFIG_TTRACE type
+// has its own group of macros which are empty if the group is not configured.
+
+// CONFIG_TTRACE_PROCESS trace point macros.
+#ifdef CONFIG_TTRACE_PROCESS
+#define TTRACE_PROC_ALLOC(pid, ppid) TTRACE_POINT5(							\
+	TTRACE_TYPE_PROCESS, TTRACE_ENTRY_START, pid, ppid, 0, 0, 0)
+#define TTRACE_PROC_FREE(pid, cpid) TTRACE_POINT5(							\
+	TTRACE_TYPE_PROCESS, TTRACE_ENTRY_STOP, pid, cpid, 0, 0, 0)
+#define TTRACE_PROC_READY(pid) TTRACE_POINT5(								\
+	TTRACE_TYPE_PROCESS, TTRACE_ENTRY_ENTRY, pid, 0, 0, 0, 0)
+#define TTRACE_PROC_SETSTATE(pid, state, curstate) TTRACE_POINT5(			\
+	TTRACE_TYPE_PROCESS, TTRACE_ENTRY_EXIT, pid, state, curstate, 0, 0)
+#define TTRACE_PROC_SETNAME(pid, name)										\
+	TTRACE_STRING(TTRACE_TYPE_PROCESS, TTRACES_TAG_PROC, pid, name)
+#else /* !CONFIG_TTRACE_PROCESS */
+#define TTRACE_PROC_ALLOC(pid, ppid)               TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_FREE(pid, cpid)                TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_READY(pid)                     TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_SETSTATE(pid, state, curstate) TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_SETNAME(pid, name)             TTRACE_NULL_STATEMENT
+#endif /* CONFIG_TTRACE_PROCESS */
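+/* Usage sketch (hypothetical call sites in the proc lifecycle code):
+ *
+ *   TTRACE_PROC_ALLOC(p->pid, parent_pid);
+ *   TTRACE_PROC_SETNAME(p->pid, p->progname);
+ *
+ * Each expands to an empty statement unless CONFIG_TTRACE_PROCESS is set. */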
+
+#endif /* !CONFIG_TTRACE */
+#endif /* INC_TTRACE_H */
diff --git a/kern/src/Kbuild b/kern/src/Kbuild
index e62fdce..4cce4b4 100644
--- a/kern/src/Kbuild
+++ b/kern/src/Kbuild
@@ -52,6 +52,7 @@
 obj-y						+= taskqueue.o
 obj-y						+= time.o
 obj-y						+= trace.o
+obj-$(CONFIG_TTRACE)        += ttrace.o
 obj-y						+= trap.o
 obj-y						+= ucq.o
 obj-y						+= umem.o
diff --git a/kern/src/init.c b/kern/src/init.c
index be24026..ff49a5a 100644
--- a/kern/src/init.c
+++ b/kern/src/init.c
@@ -43,6 +43,7 @@
 #include <ip.h>
 #include <acpi.h>
 #include <coreboot_tables.h>
+#include <ros/ttrace.h>
 
 int booting = 1;
 struct sysinfo_t sysinfo;
@@ -65,6 +66,7 @@
 
 	cache_init();					// Determine systems's cache properties
 	pmem_init(multiboot_kaddr);
+	ttrace_init();                  // Setup timed tracer, depends on pmem
 	kmem_cache_init();              // Sets up slab allocator
 	kmalloc_init();
 	hashtable_init();
@@ -85,6 +87,7 @@
 	kb_buf_init(&cons_buf);
 	arch_init();
 	block_init();
+	ttrace_cleanup(); // Must be called after arch_init()
 	enable_irq();
 	run_linker_funcs();
 	/* reset/init devtab after linker funcs 3 and 4.  these run NIC and medium
diff --git a/kern/src/ttrace.c b/kern/src/ttrace.c
new file mode 100644
index 0000000..9cfa60a
--- /dev/null
+++ b/kern/src/ttrace.c
@@ -0,0 +1,402 @@
+/* TODO(gvdl): Who holds the copyright?
+ * Godfrey van der Linden <gvdl@google.com>
+ * See LICENSE for details.
+ *
+ * Timed tracing.
+ *
+ * This file is only included if CONFIG_TTRACE=y
+ * TODO(gvdl): Documentation goes here. */
+
+#include <ttrace.h>
+
+#include <arch/arch.h>
+#include <atomic.h>
+#include <env.h>
+#include <page_alloc.h>
+#include <rwlock.h>
+#include <smp.h>
+#include <syscall.h>
+#include <trace.h>
+#include <trap.h>
+
+#ifndef min
+#define min(a, b) ({ \
+	typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
+#endif
+
+// Shared buffer must be a power of 2, static_asserted in ttrace_init.
+#define TTRACE_AUX_BUFFER_MB   ((size_t) 1)
+#define TTRACE_AUX_BUFFER_SZ   (TTRACE_AUX_BUFFER_MB * 1024 * 1024)
+#define TTRACE_AUX_BUFFER_MASK (TTRACE_AUX_BUFFER_SZ - 1)
+
+// TODO(gvdl): Need to make this a config parameter
+// TODO(gvdl): Implement dynamic buffer resize, virtually contig pages
+#define TTRACE_PER_CPU_BUFFER_MB ((size_t) 1) // 16K ttrace entries per cpu
+
+#define TTRACE_PER_CPU_BUFFER_SZ (TTRACE_PER_CPU_BUFFER_MB * 1024 * 1024)
+#define PER_CPU_ORDER ({													\
+	const uint64_t per_cpu_pages = TTRACE_PER_CPU_BUFFER_SZ / PGSIZE;		\
+	(size_t) LOG2_UP(per_cpu_pages);										\
+})
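+/* E.g. 1MB per-cpu buffers with 4KB pages: 256 pages, so PER_CPU_ORDER is 8
+ * and each ring holds 16K 64-byte struct ttrace_entry slots. */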
+
+// Global data
+uint64_t ttrace_type_mask;
+
+// Local data
+static rwlock_t ttrace_aux_rwlock;
+static ptrdiff_t ttrace_aux_first, ttrace_aux_last; // Offsets into aux
+static uint8_t ttrace_aux_buffer[TTRACE_AUX_BUFFER_SZ];
+static struct trace_ring ttrace_ring[MAX_NUM_CPUS];
+
+//
+// TTRACE implementation
+//
+void ttrace_init(void)
+{
+	// Single tracepoint entries should be atomically written.
+	static_assert(sizeof(struct ttrace_entry) == ARCH_CL_SIZE);
+	static_assert(
+	   sizeof(struct ttrace_entry_continuation) == sizeof(struct ttrace_entry));
+	static_assert(sizeof(ttrace_type_mask) == sizeof(atomic_t));
+	static_assert(IS_PWR2(TTRACE_AUX_BUFFER_MB));
+
+	atomic_init((atomic_t *) &ttrace_type_mask, 0);
+	rwinit(&ttrace_aux_rwlock);
+
+	/* ttrace_init() is called as soon as memory is initialised so that we can
+	 * profile the early boot sequence if we wish.  But that means it is called
+	 * before arch_init and until arch_init is called num_cpus is invalid. So
+	 * we allocate as much memory as we'll ever need here and will give the
+	 * memory back in ttrace_cleanup() which is called much later in the init
+	 * process. */
+	const size_t per_cpu_order = PER_CPU_ORDER;
+	const size_t per_cpu_bytes = (1 << per_cpu_order) * PGSIZE;
+
+	for (int i = 0; i < MAX_NUM_CPUS; i++) {
+		/* TODO(gvdl): Only need virtually contiguous pages! */
+		trace_ring_init(&ttrace_ring[i], get_cont_pages(per_cpu_order, 0),
+						TTRACE_PER_CPU_BUFFER_SZ, sizeof(struct ttrace_entry));
+	}
+
+	// Quickly record all of the system calls, known at compile time
+	for (size_t i = 0; i < max_syscall; i++) {
+		const char * const name = syscall_table[i].name;
+		if (name)
+			_ttrace_point_string(TTRACES_TAG_SYSC, i, name);
+	}
+}
+
+void ttrace_cleanup(void)
+{
+	/* Give back un-used trace_rings, assumes core_id is contiguous from 0 */
+	const size_t per_cpu_order = PER_CPU_ORDER;
+	for (int i = num_cpus; i < MAX_NUM_CPUS; i++) {
+		free_cont_pages(ttrace_ring[i].tr_buf, per_cpu_order);
+		memset(&ttrace_ring[i], 0, sizeof(ttrace_ring[i]));
+	}
+}
+
+//
+// devttrace interface
+//
+struct trace_ring* get_ttrace_ring_for_core(uint32_t coreid)
+{
+	return coreid < num_cpus ? &ttrace_ring[coreid] : NULL;
+}
+
+void fill_ttrace_version(struct ttrace_version *versp)
+{
+	versp->magic = TTRACEVH_MAGIC;
+	versp->header_version = TTRACEVH_H_V1_0;
+	versp->shared_data_version = TTRACEVH_S_V1_0;
+	versp->percpu_data_version = TTRACEVH_C_V1_0;
+	versp->num_cpus = num_cpus;
+	versp->buffer_mask = TTRACE_AUX_BUFFER_MASK;
+	versp->first_offset = versp->last_offset = (ptrdiff_t) -1;
+}
+
+const uint8_t *get_ttrace_aux_buffer(void)
+{
+	return ttrace_aux_buffer;
+}
+
+void get_ttrace_aux_buffer_snapshot(ptrdiff_t *firstp, ptrdiff_t *lastp)
+{
+	rlock(&ttrace_aux_rwlock);
+	*firstp = ttrace_aux_first;
+	*lastp = ttrace_aux_last;
+	runlock(&ttrace_aux_rwlock);
+}
+
+//
+// ttrace data collection
+//
+
+// From Barret
+// An arbitrary kernel thread, running in the kernel, that might block in
+// sem_down can have a few sources:
+//   1) Process issued syscall, when it was submitted locally (trap/sysenter)
+//      and before it blocks. Then owning_proc, cur_proc, owning_vcoreid,
+//      kthread->sysc are accurate.
+//   2) Process issued syscall, either remotely or after it blocked and
+//      restarted at least once.  cur_proc and kthread->sysc are accurate.
+//   3) Process issued trap (SCP page faults on files are handled by the kernel
+//      with blocking kthreads.  There may be others.)  cur_proc is accurate.
+//      kthread->sysc should be 0. Might be fault into in cur_ctx.
+//   4) Kernel task, aka ktask.  It's kernel thread not in a user's context,
+//      has a name. No cur_proc or anything like that. There is kthread->name.
+//   5) Unnamed routine kernel messages.  Like a ktask, there is no user
+//      component. They have no names. Can use kmsg function pointer.
+//
+// For the syscalls (case 1 and 2), it is tempting to track the vcoreid, but it
+// can be wrong in case 2.  I think cur_proc->pid and kthread->sysc->num will
+// work.
+//
+// Case 3 is like case 1 and 2, but there is no kthread->sysc. Right now, it's
+// probably not worth it, but you can peek in cur_ctx and extract the trap
+// number.  That might be arch-dependent and nasty, so probably not worth it.
+// the only one that happens now is a PF.
+//
+// Case 4 and 5 are basically the same.  5 just has a blank name.
+// Incidentally, we can try to attach names to all RKMs.  It'll require some
+// work when it comes to how kthread->name is managed.  Might not be worth it,
+// just for this, unless we have a lot of unnamed RKMs gumming up the works.
+
+// TODO(gvdl): This function is a bit heavy weight for a fast path. It will be
+// better to add ttrace_ctx to per_cpu_info and set it when we change contexts
+// which should be way less frequently than computing it for every record.
+static inline void _ttrace_set_type(struct ttrace_type *tt, uint64_t type,
+									uint32_t coreid)
+{
+	const struct per_cpu_info *pcpui = &per_cpu_info[coreid];
+	const struct kthread * const kt = pcpui->cur_kthread;
+	const struct proc * const cp = pcpui->cur_proc;
+
+	tt->type_id = type & ~TTRACE_CTX_MASK;
+#if 0
+	/* TODO(gvdl) Investigate TRAP and IRQ contexts */
+	if (pcpui->__ctx_depth) {
+		tt->type_id |= TTRACE_CTX_IRQTRAP;
+		tt->ctx.ctx_depth = pcpui->__ctx_depth;
+	}
+#endif // 0
+	if (!cp) {
+		// case 4-5 Kernel task or routine KMSG
+		tt->ctx.kthread = (uintptr_t) kt;
+		tt->type_id |= (kt->name)? TTRACE_CTX_KTASK : TTRACE_CTX_RKMSG;
+	} else if (kt->sysc) { // case 1-2.
+		// TODO(gvdl) I'd like to log vcore if possible
+		tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32 | kt->sysc->num;
+		tt->type_id |= TTRACE_CTX_PIDSYSC;
+	} else { // case 3
+		// TODO(gvdl) Trap id?
+		tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32;
+		tt->type_id |= TTRACE_CTX_PIDTRAP;
+	}
+}
+
+static inline struct ttrace_entry *_ttrace_next_line(uint32_t coreid)
+{
+	struct trace_ring *per_cpu_ttrace_ring = &ttrace_ring[coreid];
+	return get_trace_slot_overwrite_percpu(per_cpu_ttrace_ring);
+}
+
+static inline bool __attribute__((always_inline))
+_ttrace_on_fast_path(uint64_t type_start_stop)
+{
+	(void) type_start_stop; /* unused */
+	return true;  // No slow path dynamic checks yet, will be in future.
+}
+
+uint64_t _ttrace_point(uint64_t type_start_stop, uintptr_t d0, uintptr_t d1,
+					   uintptr_t d2, uintptr_t d3, uintptr_t d4)
+{
+	if (!_ttrace_on_fast_path(type_start_stop)) {
+		/* Slow path, will contain a list of more selective tests than the
+		 * simple TTRACE mask test in the header. */
+		assert(false);  // Unimplemented
+		return 0;
+	}
+
+	/* record tracepoint */
+	const uint32_t coreid = core_id();
+	struct ttrace_entry *cur_trace_entry = _ttrace_next_line(coreid);
+
+	/* Bottom bit of timestamp is used to indicate a continuation record */
+	const uint64_t tsc = read_tscp() & ~1ULL; // Clear bottom bit
+	cur_trace_entry->timestamp = -1ULL; // Invalidate entry
+	wmb();
+
+	_ttrace_set_type(&cur_trace_entry->t, type_start_stop, coreid);
+	cur_trace_entry->data0 = d0;
+	cur_trace_entry->data1 = d1;
+	cur_trace_entry->data2 = d2;
+	cur_trace_entry->data3 = d3;
+	cur_trace_entry->data4 = d4;
+	cur_trace_entry->timestamp = tsc; // Entry complete
+	wmb();
+
+	return tsc | 1;  /* Timestamp for any required continuation */
+}
+
+void _ttrace_point_cont(uint64_t timestamp,
+						uintptr_t d0, uintptr_t d1, uintptr_t d2, uintptr_t d3,
+						uintptr_t d4, uintptr_t d5, uintptr_t d6)
+{
+	dassert(timestamp & 1);
+	const uint32_t coreid = core_id();
+	struct ttrace_entry_continuation *cur_trace_cont
+		= (struct ttrace_entry_continuation *) _ttrace_next_line(coreid);
+
+	cur_trace_cont->timestamp = -1ULL; // Invalidate entry
+	wmb();
+
+	cur_trace_cont->data0 = d0;
+	cur_trace_cont->data1 = d1;
+	cur_trace_cont->data2 = d2;
+	cur_trace_cont->data3 = d3;
+	cur_trace_cont->data4 = d4;
+	cur_trace_cont->data5 = d5;
+	cur_trace_cont->data6 = d6;
+	cur_trace_cont->timestamp = timestamp; // Entry complete
+	wmb();
+}
+
+// Auxiliary buffer handling.
+// Offsets can be greater than the buffer's size
+static inline struct ttrace_aux_entry *ttrace_aux(ptrdiff_t offset)
+{
+	offset &= TTRACE_AUX_BUFFER_MASK;
+	return (struct ttrace_aux_entry *) (&ttrace_aux_buffer[offset]);
+}
+
+// Copy header concatenated with payload data into target, may wrap around the
+// ttrace_aux_buffer.
+static inline void ttrace_fill_aux(void *vtarget,
+								   const void *vheader,  const int header_len,
+								   const void *vpaydata, const int pay_len)
+{
+	const uint8_t * const header = (const uint8_t *) vheader;
+	const uint8_t * const paydata = (const uint8_t *) vpaydata;
+	ptrdiff_t target_offset = (uint8_t *) vtarget - ttrace_aux_buffer;
+	size_t offset = 0;
+	size_t remaining = header_len + pay_len;
+	do {
+		/* Copy from the header first, then the payload */
+		const uint8_t *scp;
+		size_t len;
+		if (offset < header_len) {
+			scp = &header[offset];
+			len = header_len - offset;
+		} else {
+			scp = &paydata[offset - header_len];
+			len = remaining;
+		}
+		/* Clamp each copy at the end of the buffer, then wrap */
+		const size_t buf_offset = target_offset & TTRACE_AUX_BUFFER_MASK;
+		const size_t buf_remaining = TTRACE_AUX_BUFFER_SZ - buf_offset;
+		len = min(len, buf_remaining);
+		memcpy(&ttrace_aux_buffer[buf_offset], scp, len);
+		target_offset += len;
+		offset += len;
+		remaining -= len;
+	} while (remaining > 0);
+}
+
+/* The ttrace_aux_buffer is copied out to user land as a large binary blob. As
+ * _ttrace_point_string is called from performance critical times, e.g.  proc
+ * creation, it is important that writing data does not contend on a long held
+ * reader lock.  Hence the complicated protocol between _ttrace_point_string
+ * and devttrace:ttdevread_aux. The goal is to minimize mutual exclusion time
+ * both between all writers and an occasional reader.
+ *
+ * aux write:
+ *   wlock
+ *     compute last entry
+ *     compute end of new last entry
+ *     advance first entry pointer past new last entry end
+ *     mark entry path, (tag, coreid, len, Entry INVALID)
+ *     wmb()  // Is this barrier necessary before an unlock?
+ *   wunlock
+ *   memcpy data to aux buffer at new entry location
+ *   wmb()
+ *   mark entry path, (tag, coreid, len, Entry VALID)
+ *
+ *  aux_read:
+ *    // Minimise time between offset copies by taking page faults now
+ *    touch every page in user space
+ *    rlock
+ *      copy last entry offset
+ *    runlock
+ *    memcpy auxiliary buffer
+ *    rlock
+ *      copy first entry offset
+ *    runlock
+ *  On return from aux_read the copied last and first offsets are valid, and
+ *  each entry can be checked by the client to determine whether it is stamped
+ *  VALID. Note that even for an INVALID entry the length is always valid.
+ */
+void _ttrace_point_string(uint8_t tag, uintptr_t ident, const char *str)
+{
+	int len = strnlen(str, TTRACE_AUX_ENTRY_MAXSTRLEN);
+	assert(!str[len]);
+	len += sizeof(struct ttrace_aux_entry);  // Length of new record
+	const uint64_t path = TTRACE_AUX_PATH(tag, core_id(), len, 0);
+	struct ttrace_aux_entry *last_aux = NULL;
+
+	wlock(&ttrace_aux_rwlock);
+	do {
+		dassert(ttrace_aux_last == ttrace_aux_align(ttrace_aux_last));
+		dassert(ttrace_aux_first == ttrace_aux_align(ttrace_aux_first));
+
+		const ptrdiff_t old_aux_last = ttrace_aux_last;
+		struct ttrace_aux_entry *old_last_aux = ttrace_aux(old_aux_last);
+
+		// Don't care about wrapping; ttrace_aux() masks out the high bits
+		ptrdiff_t new_aux_first = ttrace_aux_first;
+		ptrdiff_t new_aux_last
+			= old_aux_last + ttrace_aux_align(TTRACE_AUX_LEN(old_last_aux->path));
+		dassert(new_aux_last == ttrace_aux_align(new_aux_last));
+
+		const ptrdiff_t next_aux_last = new_aux_last + ttrace_aux_align(len);
+		struct ttrace_aux_entry *first_aux = ttrace_aux(new_aux_first);
+		while (new_aux_first + TTRACE_AUX_BUFFER_SZ < next_aux_last) {
+			const ptrdiff_t first_len
+				= ttrace_aux_align(TTRACE_AUX_LEN(first_aux->path));
+			new_aux_first += first_len;
+			dassert(new_aux_first == ttrace_aux_align(new_aux_first));
+			first_aux = ttrace_aux(new_aux_first);
+		}
+		/* Check first for wrap, in which case wrap both first and last */
+		if (new_aux_first >= TTRACE_AUX_BUFFER_SZ) {
+			new_aux_first -= TTRACE_AUX_BUFFER_SZ;
+			new_aux_last -= TTRACE_AUX_BUFFER_SZ;
+		}
+		dassert(new_aux_first < TTRACE_AUX_BUFFER_SZ);
+		dassert(new_aux_last < 2 * TTRACE_AUX_BUFFER_SZ);
+
+		ttrace_aux_last = new_aux_last;
+		ttrace_aux_first = new_aux_first;
+		last_aux = ttrace_aux(new_aux_last);
+		atomic_set((atomic_t *) last_aux, path | 1);  // mark entry as invalid
+		/* No barrier necessary before an unlock */
+	} while(false);
+	wunlock(&ttrace_aux_rwlock);
+
+	struct ttrace_aux_entry header = {
+		.path = path | 1,
+		.timestamp = read_tscp(),
+		.ident = ident,
+	};
+	ttrace_fill_aux(last_aux, &header, sizeof(header),
+					str, len - sizeof(header));
+
+	/* The reader is doing a memcpy, not a formal read acquire, but I don't
+	 * think that will be an issue. It just means that a record will not be
+	 * valid this time the buffer is read. Next time the buffer is read the
+	 * write will most probably be complete.  But the wmb is still important
+	 * here because we can't have the payload write and release write swapped.
+	 * Not that that would happen anyway given we have just called a
+	 * function. */
+	wmb(); // Write release, record is valid now.
+	atomic_set((atomic_t *) last_aux, path);
+}