TTRACE timed trace kernel profiler.
Initial ttrace development, checkpoint commit.
diff --git a/Kconfig b/Kconfig
index b37e18d..fb30c90 100644
--- a/Kconfig
+++ b/Kconfig
@@ -131,14 +131,6 @@
(meaning it'll clobber older events). Always enables scheduling timed
tracing.
-config TTRACE_SEMAPHORES
- bool "Timed Trace Semaphores"
- depends on TTRACE
- default n
- help
- Turn on timestamp based thread tracing recording a timestamp, a
- semaphore up/down event type thread_id(gvdl?) in the TTRACE buffer.
-
config TTRACE_CV
bool "Timed Trace Condition Vars"
depends on TTRACE
@@ -148,14 +140,30 @@
condition waits and signals and underlying semaphore up/down event
types thread_id(gvdl?) in the TTRACE buffer.
+config TTRACE_PROCESS
+ bool "Timed Trace Process spawn, exec and death"
+ depends on TTRACE
+ default n
+ help
+	  Turn on timestamp based process tracing recording the timestamp
+	  of interesting process related events such as proc creation,
+	  destruction and exec.
+
config TTRACE_RWLOCK
bool "Timed Trace Readers/Writers locking"
depends on TTRACE
default n
help
- Turn on timestamp based thread tracing recording the timestamp,
- reader/writer lock type, thread_id, number of readers waiting, number
- of writes waiting.
+ Turn on timestamp based tracing recording timestamp and other
+ data for reader/writer lock interactions.
+
+config TTRACE_SEMAPHORES
+ bool "Timed Trace Semaphores"
+ depends on TTRACE
+ default n
+ help
+ Turn on timestamp based thread tracing recording a timestamp, a
+ semaphore up/down event type thread_id(gvdl?) in the TTRACE buffer.
endmenu
diff --git a/kern/drivers/dev/devttrace.c b/kern/drivers/dev/devttrace.c
index 067282f..7ea62bd 100644
--- a/kern/drivers/dev/devttrace.c
+++ b/kern/drivers/dev/devttrace.c
@@ -7,9 +7,9 @@
* in the LICENSE file.
*/
-//
-// This file implements the #T device and was based upon the UCB Plan 9 kprof.c
-//
+/*
+ * This file implements the #T device and was based upon the UCB Plan 9 kprof.c
+ */
#include <assert.h>
#include <atomic.h>
@@ -19,14 +19,11 @@
#include <smp.h>
#include <stdio.h>
#include <string.h>
-#include <syscall.h>
+#include <trace.h>
+#include <ttrace.h>
#include <umem.h>
#include <ros/fs.h>
-#include <ros/ttrace.h>
-
-#warning Remove this scaffold when ttrace profiling is going
-uint64_t ttrace_type_mask;
/*
* ttrace macros and constant data
@@ -41,10 +38,10 @@
#define TTRACE_NUM_OPENERS 8
#define TT_SAFE_GENBUF_SZ (GENBUF_SZ-1) // Leave room for newline
-// TODO(gvdl): I don't get plan 9's permissions, why do directories get group
-// rx permissions, and what's with the DMDIR. Some devices use it and others
-// don't. In theory the DMDIR is copied over by a higher layer but I have no
-// idea why two copies seems necessary.
+/* TODO(gvdl): I don't get plan 9's permissions, why do directories get group
+ * rx permissions, and what's with the DMDIR. Some devices use it and others
+ * don't. In theory the DMDIR is copied over by a higher layer but I have no
+ * idea why two copies seems necessary. */
#define TTPERMDIR (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|DMDIR)
#define TTPERMRWFILE (S_IRUSR|S_IWUSR)
#define TTPERMROFILE (S_IRUSR)
@@ -53,7 +50,7 @@
Ttdevdirqid = 0,
Ttdevbase,
Ttdevctlqid = Ttdevbase, // 1
- Ttdevdataqid, // 2
+ Ttdevauxqid, // 2
Ttdevcpudataqid, // 3
Logtype = 4, // Enough for 16 unique qids types
@@ -64,8 +61,8 @@
Maskcpu = (1 << Logcpu) - 1,
Shiftcpu = Shifttype + Logtype,
- // ttrace timestamp id, used by data file readers
- Logtsid = 13 + 3, // 4096 ttrace/cpus + a ttrace/data file by 8 opened
+ /* ttrace timestamp id, used by data file readers */
+	Logtsid = 12 + 3,	// 4096 cpus by 8 simultaneous opens
Masktsid = (1 << Logtsid) - 1,
Shifttsid = Shiftcpu + Logcpu,
};
@@ -77,34 +74,42 @@
#define TTTSIDQID(q, i) ( ((i) << Shifttsid) | (q).path )
/*
- * ttrace static data
+ * ttrace timestamp pool and accessor
*/
static uintptr_t *ttdevtimestamp; // array of open file timestamps
-static struct u16_pool *ttdevtspool; // pool of timestamp indices
+static struct u16_pool *ttdevtspool; // pool of timestamp indices
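+/* tsid 0 is reserved in ttdevinit() as the unused/NULL sentinel; get_tsid()
+ * therefore never hands out 0 and put_tsid() must never be given it. */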
+static inline int get_tsid(void)
+{
+	return ttdevtspool ? get_u16(ttdevtspool) : -1;
+}
+
+static inline void put_tsid(int tsid)
+{
+	dassert(tsid >= 1 && ttdevtspool);
+	put_u16(ttdevtspool, tsid);
+}
-//
-// ttrace device gen implementation
-//
-// #T directory layout
-// [-1] {".", {Ttdevdirqid, 0, QTDIR}, 0, TTPERMDIR},
-// [0..ncpu-1] {"cpunnn", {Ttdevcpudataqid|core_id}, 0, TTPERMRWFILE},
-// [ncpu] {"ctl", {Ttdevctlqid}, TTRACE_CTL_LEN, TTPERMRWFILE},
-// [ncpu+1] {"data", {Ttdevdataqid}, 0, TTPERMRWFILE},
+/*
+ * ttrace device gen implementation
+ *
+ * #T directory layout
+ * [-1] {".", {Ttdevdirqid, 0, QTDIR}, 0, TTPERMDIR},
+ * [0..ncpu-1] {"cpunnn", {Ttdevcpudataqid|coreid}, 0, TTPERMRWFILE},
+ * [ncpu]      {"ctl", {Ttdevctlqid}, TTRACE_CTL_LEN, TTPERMRWFILE},
+ * [ncpu+1]    {"aux", {Ttdevauxqid}, 0, TTPERMROFILE},
+ */
-// Generate qids for the top level directory
+/* Generate qids for the top level directory */
static inline int ttdev1gen(const struct chan *c, int s, struct qid *qp)
{
int ret = 1;
int path = -1;
- // Must only be called to decode top level dir channel
+ /* Must only be called to decode top level dir channel */
dassert(TTYPE(c->qid) == Ttdevdirqid);
if (s < num_cpus) // "cpunnn" data files
path = TTCPUQID(s, Ttdevcpudataqid);
else {
switch (s - num_cpus) {
- case 0: path = Ttdevctlqid; break; // "ctl"
- case 1: path = Ttdevdataqid; break; // "data"
+ case 0: path = Ttdevctlqid; break; // "ctl"
+ case 1: path = Ttdevauxqid; break; // "aux"
default: return -1;
}
}
@@ -130,9 +135,10 @@
return -1;
const char *name = NULL;
+ long perm = TTPERMRWFILE;
switch (TTYPE(q)) {
- case Ttdevctlqid: name = "ctl"; break;
- case Ttdevdataqid: name = "data"; break;
+ case Ttdevctlqid: name = "ctl"; break;
+ case Ttdevauxqid: name = "aux"; perm = TTPERMROFILE; break;
case Ttdevcpudataqid:
snprintf(get_cur_genbuf(), GENBUF_SZ, "cpu%03d", TTCPU(q));
name = get_cur_genbuf();
@@ -144,7 +150,7 @@
panic("devttrace: What happened to ttdev1gen decode?\n");
}
dassert(name);
- devdir(c, q, (char *) name, 0, eve, TTPERMRWFILE, dp);
+ devdir(c, q, (char *) name, 0, eve, perm, dp);
return 1;
}
@@ -160,6 +166,7 @@
if (!current)
memcpy(&va[offset], buf, len);
else if (ESUCCESS != memcpy_to_user(current, &va[offset], buf, len)) {
+ /* UMEM */
// TODO(gvdl): No p9 equivalent to EFAULT, determine causes of failure.
error(Enovmem);
}
@@ -167,124 +174,72 @@
return len;
}
-static inline int ttdevputchar(char *buf, int len, char c)
+/* Context for trace_ring_foreach call of ttdevread_cpu_entry() */
+#define TTRACE_ENTRY_QUADS (sizeof(struct ttrace_entry) / sizeof(uint64_t))
+/* #quads * (whitespace + len(hex(quad))) */
+#define CTXT_GENBUF_SZ (TTRACE_ENTRY_QUADS * (1 + 2 * sizeof(uint64_t)))
+
+struct ttdevread_cpu_ctxt {
+ int64_t c;
+ uintptr_t min_timestamp;
+ char *va;
+ long n;
+ char genbuf[CTXT_GENBUF_SZ];
+};
+
+static inline int ttdevhexdigit(uint8_t x)
{
- dassert(len >= 1);
- *buf = c;
- return 1;
+ return "0123456789abcdef"[x];
}
-static inline int ttdevputhex8(char* buf, int len, uint8_t x)
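+/* Emit one ttrace_entry as text: each of the eight 64 bit quads is printed as
+ * 16 hex digits, quads are space separated and the line ends in a newline, so
+ * every entry occupies exactly CTXT_GENBUF_SZ bytes of output. */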
+static void ttdevread_cpu_entry(void *ventry, void *vctxt)
{
- static const char hex_digits[] = "0123456789abcdef";
- int c = 0;
- c += ttdevputchar(&buf[c], len-c, hex_digits[x >> 4]);
- c += ttdevputchar(&buf[c], len-c, hex_digits[x & 0xf]);
- return c;
-}
+ struct ttdevread_cpu_ctxt *ctxt = (struct ttdevread_cpu_ctxt *) vctxt;
+	/* A cache line aligned copy of the input entry should make partial
+	 * entries less likely. Still, a partial entry is bracketed with
+	 * timestamp == -1 */
+ uint8_t buf[2 * sizeof(struct ttrace_entry)]; // 128 byte buffer
+ const uintptr_t size_mask = sizeof(struct ttrace_entry) - 1;
+ struct ttrace_entry* entry = (struct ttrace_entry *)
+ (((uintptr_t) buf + size_mask) & ~size_mask); // align to cache line
+ *entry = *((struct ttrace_entry *) ventry); // Grab the entry
-static inline int ttdevputhex16(char *buf, int len, uint16_t x)
-{
- int c = ttdevputhex8(&buf[0], len, x >> 8);
- ttdevputhex8(&buf[c], len-c, (uint8_t) x);
- return sizeof(uint16_t) * 2;
-}
+ /* If time stamp == -1 (i.e. entry is a partial) or is less than
+ * the minimum then ignore this entry */
+ if (!(entry->timestamp + 1) || entry->timestamp < ctxt->min_timestamp)
+ return;
-static int ttdevputhex32(char *buf, int len, uint32_t x)
-{
- int c = ttdevputhex16(&buf[0], len, x >> 16);
- ttdevputhex16(&buf[c], len-c, (uint16_t) x);
- return sizeof(uint32_t) * 2;
-}
-
-static int ttdevputhex64(char *buf, int len, uint64_t x)
-{
- int c = ttdevputhex32(&buf[0], len, x >> 32);
- ttdevputhex32(&buf[c], len-c, (uint32_t) x);
- return sizeof(uint64_t) * 2;
-}
-
-static inline int ttdevputmem(char *buf, int len, const void *mem, int mem_len)
-{
- dassert(mem_len <= len);
- memcpy(buf, mem, mem_len);
- return mem_len;
-}
-
-static int ttdevputhdr(char *buf, int len,
- uint8_t rec_len, uint8_t tag, uintptr_t timestamp)
-{
- int c = 0;
- c += ttdevputhex8(&buf[c], len-c, rec_len);
- c += ttdevputhex8(&buf[c], len-c, tag);
- c += ttdevputhex64(&buf[c], len-c, timestamp);
- return c;
-}
-
-static size_t ttdev_readnamerec(void *va, long n, uint8_t tag,
- uintptr_t timestamp, uint64_t id,
- const char *name)
-{
- const int len = TT_SAFE_GENBUF_SZ; // Always leave room for newline
- char * const buffer = get_cur_genbuf();
- int remaining = strlen(name);
- size_t out_len = 0;
-
- /* Output first header */
- int nc = min(len - TTRACEH_NAME_LEN, remaining);
- int rec_len = TTRACEH_NAME_LEN + nc; dassert(rec_len <= len);
- int c = 0;
- c += ttdevputhdr(&buffer[c], len-c, rec_len, tag, timestamp);
- c += ttdevputhex64(&buffer[c], len-c, id);
- c += ttdevputmem(&buffer[c], len-c, name, nc);
- c += ttdevputchar(&buffer[c], len+1-c, '\n');
- out_len = ttdevcopyout(va, n, out_len, buffer, c);
-
- tag |= TTRACEH_TAG_CONT;
- const int buf_nm_len = len - TTRACEH_CONT_LEN;
- while ((remaining -= nc) > 0) {
- name += nc;
-
- nc = min(buf_nm_len, remaining);
- rec_len = TTRACEH_CONT_LEN + nc; dassert(rec_len <= len);
- c = 0;
- c += ttdevputhex8(&buffer[c], len-c, rec_len);
- c += ttdevputhex8(&buffer[c], len-c, tag);
- c += ttdevputmem(&buffer[c], len-c, name, nc);
- c += ttdevputchar(&buffer[c], len+1-c, '\n');
- out_len += ttdevcopyout(va, n, out_len, buffer, c);
+ uint64_t *sqp = (uint64_t *) entry;
+ char *dcp = ctxt->genbuf;
+	for (int i = 0; i < TTRACE_ENTRY_QUADS; i++) {
+		const uint64_t quad = sqp[i];
+		/* 16 hex digits per 64 bit quad, most significant nibble first */
+		for (int shift = 60; shift >= 0; shift -= 4)
+			*dcp++ = ttdevhexdigit((quad >> shift) & 0xf);
+		*dcp++ = ' ';
+	}
- return out_len;
+ dassert(&ctxt->genbuf[sizeof(ctxt->genbuf)] == dcp);
+ dcp[-1] = '\n'; // Replace trailing space with a newline
+
+ ctxt->c += ttdevcopyout(ctxt->va, ctxt->n, ctxt->c,
+ ctxt->genbuf, sizeof(ctxt->genbuf));
}
-static size_t ttdevread_info(void *va, long n, uintptr_t timestamp)
-{
- char * const buffer = get_cur_genbuf();
- const int len = TT_SAFE_GENBUF_SZ; /* Room for new line */
- const int rec_len = TTRACEH_LEN + 12; /* header + 3*h[4] versions */
- dassert(rec_len <= len);
- int c = 0;
-
- c += ttdevputhdr(&buffer[c], len-c, rec_len, TTRACEH_TAG_INFO, timestamp);
- c += ttdevputhex16(&buffer[c], len-c, TTRACEH_V1);
- c += ttdevputhex16(&buffer[c], len-c, TTRACEE_V1);
- c += ttdevputhex16(&buffer[c], len-c, num_cpus);
- dassert(c == rec_len); // Don't count '\n' in rec_len
- c += ttdevputchar(&buffer[c], len+1-c, '\n'); // Always have room for newline
-
- return ttdevcopyout(va, n, /* offset */ 0, buffer, c);
-}
-
-// iotimestamp takes the timestamp pointer and the I/Os offset and returns the
-// minimum timestamp last requested in a write. In the case where the channel
-// has been opened readonly we will complete the offset == 0 request and return
-// end of file for all subsequent (offset > 0) requests; this allows cat to
-// return one page of data.
+/* ttdevread_mintimestamp takes the timestamp id and the I/O's offset and
+ * returns the minimum timestamp last requested in a write. Only the
+ * offset == 0 request is completed; end of file is returned for all
+ * subsequent (offset > 0) requests, which allows cat to return one page of
+ * data. */
static inline uintptr_t ttdevread_mintimestamp(const int tsid, int64_t offset)
{
- // If we don't have a timestamp, then only satisfy a single I/O, that is
- // anything with an offset of 0
- if (!tsid && offset)
+	/* ttdevread_cpu cannot deal sensibly with offsets without making the
+	 * code much more complicated; probably not worth it. */
+ if (offset)
return 0;
const uintptr_t min_timestamp = ttdevtimestamp[tsid];
@@ -296,13 +251,42 @@
return min_timestamp;
}
+static inline long ttdevread_cpu(const int tsid,
+ int coreid, void *va, long n, int64_t offset)
+{
+ ERRSTACK(1);
+ const uintptr_t min_timestamp = ttdevread_mintimestamp(tsid, offset);
+ if (!min_timestamp)
+ return 0;
+
+ struct ttdevread_cpu_ctxt *ctxt = kzalloc(sizeof(*ctxt), KMALLOC_WAIT);
+ if (!ctxt)
+ error(Enomem);
+ else if (waserror()) {
+ kfree(ctxt);
+ nexterror();
+ }
+
+	ctxt->min_timestamp = min_timestamp;
+ ctxt->va = va;
+ ctxt->n = n;
+
+ struct trace_ring * const ring = get_ttrace_ring_for_core(coreid);
+ trace_ring_foreach(ring, &ttdevread_cpu_entry, ctxt);
+
+	const int64_t count = ctxt->c;
+	kfree(ctxt);
+	poperror();
+	return count;
+}
+
static inline long ttdevread_ctl(void *va, long n, int64_t offset)
{
- // Read the ttrace_type_mask and create a 'setmask' ctl command
- //
- // cmd ttrace_bits bit mask
- // setmask 0x0123456789abcdef 0x0123456789abcdef\n"
- // 123456789012345678901234567890123456789012345 6 len 46 bytes
+ /* Read the ttrace_type_mask and create a 'setmask' ctl command
+ *
+ * cmd ttrace_bits bit mask
+ * setmask 0x0123456789abcdef 0x0123456789abcdef\n"
+ * 123456789012345678901234567890123456789012345 6 len 46 bytes
+ */
char * const buffer = get_cur_genbuf();
static_assert(TTRACE_CTL_LEN <= GENBUF_SZ);
@@ -313,53 +297,52 @@
return readstr(offset, va, n, buffer);
}
-static inline long ttdevread_data(const int tsid,
- uint8_t *va, long n, int64_t offset)
+/* This code will be more efficient if the user data is page aligned, but
+ * should work no matter which alignment the user gives.
+ * Output:
+ *   Page 0:   struct ttrace_version
+ *   Page 1-n: Auxiliary buffer.
+ */
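+/* Reader sketch (illustrative only; "fd" is an open of #T/aux and aux_size is
+ * the auxiliary buffer size, assumed known to the reader):
+ *   uint8_t *buf = malloc(PGSIZE + aux_size);
+ *   read(fd, buf, PGSIZE + aux_size);            // single read at offset 0
+ *   struct ttrace_version *v = (struct ttrace_version *) buf;
+ *   // entries live in buf + PGSIZE at offsets (masked by v->buffer_mask)
+ *   // between v->first_offset and v->last_offset; step between them with
+ *   // ttrace_aux_entry_next() and check each entry's valid flag.
+ */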
+static inline long ttdevread_aux(uint8_t *va, long n, int64_t offset)
{
- const int min_timestamp = ttdevread_mintimestamp(tsid, offset);
- if (!min_timestamp)
- return 0;
+ ptrdiff_t dummy_offset;
+ struct ttrace_version vers;
+ fill_ttrace_version(&vers);
- // All data requested, Copy out basic data: version ids and syscall table
- size_t c = 0; // Number of characters output
- if (min_timestamp == 1) {
- c += ttdevread_info(&va[c], n - c, min_timestamp);
- for (size_t i = 0; i < max_syscall; i++) {
- const char * const name = syscall_table[i].name;
- if (!name) continue;
+ const long buffer_length = vers.buffer_mask + 1;
+ if (offset)
+ return 0; // Only allow single reads at offset 0, all others are empty
+ else if (n < PGSIZE + buffer_length)
+ error(Etoosmall);
- c += ttdev_readnamerec(&va[c], n - c, TTRACEH_TAG_SYSC,
- min_timestamp, i, name);
- }
- }
+ size_t c = PGSIZE; // Advance count to second page
- // Read ttrace data file, contains names and other data, this data is slow
- // to store and generate and is expected to be done only rarely. Such as at
- // process, semaphore, ktask creation.
+	/* Implements the reader side of the auxiliary buffer protocol, see the
+	 * _ttrace_point_string comment in ttrace.c.
+	 *
+	 * Touch memory to get any page faults out of the way now, hopefully we
+	 * will not be under paging pressure. Note that we accumulate into
+	 * vers.last_offset so that the compiler doesn't throw out the memory
+	 * touch loop; vers.last_offset is reset when we take a buffer snapshot.
+	 *
+	 * TODO(gvdl): formalise memory pinning for later I/O.
+	 */
+	vers.last_offset = 0;
+	size_t t = PGSIZE + ((uintptr_t) va & (sizeof(long) - 1));
+	for (; t < n; t += PGSIZE)
+		vers.last_offset += atomic_read((atomic_t *) &va[t]);
+
+ get_ttrace_aux_buffer_snapshot(&dummy_offset, &vers.last_offset);
+ const uint8_t * const aux_buffer = get_ttrace_aux_buffer();
+ c += ttdevcopyout(va, n, c, aux_buffer, buffer_length);
+	get_ttrace_aux_buffer_snapshot(&vers.first_offset, &dummy_offset);
+
+ /* Output version with buffer offsets last */
+ ttdevcopyout(va, n, 0, &vers, sizeof(vers));
+
return c;
}
-static inline long ttdevread_cpu(const int tsid,
- int core_id, void *va, long n, int64_t offset)
-{
- const int min_timestamp = ttdevread_mintimestamp(tsid, offset);
- if (!min_timestamp)
- return 0;
-
-#warning Scaffolding, test cpu timestamp reading
- // Read the timestamp and output it in decimal (max 20 digits).
- // cmd timestamp
- // setts 12345678901234567890\n"
- // 12345678901234567890123456 7 len 27
- char * const buffer = get_cur_genbuf();
- static_assert(27 <= GENBUF_SZ);
-
- int c = snprintf(buffer, GENBUF_SZ, "setts %lld\n", min_timestamp);
- dassert(c <= 27);
-
- return readstr(offset, va, n, buffer);
-}
-
/*
* ttrace write utility routines and macros
*/
@@ -372,8 +355,6 @@
return ret;
}
-#define CONST_STRNCMP(vp, conststr) strncmp((vp), conststr, sizeof(conststr)-1)
-
/*
* ttrace devtab entry points
*/
@@ -381,29 +362,32 @@
{
static_assert(MAX_NUM_CPUS <= Maskcpu); // Assert encoding is still good
- // Support upto 8 simultaneous opens on the ttrace/{data,cpu*} files.
+	/* Support up to 8 simultaneous opens on the ttrace/cpunnn files. */
const int pool_size
- = min(TTRACE_MAX_TSID, TTRACE_NUM_OPENERS * (1 + num_cpus));
- // Test for too many cpus for our tsid mechanism, re-implement.
- dassert(1 + num_cpus <= pool_size);
- if (1 + num_cpus > pool_size) return;
+ = min(TTRACE_MAX_TSID, TTRACE_NUM_OPENERS * num_cpus);
+ /* Test for too many cpus for our tsid mechanism, re-implement. */
+ dassert(num_cpus <= pool_size);
+ if (num_cpus > pool_size) {
+		printk("Insufficient ids for ttrace timestamp pool\n");
+ return;
+ }
const size_t ts_size = pool_size * sizeof(*ttdevtimestamp);
ttdevtimestamp = kmalloc(ts_size, KMALLOC_WAIT);
memset(ttdevtimestamp, 0xff, ts_size);
ttdevtspool = create_u16_pool(pool_size);
- // Always allocate 0 as a unused/NULL sentinel
- int tsidnull = get_u16(ttdevtspool);
+	/* Always allocate 0 as an unused/NULL sentinel */
+	int tsidnull = get_tsid();
assert(!tsidnull);
- ttdevtimestamp[tsidnull] = 1; // tsid==0 always has a 1 minimum timestamp.
+ ttdevtimestamp[tsidnull] = 1; // tsid[0] is set to timestamp of 1.
}
-#define kfree_and_null(x) do { kfree(x); x = NULL; } while(false)
+#define KFREE_AND_NULL(x) do { kfree(x); x = NULL; } while(false)
static void ttdevshutdown(void)
{
- kfree_and_null(ttdevtspool);
- kfree_and_null(ttdevtimestamp);
+ KFREE_AND_NULL(ttdevtspool);
+ KFREE_AND_NULL(ttdevtimestamp);
}
static struct chan *ttdevattach(char *spec)
@@ -437,14 +421,12 @@
error(Eperm);
break;
- case Ttdevdataqid:
case Ttdevcpudataqid:
if (tsid)
break; // Already allocated, reopen
- // Allocate a timestamp from the pool
if (o == O_RDWR) {
- tsid = get_u16(ttdevtspool);
+			tsid = get_tsid();
if (tsid < 0)
error(Enoenv);
else
@@ -457,6 +439,7 @@
error(Eperm);
break;
+ case Ttdevauxqid:
case Ttdevctlqid:
break;
}
@@ -475,14 +458,14 @@
const int tsid = TTTSID(c->qid);
switch (TTYPE(c->qid)) {
- case Ttdevdataqid:
case Ttdevcpudataqid:
- // Release timestamp
+ /* Release timestamp */
if (tsid) {
ttdevtimestamp[tsid] = -1;
- put_u16(ttdevtspool, tsid);
+			put_tsid(tsid);
}
break;
+ case Ttdevauxqid:
case Ttdevctlqid:
case Ttdevdirqid:
break;
@@ -501,14 +484,15 @@
return devdirread(c, va, n, NULL, 0, ttdevgen);
case Ttdevctlqid:
return ttdevread_ctl(va, n, offset);
- case Ttdevdataqid:
- return ttdevread_data(tsid, va, n, offset);
+ case Ttdevauxqid:
+ return ttdevread_aux(va, n, offset);
case Ttdevcpudataqid:
return ttdevread_cpu(tsid, TTCPU(c->qid), va, n, offset);
}
return 0; // Not us
}
+#define CONST_STRNCMP(vp, conststr) strncmp((vp), conststr, sizeof(conststr)-1)
static long ttdevwrite(struct chan *c, void *a, long n, int64_t unused_off)
{
ERRSTACK(1);
@@ -516,7 +500,7 @@
struct cmdbuf *cb;
static const char ctlstring[]
= "setmask <value> <mask>|setbits <value>|clrbits <mask>";
- static const char tsstring[] = "setts <minimum timestamp>";
+ static const char tsstring[] = "settimestamp <minimum timestamp>";
cb = parsecmd(a, n);
if (waserror()) {
@@ -545,10 +529,10 @@
else
error(ctlstring);
- // Thread safe, but... lets face it if we have competing controllers
- // setting anc clearing mask bits then the behaviour is going to be
- // unexpected. Perhaps I should enforce exclusive open of the ctl
- // channel.
+ /* Thread safe, but... lets face it if we have competing controllers
+ * setting and clearing mask bits then the behaviour is going to be
+ * unexpected. Perhaps we could enforce exclusive open of the ctl
+ * channel. */
{
uint64_t cur_mask, new_mask;
do {
@@ -560,8 +544,7 @@
break;
case Ttdevcpudataqid:
- case Ttdevdataqid:
- if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "setts")) {
+ if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "settimestamp")) {
if (!tsid) error(Ebadfd);
const char *endptr = NULL;
diff --git a/kern/include/ros/ttrace.h b/kern/include/ros/ttrace.h
index a1fc87f..f4191ce 100644
--- a/kern/include/ros/ttrace.h
+++ b/kern/include/ros/ttrace.h
@@ -3,73 +3,146 @@
* See LICENSE for details.
*
* Timed tracing.
- *
- * This is the first cut at a TTRACE header. At the moment only sufficient type
- * information for the initial implementation of kprof_ttrace logging output
- * subsystem to be prototyped. That is this file is currently part of the
- * scaffolding necessary to bring up a comlicated facility and break it up into
- * reasonably sized changes.
- */
+ * TODO(gvdl): Documentation goes here. */
#ifndef INC_ROS_TTRACE_H
#define INC_ROS_TTRACE_H
-#ifndef CONFIG_TTRACE
-
-// If the TTRACE is not configured then null out the initter
-#define ttrace_init() do { } while(false)
-#define ttrace_cleanup() do { } while(false)
-
-#else /* CONFIG_TTRACE */
-
-extern uint64_t ttrace_type_mask;
+#define TTRACE_DEV "#T"
+#define TTRACE_DEV_CTL TTRACE_DEV "/ctl"
+#define TTRACE_DEV_AUX TTRACE_DEV "/aux"
+#define TTRACE_DEV_CPUFMT TTRACE_DEV "/cpu%03d"
enum {
+ TTRACE_ENTRY_MASK = 0x0000000000000003ULL, /* two spare bits */
+ /* Exact meaning varies depends on TYPE */
+ TTRACE_ENTRY_START = 0x0000000000000000ULL, /* Start/unblock */
+ TTRACE_ENTRY_STOP = 0x0000000000000001ULL, /* Stop/block */
+ TTRACE_ENTRY_ENTRY = 0x0000000000000002ULL, /* Entry */
+ TTRACE_ENTRY_EXIT = 0x0000000000000003ULL, /* Exit */
+
+ /* Tag mask for the ctx union */
+ TTRACE_CTX_MASK = 0x0000000000000070ULL, /* one spare bit */
+ TTRACE_CTX_KTASK = 0x0000000000000000ULL, /* ctx.kthread */
+ TTRACE_CTX_RKMSG = 0x0000000000000010ULL, /* ctx.kthread */
+ TTRACE_CTX_PIDSYSC = 0x0000000000000020ULL, /* ctx.pid_syscall */
+ TTRACE_CTX_PIDTRAP = 0x0000000000000030ULL, /* ctx.pid_syscall */
+ TTRACE_CTX_IRQ = 0x0000000000000040ULL, /* ctx.ctx_depth */
+ TTRACE_CTX_TRAP = 0x0000000000000050ULL, /* ctx.ctx_depth */
+
+ /* 12 bits to encode cpu_id */
+ TTRACE_TYPE_PCPU_MASK = 0x00000000000fff00ULL, /* Not set by kernel */
+
/* Block of scheduling tracking trace type bits */
TTRACE_TYPE_MASK = 0xfffffffffff00000ULL,
+ TTRACE_TYPE_SCHED = 0x0000000000100000ULL,
+ TTRACE_TYPE_SEMA = 0x0000000000200000ULL,
+ TTRACE_TYPE_RWLOCK = 0x0000000000400000ULL,
+ TTRACE_TYPE_SYSCALL = 0x0000000000800000ULL,
+ TTRACE_TYPE_INTR = 0x0000000001000000ULL,
+ TTRACE_TYPE_PROCESS = 0x0000000002000000ULL,
- /* ttrace/data tags and misc */
- TTRACEH_V1 = 0x0100, /* Version 1.0 ttrace/data format */
- TTRACEE_V1 = 0x0090, /* Version 0.9 ttrace/cpu* scaffold */
- TTRACEH_TAG_CONT = 0x80, /* Continuation record top bit set */
- TTRACEH_TAG_INFO = 0x01, /* Data file info version */
- TTRACEH_TAG_SYSC = 0x02, /* Syscall entry */
- TTRACEH_TAG_PROC = 0x03, /* Proc name */
- TTRACEH_TAG_KTASK = 0x04, /* Ktask name */
- TTRACEH_TAG_SEM = 0x05, /* Semaphore name */
- TTRACEH_TAG_CV = 0x06, /* Condition name */
- TTRACEH_TAG_QLOCK = 0x07, /* Qlock name */
+ /* ttrace_version */
+	TTRACEVH_MAGIC = 0x0001020304050607ULL,
+	TTRACEVH_CIGAM = 0x0706050403020100ULL,
+	TTRACEVH_H_V1_0 = 0x00010000, /* Version 1.0 ttrace_version header */
+	TTRACEVH_S_V1_0 = 0x00010000, /* Version 1.0 ttrace/aux shared data format */
+	TTRACEVH_C_V1_0 = 0x00010000, /* Version 1.0 ttrace/cpu* format */
- /* ttrace/data header lengths */
- TTRACEH_CONT_LEN = 2 + 2, /* len, tag */
- TTRACEH_LEN = TTRACEH_CONT_LEN + 16, /* len + tag + timestamp */
- TTRACEH_NAME_LEN = TTRACEH_LEN + 16, /* header len + id len */
-};
+ /* ttrace/shared data tags */
+ TTRACES_TAG_CONT = 0x80000000, /* Tag continuation, top bit set */
+ TTRACES_TAG_INFO = 0x00000001, /* Data file specification */
+ TTRACES_TAG_SYSC = 0x00000002, /* Syscall entry */
+ TTRACES_TAG_PROC = 0x00000003, /* Proc name */
+ TTRACES_TAG_KTASK = 0x00000004, /* Ktask name */
+ TTRACES_TAG_SEM = 0x00000005, /* Semaphore name */
+ TTRACES_TAG_CV = 0x00000006, /* Condition name */
+ TTRACES_TAG_QLOCK = 0x00000007, /* Qlock name */
+};
-// Timed trace data version 1 record format,
-// All lines are nl terminated.
-// where h[n]-> n hex digits, s[n] -> n byte string
-//
-// Header:
-// h[2]: len Max 127(0xff)
-// h[2]: tag
-// h[16]: timestamp
-//
-// Info record:
-// h[20]: header, timestamp is always 1 tag: TTRACEH_TAG_INFO
-// h[4]: summary data version
-// h[4]: cpu data version
-// h[4]: num cpus
-//
-// Name record:
-// h[20]: header
-// h[16]: id; uintptr_t, such as kthread, pid or syscall number
-// s[hdr.len-36]: string (not nul terminated)
-//
-// Name continuation record(no timestamp), always follows a Name record:
-// h[2]: len
-// h[2]: tag (top bit set)
-// s[hdr.len-4]: string (not nul terminated)
-//
+// Binary information is always in host endianness, the reader makes right
+// assert(((struct ttrace_version *) <hdrp>)->magic == TTRACEVH_MAGIC);
+// TODO: Endianness correcting inlines when required
-#endif /* CONFIG_TTRACE */
+struct ttrace_version {
+ uint64_t magic; // Endianness of ttrace binary data, reader makes right
+ uint32_t header_version; // Version of this header
+ uint32_t shared_data_version; // Version of shared data
+ uint32_t percpu_data_version; // Version of percpu entries
+ uint32_t num_cpus;
+ ptrdiff_t buffer_mask; // Mask for offsets into shared buffer
+	ptrdiff_t first_offset;		// First valid data in auxiliary buffer
+	ptrdiff_t last_offset;		// Last data entry in auxiliary buffer
+} __attribute__ ((packed));
+
+struct ttrace_type {
+ /* 0x00 */ uint64_t type_id;
+ /* 0x08 */ union {
+ uintptr_t kthread;
+ uintptr_t pid_syscall;
+ uint32_t ctx_depth;
+ } ctx;
+} __attribute__ ((packed));
+
+/* ttrace entry must be a 64byte (that is power of two cachelines) */
+struct ttrace_entry {
+ /* 0x00 */ uint64_t timestamp;
+ /* 0x08 */ struct ttrace_type t;
+ /* 0x18 */ uintptr_t data0;
+ /* 0x20 */ uintptr_t data1;
+ /* 0x28 */ uintptr_t data2;
+ /* 0x30 */ uintptr_t data3;
+ /* 0x38 */ uintptr_t data4;
+} __attribute__ ((packed));
+
+/* Continuations have the same timestamp with 0x1 bit set */
+struct ttrace_entry_continuation {
+ /* 0x00 */ uint64_t timestamp;
+ /* 0x08 */ uintptr_t data0;
+ /* 0x10 */ uintptr_t data1;
+ /* 0x18 */ uintptr_t data2;
+ /* 0x20 */ uintptr_t data3;
+ /* 0x28 */ uintptr_t data4;
+ /* 0x30 */ uintptr_t data5;
+ /* 0x38 */ uintptr_t data6;
+} __attribute__ ((packed));
+
+// The length (encoded in path) is in bytes, but records are laid down on
+// 64 bit boundaries; the in-memory length of a record is
+// ttrace_aux_align(TTRACE_AUX_LEN(path)).
+struct ttrace_aux_entry {
+ uint64_t path; // See path decode macros below
+ uint64_t timestamp;
+ // Payload
+ uintptr_t ident; // Optional: Identifier for record, eg. syscall number
+ uint8_t string[]; // String is not nul terminated
+} __attribute__ ((packed));
+
+#define TTRACE_AUX_PATH(tag, coreid, len, flags) ({ \
+ const uint64_t t = (uint32_t) tag; \
+ const uint64_t c = (uint16_t) coreid; \
+ const uint64_t paylen = (uint16_t) len - 16; \
+ const uint64_t f = flags & 0xf; \
+ (t << 32) | c << 16 | paylen << 4 | f; \
+})
+#define TTRACE_AUX_TAG(path) ((path >> 32))
+#define TTRACE_AUX_CPUID(path) ((path >> 16) & 0xffff)
+#define TTRACE_AUX_LEN(path) (((path >> 4) & 0xfff) + 16)
+#define TTRACE_AUX_FLAGS(path) (path & 0xf)
+#define TTRACE_AUX_ENTRY_MAXLEN ((size_t) 4096)
+#define TTRACE_AUX_ENTRY_MAXPAYLEN (TTRACE_AUX_ENTRY_MAXLEN - 16)
+#define TTRACE_AUX_ENTRY_MAXSTRLEN \
+ (TTRACE_AUX_ENTRY_MAXLEN - sizeof(struct ttrace_aux_entry))
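+
+/* path field layout (from the macros above):
+ *   [63:32] tag   [31:16] coreid   [15:4] record len - 16   [3:0] flags
+ * Flag bit 0 is used by ttrace.c as the entry-invalid marker. */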
+
+static inline size_t ttrace_aux_align(ptrdiff_t offset)
+{
+ const size_t align_mask = sizeof(uint64_t) - 1; // 8 byte alignment
+	return (offset + align_mask) & ~align_mask;	// Round up to 8 bytes
+}
+
+static inline struct ttrace_aux_entry *ttrace_aux_entry_next(
+ const struct ttrace_aux_entry * const entry)
+{
+	const size_t len = ttrace_aux_align(TTRACE_AUX_LEN(entry->path));
+	return (struct ttrace_aux_entry *) &((uint8_t *) entry)[len];
+}
+
#endif /* INC_ROS_TTRACE_H */
diff --git a/kern/include/trace.h b/kern/include/trace.h
index 611eb11..b854f04 100644
--- a/kern/include/trace.h
+++ b/kern/include/trace.h
@@ -28,10 +28,11 @@
* }
* trace_ring_foreach(my_trace_ring_ptr, trace_handler, optional_blob);
*
- * Rings can be racy or not, and can overwrite entries or not. If you are not
- * overwriting, the ring will stop giving you slots. You need to reset the ring
- * to get fresh slots again. If you are overwriting, you don't need to check
- * the return value of get_trace_slot_overwrite().
+ * Rings can be smp safe, per cpu or racy, and can overwrite entries or not.
+ * If you are not overwriting, the ring will stop giving you slots. You need
+ * to reset the ring to get fresh slots again. If you are overwriting, you
+ * don't need to check the return value of get_trace_slot_overwrite(). Per cpu
+ * rings are interrupt safe.
*
* Given there is overwrite, tr_next doesn't really tell us which ones were
* used. So your handler should check for a flag or something. Timestamps
@@ -41,10 +42,11 @@
#define ROS_INC_TRACE_H
#include <ros/common.h>
+#include <arch/arch.h>
#include <assert.h>
struct trace_ring {
- unsigned char *tr_buf;
+ uint8_t *tr_buf;
size_t tr_buf_sz;
unsigned int tr_event_sz_shift;
unsigned int tr_max;
@@ -85,6 +87,18 @@
return __get_tr_slot(tr, slot);
}
+/* Interrupt safe modification of tr_next */
+static inline unsigned int
+__get_next_percpu_and_add(struct trace_ring *tr, int8_t n)
+{
+ int8_t irq_state = 0;
+ disable_irqsave(&irq_state);
+ unsigned int ret = tr->tr_next;
+ tr->tr_next += n;
+ enable_irqsave(&irq_state);
+ return ret;
+}
+
static inline void *get_trace_slot(struct trace_ring *tr)
{
/* Using syncs, instead of atomics, since we access tr_next as both atomic
@@ -104,6 +118,22 @@
return __get_tr_slot_overwrite(tr, __sync_fetch_and_add(&tr->tr_next, 1));
}
+static inline void *get_trace_slot_percpu(struct trace_ring *tr)
+{
+ unsigned long my_slot = __get_next_percpu_and_add(tr, 1);
+ if (my_slot >= tr->tr_max) {
+ /* See comment in get_trace_slot. */
+ __get_next_percpu_and_add(tr, -1);
+ return 0;
+ }
+ return __get_tr_slot(tr, my_slot);
+}
+
+static inline void *get_trace_slot_overwrite_percpu(struct trace_ring *tr)
+{
+ return __get_tr_slot_overwrite(tr, __get_next_percpu_and_add(tr, 1));
+}
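+/* The *_percpu variants assume the ring is only ever filled from its owning
+ * core; they disable IRQs around the tr_next update so a tracepoint taken in
+ * interrupt context cannot race with one taken in the interrupted code. */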
+
static inline void *get_trace_slot_racy(struct trace_ring *tr)
{
unsigned long my_slot = tr->tr_next;
diff --git a/kern/include/ttrace.h b/kern/include/ttrace.h
new file mode 100644
index 0000000..4b38c58
--- /dev/null
+++ b/kern/include/ttrace.h
@@ -0,0 +1,132 @@
+/* TODO(gvdl): Who holds the copyright?
+ * Godfrey van der Linden <gvdl@google.com>
+ * See LICENSE for details.
+ *
+ * Timed tracing.
+ * Kernel header
+ * TODO(gvdl): Documentation goes here. */
+#ifndef INC_TTRACE_H
+#define INC_TTRACE_H
+
+#define TTRACE_NULL_STATEMENT do { } while(0)
+
+#ifndef CONFIG_TTRACE
+
+// If TTRACE is not configured then null out the initialisers
+#define ttrace_init() TTRACE_NULL_STATEMENT
+#define ttrace_cleanup() TTRACE_NULL_STATEMENT
+
+#else /* CONFIG_TTRACE */
+
+#include <atomic.h>
+#include <ros/ttrace.h>
+
+// Global data
+
+// Active tracepoint mask, see TTRACE_TYPE... enums in ros/ttrace.h
+extern uint64_t ttrace_type_mask;
+
+// Two phase initialisation. Call ttrace_init() as soon as page memory
+// allocation is possible so that we can trace very early in system boot.
+// However, that early in boot we do not yet know how many cores we will
+// manage; ttrace_cleanup() is called once we know and returns the memory
+// previously allocated for unused cores.
+extern void ttrace_init();
+extern void ttrace_cleanup();
+
+// devttrace accessor routines
+struct trace_ring* get_ttrace_ring_for_core(uint32_t coreid);
+void fill_ttrace_version(struct ttrace_version *versp);
+const uint8_t *get_ttrace_aux_buffer(void);
+void get_ttrace_aux_buffer_snapshot(ptrdiff_t *firstp, ptrdiff_t *lastp);
+
+// Low level data collection apis, do not use directly but rather use the macro
+// wrappers, such as TTRACE_PROC_ALLOC() below.
+extern uint64_t _ttrace_point(uint64_t type_start_stop,
+ uintptr_t d0, uintptr_t d1, uintptr_t d2,
+ uintptr_t d3, uintptr_t d4);
+extern void _ttrace_point_cont(uint64_t timestamp, uintptr_t d0, uintptr_t d1,
+ uintptr_t d2, uintptr_t d3, uintptr_t d4,
+ uintptr_t d5, uintptr_t d6);
+// Slow: takes the rwlock and updates the auxiliary buffer; call this rarely
+extern void _ttrace_point_string(uint8_t tag, uintptr_t ident, const char *str);
+
+static inline uintptr_t __attribute__((always_inline))
+take_tracepoint(uintptr_t type_start_stop)
+{
+ uintptr_t cur_mask = atomic_read((atomic_t *) &ttrace_type_mask);
+ return cur_mask & type_start_stop & TTRACE_TYPE_MASK;
+}
+
+static inline uint64_t _ttrace_point5(uint64_t type_start_stop,
+ uintptr_t d0, uintptr_t d1, uintptr_t d2,
+ uintptr_t d3, uintptr_t d4)
+{
+ if (take_tracepoint(type_start_stop))
+ return _ttrace_point(type_start_stop, d0, d1, d2, d3, d4);
+ return 0;
+}
+#define TTRACE_POINT5(type, start_stop, data...) do { \
+ static_assert(type & TTRACE_TYPE_MASK); \
+ _ttrace_point5(type | start_stop, data); \
+} while(false)
+
+static inline uint64_t _ttrace_point12(uint64_t type_start_stop,
+ uintptr_t d0, uintptr_t d1, uintptr_t d2,
+ uintptr_t d3, uintptr_t d4, uintptr_t d5,
+ uintptr_t d6, uintptr_t d7, uintptr_t d8,
+ uintptr_t d9, uintptr_t da, uintptr_t db)
+{
+ if (take_tracepoint(type_start_stop)) {
+ const uint64_t tsc = _ttrace_point(type_start_stop, d0, d1, d2, d3, d4);
+ _ttrace_point_cont(tsc, d5, d6, d7, d8, d9, da, db);
+ return tsc;
+ } else
+ return 0;
+}
+#define TTRACE_POINT12(type, start_stop, data...) do { \
+ static_assert(type & TTRACE_TYPE_MASK); \
+ _ttrace_point12(type | start_stop, data); \
+} while(false)
+
+static inline void _ttrace_string(uint64_t type, uint8_t tag,
+ uintptr_t ident, const char* str)
+{
+ if (take_tracepoint(type))
+ _ttrace_point_string(tag, ident, str);
+}
+// string len < TTRACE_AUX_ENTRY_MAXSTRLEN
+#define TTRACE_STRING(type, tag, ident, str) do { \
+ static_assert(type & TTRACE_TYPE_MASK); \
+ static_assert(tag & TTRACEH_TAG_MASK); \
+	_ttrace_string(type, tag, ident, str); \
+} while(false)
+
+//
+// CONFIG_TTRACE macros
+//
+// These macros are the data collection side of ttrace. Each CONFIG_TTRACE type
+// has its own group of macros which are empty if the group is not configured.
+
+// CONFIG_TTRACE_PROCESS trace point macros.
+#ifdef CONFIG_TTRACE_PROCESS
+#define TTRACE_PROC_ALLOC(pid, ppid) TTRACE_POINT5( \
+ TTRACE_TYPE_PROCESS, TTRACE_ENTRY_START, pid, ppid, 0, 0, 0);
+#define TTRACE_PROC_FREE(pid, cpid) TTRACE_POINT5( \
+ TTRACE_TYPE_PROCESS, TTRACE_ENTRY_STOP, pid, cpid, 0, 0, 0);
+#define TTRACE_PROC_READY(pid) TTRACE_POINT5( \
+ TTRACE_TYPE_PROCESS, TTRACE_ENTRY_ENTRY, pid, 0, 0, 0, 0);
+#define TTRACE_PROC_SETSTATE(pid, state, curstate) TTRACE_POINT5( \
+ TTRACE_TYPE_PROCESS, TTRACE_ENTRY_EXIT, pid, state, curstate, 0, 0);
+#define TTRACE_PROC_SETNAME(pid, name) \
+	TTRACE_STRING(TTRACE_TYPE_PROCESS, TTRACES_TAG_PROC, pid, name)
+#else /* !CONFIG_TTRACE_PROCESS */
+#define TTRACE_PROC_ALLOC(pid, ppid) TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_FREE(pid, cpid) TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_READY(pid) TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_SETSTATE(pid, state, curstate) TTRACE_NULL_STATEMENT
+#define TTRACE_PROC_SETNAME(pid, name) TTRACE_NULL_STATEMENT
+#endif /* CONFIG_TTRACE_PROCESS */
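+
+// Example (sketch only; the call site and field names are illustrative, not
+// part of this change): a process allocator might record
+//   TTRACE_PROC_ALLOC(p->pid, parent ? parent->pid : 0);
+// which compiles to nothing when CONFIG_TTRACE_PROCESS is not set.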
+
+#endif /* !CONFIG_TTRACE */
+#endif /* INC_TTRACE_H */
diff --git a/kern/src/Kbuild b/kern/src/Kbuild
index e62fdce..4cce4b4 100644
--- a/kern/src/Kbuild
+++ b/kern/src/Kbuild
@@ -52,6 +52,7 @@
obj-y += taskqueue.o
obj-y += time.o
obj-y += trace.o
+obj-$(CONFIG_TTRACE) += ttrace.o
obj-y += trap.o
obj-y += ucq.o
obj-y += umem.o
diff --git a/kern/src/init.c b/kern/src/init.c
index be24026..ff49a5a 100644
--- a/kern/src/init.c
+++ b/kern/src/init.c
@@ -43,6 +43,7 @@
#include <ip.h>
#include <acpi.h>
#include <coreboot_tables.h>
+#include <ttrace.h>
int booting = 1;
struct sysinfo_t sysinfo;
@@ -65,6 +66,7 @@
cache_init(); // Determine systems's cache properties
pmem_init(multiboot_kaddr);
+ ttrace_init(); // Setup timed tracer, depends on pmem
kmem_cache_init(); // Sets up slab allocator
kmalloc_init();
hashtable_init();
@@ -85,6 +87,7 @@
kb_buf_init(&cons_buf);
arch_init();
block_init();
+ ttrace_cleanup(); // Must be called after arch_init()
enable_irq();
run_linker_funcs();
/* reset/init devtab after linker funcs 3 and 4. these run NIC and medium
diff --git a/kern/src/ttrace.c b/kern/src/ttrace.c
new file mode 100644
index 0000000..9cfa60a
--- /dev/null
+++ b/kern/src/ttrace.c
@@ -0,0 +1,402 @@
+/* TODO(gvdl): Who holds the copyright?
+ * Godfrey van der Linden <gvdl@google.com>
+ * See LICENSE for details.
+ *
+ * Timed tracing.
+ *
+ * This file is only included if CONFIG_TTRACE=y
+ * TODO(gvdl): Documentation goes here. */
+
+#include <ttrace.h>
+
+#include <arch/arch.h>
+#include <atomic.h>
+#include <env.h>
+#include <page_alloc.h>
+#include <rwlock.h>
+#include <smp.h>
+#include <syscall.h>
+#include <trace.h>
+#include <trap.h>
+
+#ifndef min
+#define min(a, b) ({ \
+ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
+#endif
+
+// Shared buffer must be a power of 2, static_asserted in ttrace_init.
+#define TTRACE_AUX_BUFFER_MB ((size_t) 1)
+#define TTRACE_AUX_BUFFER_SZ (TTRACE_AUX_BUFFER_MB * 1024 * 1024)
+#define TTRACE_AUX_BUFFER_MASK (TTRACE_AUX_BUFFER_SZ - 1)
+
+// TODO(gvdl): Need to make this a config parameter
+// TODO(gvdl): Implement dynamic buffer resize, virtually contig pages
+#define TTRACE_PER_CPU_BUFFER_MB ((size_t) 1) // 16K ttrace entries per cpu
+
+#define TTRACE_PER_CPU_BUFFER_SZ (TTRACE_PER_CPU_BUFFER_MB * 1024 * 1024)
+#define PER_CPU_ORDER ({ \
+ const uint64_t per_cpu_pages = TTRACE_PER_CPU_BUFFER_SZ / PGSIZE; \
+ (size_t) LOG2_UP(per_cpu_pages); \
+})
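+// PER_CPU_ORDER is the contiguous page allocation order handed to
+// get_cont_pages()/free_cont_pages() for one per cpu trace ring.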
+
+// Global data
+uint64_t ttrace_type_mask;
+
+// Local data
+static rwlock_t ttrace_aux_rwlock;
+static ptrdiff_t ttrace_aux_first, ttrace_aux_last; // Offsets into aux
+static uint8_t ttrace_aux_buffer[TTRACE_AUX_BUFFER_SZ];
+static struct trace_ring ttrace_ring[MAX_NUM_CPUS];
+
+//
+// TTRACE implementation
+//
+void ttrace_init()
+{
+ // Single tracepoint entries should be atomically written.
+ static_assert(sizeof(struct ttrace_entry) == ARCH_CL_SIZE);
+ static_assert(
+ sizeof(struct ttrace_entry_continuation) == sizeof(struct ttrace_entry));
+ static_assert(sizeof(ttrace_type_mask) == sizeof(atomic_t));
+ static_assert(IS_PWR2(TTRACE_AUX_BUFFER_MB));
+
+ atomic_init((atomic_t *) &ttrace_type_mask, 0);
+ rwinit(&ttrace_aux_rwlock);
+
+ /* ttrace_init() is called as soon as memory is initialised so that we can
+ * profile the early boot sequence if we wish. But that means it is called
+ * before arch_init and until arch_init is called num_cpus is invalid. So
+ * we allocate as much memory as we'll ever need here and will give the
+ * memory back in ttrace_cleanup() which is called much later in the init
+ * process. */
+ const size_t per_cpu_order = PER_CPU_ORDER;
+ const size_t per_cpu_bytes = (1 << per_cpu_order) * PGSIZE;
+
+ for (int i = 0; i < MAX_NUM_CPUS; i++) {
+ /* TODO(gvdl): Only need virtually contiguous pages! */
+ trace_ring_init(&ttrace_ring[i], get_cont_pages(per_cpu_order, 0),
+ TTRACE_PER_CPU_BUFFER_SZ, sizeof(struct ttrace_entry));
+ }
+
+	// Quickly record all of the system calls, known at compile time
+ for (size_t i = 0; i < max_syscall; i++) {
+ const char * const name = syscall_table[i].name;
+ if (name)
+			_ttrace_point_string(TTRACES_TAG_SYSC, i, name);
+ }
+}
+
+void ttrace_cleanup()
+{
+ /* Give back un-used trace_rings, assumes core_id is contiguous from 0 */
+ const size_t per_cpu_order = PER_CPU_ORDER;
+ for (int i = num_cpus; i < MAX_NUM_CPUS; i++) {
+ free_cont_pages(ttrace_ring[i].tr_buf, per_cpu_order);
+ memset(&ttrace_ring[i], 0, sizeof(ttrace_ring[i]));
+ }
+}
+
+//
+// devttrace interface
+//
+struct trace_ring* get_ttrace_ring_for_core(uint32_t coreid)
+{
+ return coreid < num_cpus? &ttrace_ring[coreid] : NULL;
+}
+
+void fill_ttrace_version(struct ttrace_version *versp)
+{
+ versp->magic = TTRACEVH_MAGIC;
+ versp->header_version = TTRACEVH_H_V1_0;
+ versp->shared_data_version = TTRACEVH_S_V1_0;
+ versp->percpu_data_version = TTRACEVH_C_V1_0;
+ versp->num_cpus = num_cpus;
+ versp->buffer_mask = TTRACE_AUX_BUFFER_MASK;
+ versp->first_offset = versp->last_offset = (ptrdiff_t) -1;
+}
+
+const uint8_t *get_ttrace_aux_buffer(void)
+{
+ return ttrace_aux_buffer;
+}
+
+void get_ttrace_aux_buffer_snapshot(ptrdiff_t *firstp, ptrdiff_t *lastp)
+{
+ rlock(&ttrace_aux_rwlock);
+ *firstp = ttrace_aux_first;
+ *lastp = ttrace_aux_last;
+ runlock(&ttrace_aux_rwlock);
+}
+
+//
+// ttrace data collection
+//
+
+// From Barret
+// An arbitrary kernel thread, running in the kernel, that might block in
+// sem_down can have a few sources:
+// 1) Process issued syscall, when it was submitted locally (trap/sysenter)
+// and before it blocks. Then owning_proc, cur_proc, owning_vcoreid,
+// kthread->sysc are accurate.
+// 2) Process issued syscall, either remotely or after it blocked and
+// restarted at least once. cur_proc and kthread->sysc are accurate.
+// 3) Process issued trap (SCP page faults on files are handled by the kernel
+// with blocking kthreads. There may be others.) cur_proc is accurate.
+// kthread->sysc should be 0. Might be fault into in cur_ctx.
+// 4) Kernel task, aka ktask. It's kernel thread not in a user's context,
+// has a name. No cur_proc or anything like that. There is kthread->name.
+// 5) Unnamed routine kernel messages. Like a ktask, there is no user
+// component. They have no names. Can use kmsg function pointer.
+//
+// For the syscalls (case 1 and 2), it is tempting to track the vcoreid, but it
+// can be wrong in case 2. I think cur_proc->pid and kthread->sysc->num will
+// work.
+//
+// Case 3 is like case 1 and 2, but there is no kthread->sysc. Right now, it's
+// probably not worth it, but you can peek in cur_ctx and extract the trap
+// number. that might be arch-dependent and nasty, so probably not worth it.
+// the only one that happens now is a PF.
+//
+// Case 4 and 5 are basically the same. 5 just has a blank name.
+// Incidentally, we can try to attach names to all RKMs. It'll require some
+// work when it comes to how kthread->name is managed. Might not be worth it,
+// just for this, unless we have a lot of unnamed RKMs gumming up the works.
+
+// TODO(gvdl): This function is a bit heavy weight for a fast path. It will be
+// better to add ttrace_ctx to per_cpu_info and set it when we change contexts
+// which should be way less frequently than computing it for every record.
+static inline void _ttrace_set_type(struct ttrace_type *tt, uint64_t type,
+ uint32_t coreid)
+{
+ const struct per_cpu_info *pcpui = &per_cpu_info[coreid];
+ const struct kthread * const kt = pcpui->cur_kthread;
+ const struct proc * const cp = pcpui->cur_proc;
+
+ tt->type_id = type & ~TTRACE_CTX_MASK;
+#if 0
+ /* TODO(gvdl) Investigate TRAP and IRQ contexts */
+ if (pcpui->__ctx_depth) {
+ tt->type_id |= TTRACE_CTX_IRQTRAP;
+ tt->ctx.ctx_depth = pcpui->__ctx_depth;
+ }
+#endif // 0
+ if (!cp) {
+ // case 4-5 Kernel task or routine KMSG
+ tt->ctx.kthread = (uintptr_t) kt;
+ tt->type_id |= (kt->name)? TTRACE_CTX_KTASK : TTRACE_CTX_RKMSG;
+ } else if (kt->sysc) { // case 1-2.
+ // TODO(gvdl) I'd like to log vcore if possible
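+		/* pid in the top 32 bits, syscall number in the bottom 32 */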
+ tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32 | kt->sysc->num;
+ tt->type_id |= TTRACE_CTX_PIDSYSC;
+ } else { // case 3
+ // TODO(gvdl) Trap id?
+ tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32;
+ tt->type_id |= TTRACE_CTX_PIDTRAP;
+ }
+}
+
+static inline struct ttrace_entry *_ttrace_next_line(uint32_t coreid)
+{
+ struct trace_ring *per_cpu_ttrace_ring = &ttrace_ring[coreid];
+	return get_trace_slot_overwrite_percpu(per_cpu_ttrace_ring);
+}
+
+static inline bool __attribute__((always_inline))
+_ttrace_on_fast_path(uint64_t type_start_stop)
+{
+ (void) type_start_stop; /* unused */
+ return true; // No slow path dynamic checks yet, will be in future.
+}
+
+uint64_t _ttrace_point(uint64_t type_start_stop, uintptr_t d0, uintptr_t d1,
+ uintptr_t d2, uintptr_t d3, uintptr_t d4)
+{
+ if (!_ttrace_on_fast_path(type_start_stop)) {
+ /* Slow path, will contain a list of more selective tests than the
+ * simple TTRACE mask test in the header. */
+ assert(false); // Unimplemented
+ return 0;
+ }
+
+ /* record tracepoint */
+ const uint32_t coreid = core_id();
+ struct ttrace_entry *cur_trace_entry = _ttrace_next_line(coreid);
+
+ /* Bottom bit of timestamp is used to indicate a continuation record */
+ const uint64_t tsc = read_tscp() & ~1ULL; // Clear bottom bit
+ cur_trace_entry->timestamp = -1ULL; // Invalidate entry
+ wmb();
+
+ _ttrace_set_type(&cur_trace_entry->t, type_start_stop, coreid);
+ cur_trace_entry->data0 = d0;
+ cur_trace_entry->data1 = d1;
+ cur_trace_entry->data2 = d2;
+ cur_trace_entry->data3 = d3;
+ cur_trace_entry->data4 = d4;
+ cur_trace_entry->timestamp = tsc; // Entry complete
+ wmb();
+
+ return tsc | 1; /* Timestamp for any required continuation */
+}
+
+void _ttrace_point_cont(uint64_t timestamp,
+ uintptr_t d0, uintptr_t d1, uintptr_t d2, uintptr_t d3,
+ uintptr_t d4, uintptr_t d5, uintptr_t d6)
+{
+ dassert(timestamp & 1);
+ const uint32_t coreid = core_id();
+ struct ttrace_entry_continuation *cur_trace_cont
+ = (struct ttrace_entry_continuation *) _ttrace_next_line(coreid);
+
+ cur_trace_cont->timestamp = -1ULL; // Invalidate entry
+ wmb();
+
+ cur_trace_cont->data0 = d0;
+ cur_trace_cont->data1 = d1;
+ cur_trace_cont->data2 = d2;
+ cur_trace_cont->data3 = d3;
+ cur_trace_cont->data4 = d4;
+ cur_trace_cont->data5 = d5;
+ cur_trace_cont->data6 = d6;
+ cur_trace_cont->timestamp = timestamp; // Entry complete
+ wmb();
+}
+
+// Auxiliary buffer handling.
+// Offsets can be greater than the buffer's size
+static inline struct ttrace_aux_entry *ttrace_aux(ptrdiff_t offset)
+{
+ offset &= TTRACE_AUX_BUFFER_MASK;
+ return (struct ttrace_aux_entry *) (&ttrace_aux_buffer[offset]);
+}
+
+// Copy header concatenated with payload data into target; the copy may wrap
+// around the end of ttrace_aux_buffer.
+static inline void ttrace_fill_aux(void *vtarget,
+				   const void *vheader, const int header_len,
+				   const void *vpaydata, const int pay_len)
+{
+	const ptrdiff_t target_offset = (uint8_t *) vtarget - ttrace_aux_buffer;
+	const uint8_t * const header = (const uint8_t *) vheader;
+	const uint8_t * const paydata = (const uint8_t *) vpaydata;
+	size_t offset = 0;
+	size_t remaining = header_len + pay_len;
+	do {
+		/* Destination pointer, wrapped back into the buffer if needed */
+		uint8_t * const dcp = (uint8_t *) ttrace_aux(target_offset + offset);
+		const ptrdiff_t buf_remaining
+			= &ttrace_aux_buffer[sizeof(ttrace_aux_buffer)] - dcp;
+		size_t len = min(remaining, (size_t) buf_remaining);
+		const uint8_t *scp = NULL;
+		if (offset < header_len) {
+			scp = &header[offset];
+			if ((header_len - offset) < len)
+				len = header_len - offset;
+		} else {
+			const int pay_offset = offset - header_len;
+			scp = &paydata[pay_offset];
+			if ((pay_len - pay_offset) < len)
+				len = pay_len - pay_offset;
+		}
+		memcpy(dcp, scp, len);
+		offset += len;
+		remaining -= len;
+	} while (remaining > 0);
+}
+
+/* The ttrace_aux_buffer is copied out to user land as a large binary blob. As
+ * _ttrace_point_string is called on performance critical paths, e.g. proc
+ * creation, it is important that writing data does not contend on a long held
+ * reader lock. Hence the complicated protocol between _ttrace_point_string
+ * and devttrace:ttdevread_aux. The goal is to minimize mutual exclusion time
+ * both between all writers and an occasional reader.
+ *
+ * aux write:
+ * wlock
+ * compute last entry
+ * compute end of new last entry
+ * advance first entry pointer past new last entry end
+ * mark entry path, (tag, coreid, len, Entry INVALID)
+ * wmb() // Is this barrier necessary before an unlock?
+ * wunlock
+ * memcpy data to aux buffer at new entry location
+ * wmb()
+ * mark entry path, (tag, coreid, len, Entry VALID)
+ *
+ * aux_read:
+ * // Minimise time between offset copies by taking page faults now
+ * touch every page in user space
+ * rlock
+ * copy last entry offset
+ * runlock
+ * memcpy auxilary buffer
+ * rlock
+ * copy first entry offset
+ * runlock
+ * On return from aux_read the copied last and first offsets are probably
+ * valid, and each entry can be checked by the client to determine if it is
+ * stamped as VALID. Note that even if an entry is INVALID its length is
+ * always valid.
+ */
+void _ttrace_point_string(uint8_t tag, uintptr_t ident, const char *str)
+{
+ int len = strnlen(str, TTRACE_AUX_ENTRY_MAXSTRLEN);
+ assert(!str[len]);
+	len += sizeof(struct ttrace_aux_entry); // Length of new record
+ const uint64_t path = TTRACE_AUX_PATH(tag, core_id(), len, 0);
+ struct ttrace_aux_entry *last_aux = NULL;
+
+ wlock(&ttrace_aux_rwlock);
+ do {
+ dassert(ttrace_aux_last == ttrace_aux_align(ttrace_aux_last));
+ dassert(ttrace_aux_first == ttrace_aux_align(ttrace_aux_first));
+
+ const ptrdiff_t old_aux_last = ttrace_aux_last;
+ struct ttrace_aux_entry *old_last_aux = ttrace_aux(old_aux_last);
+
+ // Don't care about wrapping the ttrace_aux function masks out high bits
+ ptrdiff_t new_aux_first = ttrace_aux_first;
+		ptrdiff_t new_aux_last = old_aux_last
+			+ ttrace_aux_align(TTRACE_AUX_LEN(old_last_aux->path));
+ dassert(new_aux_last == ttrace_aux_align(new_aux_last));
+
+ const ptrdiff_t next_aux_last = new_aux_last + ttrace_aux_align(len);
+ struct ttrace_aux_entry *first_aux = ttrace_aux(new_aux_first);
+ while (new_aux_first + TTRACE_AUX_BUFFER_SZ < next_aux_last) {
+ const ptrdiff_t first_len
+				= ttrace_aux_align(TTRACE_AUX_LEN(first_aux->path));
+ new_aux_first += first_len;
+ dassert(new_aux_first == ttrace_aux_align(new_aux_first));
+ first_aux = ttrace_aux(new_aux_first);
+ }
+ /* Check first for wrap, in which case wrap both first and last */
+ if (new_aux_first >= TTRACE_AUX_BUFFER_SZ) {
+ new_aux_first -= TTRACE_AUX_BUFFER_SZ;
+ new_aux_last -= TTRACE_AUX_BUFFER_SZ;
+ }
+ dassert(new_aux_first < TTRACE_AUX_BUFFER_SZ);
+ dassert(new_aux_last < 2 * TTRACE_AUX_BUFFER_SZ);
+
+ ttrace_aux_last = new_aux_last;
+ ttrace_aux_first = new_aux_first;
+ last_aux = ttrace_aux(new_aux_last);
+ atomic_set((atomic_t *) last_aux, path | 1); // mark entry as invalid
+ /* No barrier necessary before an unlock */
+ } while(false);
+ wunlock(&ttrace_aux_rwlock);
+
+	struct ttrace_aux_entry header = {
+		.path = path | 1,	// Still marked invalid until the release below
+		.timestamp = read_tscp(),
+		.ident = ident,
+	};
+	ttrace_fill_aux(last_aux, &header, sizeof(header), str,
+			len - sizeof(header));
+
+ /* The reader is doing a memcpy, not a formal read acquire. but I don't
+ * think that will be an issue. It just means that a record will not be
+ * valid this time the buffer is read. Next time the buffer is read the
+ * write will most probably be complete. But the wmb is still important
+ * here because we can't have the payload write and release write swapped.
+ * Not that that would happen anyway given we have just called a
+ * function. */
+ wmb(); // Write release, record is valid now.
+ atomic_set((atomic_t *) last_aux, path);
+}