/*
* This file is part of the UCB release of Plan 9. It is subject to the license
* terms in the LICENSE file found in the top-level directory of this
* distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
* part of the UCB release of Plan 9, including this file, may be copied,
* modified, propagated, or distributed except according to the terms contained
* in the LICENSE file.
*/
/*
* This file implements the #T device and was based upon the UCB Plan 9 kprof.c
*/
#include <assert.h>
#include <atomic.h>
#include <kmalloc.h>
#include <ns.h>
#include <smallidpool.h>
#include <smp.h>
#include <stdio.h>
#include <string.h>
#include <trace.h>
#include <ttrace.h>
#include <umem.h>
#include <ros/fs.h>
/*
* ttrace macros and constant data
*/
#ifndef min
#define min(a, b) ({ \
typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
#endif
#define TTRACE_CTL_LEN 46
#define TTRACE_MAX_TSID min(MAX_U16_POOL_SZ, (1 << Logtsid))
#define TTRACE_NUM_OPENERS 8
#define TT_SAFE_GENBUF_SZ (GENBUF_SZ-1) // Leave room for newline
/* TODO(gvdl): I don't get plan 9's permissions. Why do directories get group
 * rx permissions, and what's with the DMDIR? Some devices use it and others
 * don't. In theory the DMDIR is copied over by a higher layer, but I have no
 * idea why two copies seem necessary. */
#define TTPERMDIR (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|DMDIR)
#define TTPERMRWFILE (S_IRUSR|S_IWUSR)
#define TTPERMROFILE (S_IRUSR)
enum {
Ttdevdirqid = 0,
Ttdevbase,
Ttdevctlqid = Ttdevbase, // 1
Ttdevauxqid, // 2
Ttdevcpudataqid, // 3
Logtype = 4, // Enough for 16 unique qid types
Masktype = (1 << Logtype) - 1,
Shifttype = 0,
Logcpu = 12, // Up to 4096 CPUs can be time traced
Maskcpu = (1 << Logcpu) - 1,
Shiftcpu = Shifttype + Logtype,
/* ttrace timestamp id, used by data file readers */
Logtsid = 12 + 3, // 4096 CPUs x 8 simultaneous opens
Masktsid = (1 << Logtsid) - 1,
Shifttsid = Shiftcpu + Logcpu,
};
#define TTYPE(x) ( (int) ((uint32_t)(x).path) & Masktype )
#define TTCPU(q) ( ((q).path >> Shiftcpu) & Maskcpu )
#define TTCPUQID(c, t) ( ((c) << Shiftcpu) | (t))
#define TTTSID(q) ( ((q).path >> Shifttsid) & Masktsid )
#define TTTSIDQID(q, i) ( ((i) << Shifttsid) | (q).path )
/*
* ttrace timestamp pool and accessor
*/
static uintptr_t *ttdevtimestamp; // array of open file timestamps
static struct u16_pool *ttdevtspool; // pool of timestamp indices
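/* Each writable open of a cpu data file owns a tsid, an index into
 * ttdevtimestamp. tsid 0 is reserved at init as the read-only/NULL sentinel
 * with a fixed timestamp of 1; a slot holding (uintptr_t) -1 is free. */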
static inline int get_tsid(void) {
return (ttdevtspool)? get_u16(ttdevtspool) : -1;
}
static inline void put_tsid(int tsid) {
dassert(tsid >= 1 && ttdevtspool);
put_u16(ttdevtspool, tsid);
}
/*
* ttrace device gen implementation
*
* #T directory layout
* [-1] {".", {Ttdevdirqid, 0, QTDIR}, 0, TTPERMDIR},
* [0..ncpu-1] {"cpunnn", {Ttdevcpudataqid|coreid}, 0, TTPERMRWFILE},
* [ncpu] {"ctl", {Ttdevctlqid}, TTRACE_CTL_LEN, TTPERMROFILE},
* [ncpu+1] {"aux", {Ttdevauxqid}, 0, TTPERMRWFILE},
*/
/* Generate qids for the top level directory */
static inline int ttdev1gen(const struct chan *c, int s, struct qid *qp)
{
int ret = 1;
int path = -1;
/* Must only be called to decode top level dir channel */
dassert(TTYPE(c->qid) == Ttdevdirqid);
if (s < num_cpus) // "cpunnn" data files
path = TTCPUQID(s, Ttdevcpudataqid);
else {
switch (s - num_cpus) {
case 0: path = Ttdevctlqid; break; // "ctl"
case 1: path = Ttdevauxqid; break; // "aux"
default: return -1;
}
}
dassert(path > 0);
mkqid(qp, path, 0, QTFILE);
return ret;
}
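/* Devgen callback used by devwalk, devstat and devdirread: maps the directory
 * index 's' to a child qid (via ttdev1gen) and fills in the dir entry. */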
static int ttdevgen(struct chan *c, char *unused_name,
struct dirtab *unused_tab, int ntab, int s, struct dir *dp)
{
/* Always return the top ttrace dir for '..'. DEVDOTDOT is negative, so
 * handle it before asserting that s is a valid directory index. */
if (s == DEVDOTDOT) {
static const struct qid topqid = {Ttdevdirqid, 0, QTDIR};
devdir(c, topqid, "#T", 0, eve, TTPERMDIR, dp);
return 1;
}
dassert(s >= 0);
struct qid q = c->qid;
if (Ttdevdirqid == TTYPE(q) && ttdev1gen(c, s, &q) < 0)
return -1;
const char *name = NULL;
long perm = TTPERMRWFILE;
switch (TTYPE(q)) {
case Ttdevctlqid: name = "ctl"; break;
case Ttdevauxqid: name = "aux"; perm = TTPERMROFILE; break;
case Ttdevcpudataqid:
snprintf(get_cur_genbuf(), GENBUF_SZ, "cpu%03d", (int) TTCPU(q));
name = get_cur_genbuf();
break;
default:
panic("devttrace: Where did bad qid come from?\n");
case Ttdevdirqid:
panic("devttrace: What happened to ttdev1gen decode?\n");
}
dassert(name);
devdir(c, q, (char *) name, 0, eve, perm, dp);
return 1;
}
/*
* ttrace read implementation
*/
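/* Copy len bytes of buf to the reader's buffer at offset, via memcpy_to_user
 * when called in process context. Raises Eshort if the copy would pass the
 * end of the n-byte destination buffer. */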
static size_t ttdevcopyout(char *va, long n, size_t offset,
const char *buf, long len)
{
if (offset + len > n)
error(Eshort);
if (!current)
memcpy(&va[offset], buf, len);
else if (ESUCCESS != memcpy_to_user(current, &va[offset], buf, len)) {
/* UMEM */
// TODO(gvdl): No p9 equivalent to EFAULT, determine causes of failure.
error(Enovmem);
}
return len;
}
/* Context for trace_ring_foreach call of ttdevread_cpu_entry() */
#define TTRACE_ENTRY_QUADS (sizeof(struct ttrace_entry) / sizeof(uint64_t))
/* #quads * (whitespace + len(hex(quad))) */
#define CTXT_GENBUF_SZ (TTRACE_ENTRY_QUADS * (1 + 2 * sizeof(uint64_t)))
struct ttdevread_cpu_ctxt {
int64_t c;
uintptr_t min_timestamp;
char *va;
long n;
char genbuf[CTXT_GENBUF_SZ];
};
static inline int ttdevhexdigit(uint8_t x)
{
return "0123456789abcdef"[x];
}
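/* trace_ring_foreach callback: snapshots one ring entry, skips partial or
 * too-old entries, formats the rest as space-separated 16-digit hex quads
 * terminated by a newline, and copies the line out to the reader. */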
static void ttdevread_cpu_entry(void *ventry, void *vctxt)
{
struct ttdevread_cpu_ctxt *ctxt = (struct ttdevread_cpu_ctxt *) vctxt;
/* A cache line aligned copy of the input entry should make partial entries
 * less likely. Still, a partial entry is bracketed with timestamp == -1 */
uint8_t buf[2 * sizeof(struct ttrace_entry)]; // 128 byte buffer
const uintptr_t size_mask = sizeof(struct ttrace_entry) - 1;
struct ttrace_entry* entry = (struct ttrace_entry *)
(((uintptr_t) buf + size_mask) & ~size_mask); // align to cache line
*entry = *((struct ttrace_entry *) ventry); // Grab the entry
/* If time stamp == -1 (i.e. entry is a partial) or is less than
* the minimum then ignore this entry */
if (!(entry->timestamp + 1) || entry->timestamp < ctxt->min_timestamp)
return;
uint64_t *sqp = (uint64_t *) entry;
char *dcp = ctxt->genbuf;
for (int i = 0; i < TTRACE_ENTRY_QUADS; i++) {
uint64_t quad = sqp[i];
/* Emit all 16 hex digits of the quad, most significant nibble first,
 * so each quad takes 17 bytes including the trailing space; this is
 * what CTXT_GENBUF_SZ and the dassert below expect. */
for (int j = 0; j < 16; j++) {
*dcp++ = ttdevhexdigit((quad >> 60) & 0xf);
quad <<= 4;
}
*dcp++ = ' ';
}
dassert(&ctxt->genbuf[sizeof(ctxt->genbuf)] == dcp);
dcp[-1] = '\n'; // Replace trailing space with a newline
ctxt->c += ttdevcopyout(ctxt->va, ctxt->n, ctxt->c,
ctxt->genbuf, sizeof(ctxt->genbuf));
}
/* ttdevread_mintimestamp takes the open file's timestamp id and the I/O's
 * offset and returns the minimum timestamp last requested in a write. When
 * the channel has been opened read-only we complete the offset == 0 request
 * and return end of file for all subsequent (offset > 0) requests; this
 * allows cat to return one page of data. */
static inline uintptr_t ttdevread_mintimestamp(const int tsid, int64_t offset)
{
/* The ttdevread_cpu code cannot deal sensibly with nonzero offsets without
 * making the code much more complicated; probably not worth it. */
if (offset)
return 0;
const uintptr_t min_timestamp = ttdevtimestamp[tsid];
if (min_timestamp > read_tscp()) {
// no point in trying to read the future.
error(Ebadarg);
}
return min_timestamp;
}
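/* Dump a core's trace ring to the reader, one formatted entry per line.
 * Entries older than the open file's minimum timestamp are skipped. */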
static inline long ttdevread_cpu(const int tsid,
int coreid, void *va, long n, int64_t offset)
{
ERRSTACK(1);
const uintptr_t min_timestamp = ttdevread_mintimestamp(tsid, offset);
if (!min_timestamp)
return 0;
struct ttdevread_cpu_ctxt *ctxt = kzalloc(sizeof(*ctxt), KMALLOC_WAIT);
if (!ctxt)
error(Enomem);
else if (waserror()) {
kfree(ctxt);
nexterror();
}
ctxt->min_timestamp = min_timestamp;
ctxt->va = va;
ctxt->n = n;
struct trace_ring * const ring = get_ttrace_ring_for_core(coreid);
trace_ring_foreach(ring, &ttdevread_cpu_entry, ctxt);
const long count = ctxt->c; // Save the count before freeing the context
kfree(ctxt);
poperror();
return count;
}
static inline long ttdevread_ctl(void *va, long n, int64_t offset)
{
/* Read the ttrace_type_mask and render it as a 'setmask' ctl command:
 *
 * cmd ttrace_bits bit mask
 * setmask 0x0123456789abcdef 0x0123456789abcdef\n
 *
 * 45 characters plus the newline, TTRACE_CTL_LEN == 46 bytes in all.
 */
char * const buffer = get_cur_genbuf();
static_assert(TTRACE_CTL_LEN <= GENBUF_SZ);
int c = snprintf(buffer, GENBUF_SZ, "setmask 0x%016llx 0x%016llx\n",
ttrace_type_mask & TTRACE_TYPE_MASK, TTRACE_TYPE_MASK);
dassert(TTRACE_CTL_LEN == c);
return readstr(offset, va, n, buffer);
}
/* This code will be more efficient if the user data is page aligned, but
 * should work no matter which alignment the user gives us.
 * Output:
 * Page 0: struct ttrace_version
 * Page 1-n: Auxiliary buffer.
 */
static inline long ttdevread_aux(uint8_t *va, long n, int64_t offset)
{
ptrdiff_t dummy_offset;
struct ttrace_version vers;
fill_ttrace_version(&vers);
const long buffer_length = vers.buffer_mask + 1;
if (offset)
return 0; // Only allow single reads at offset 0, all others are empty
else if (n < PGSIZE + buffer_length)
error(Etoosmall);
size_t c = PGSIZE; // Advance count to second page
/* Implements reader side of the auxiliary buffer protocol, see
* _ttrace_point_string comment in ttrace.c
*
* Touch memory to get any page faults out of the way now, hopefully we
* will not be under paging pressure. Note that I'm accumulating into the
* vers.last_offset so that the compiler doesn't throw out the memory touch
* loop, the vers.last_offset is reset when we take a buffer snapshot.
*
* TODO(gvdl): formalise memory pinning for later I/O.
*/
vers.last_offset = 0;
size_t t;
for (t = 0; t < n; t += PGSIZE)
vers.last_offset += atomic_read((atomic_t *) &va[t]);
get_ttrace_aux_buffer_snapshot(&dummy_offset, &vers.last_offset);
const uint8_t * const aux_buffer = get_ttrace_aux_buffer();
c += ttdevcopyout((char *) va, n, c, (const char *) aux_buffer,
buffer_length);
get_ttrace_aux_buffer_snapshot(&vers.first_offset, &dummy_offset);
/* Output version with buffer offsets last */
ttdevcopyout((char *) va, n, 0, (const char *) &vers, sizeof(vers));
return c;
}
/*
* ttrace write utility routines and macros
*/
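/* strtoul wrapper that raises Ebadarg when no digits were consumed. */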
static uint64_t parseul(const char * const num_str, int base)
{
char *end_num = NULL;
uint64_t ret = strtoul(num_str, &end_num, base);
if (num_str == end_num)
error(Ebadarg);
return ret;
}
/*
* ttrace devtab entry points
*/
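/* Allocate the timestamp array and id pool, sized for TTRACE_NUM_OPENERS
 * simultaneous opens per cpu, and reserve tsid 0 as the read-only sentinel. */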
static void ttdevinit(void)
{
static_assert(MAX_NUM_CPUS <= Maskcpu); // Assert encoding is still good
/* Support up to 8 simultaneous opens on the ttrace/cpunnn files. */
const int pool_size
= min(TTRACE_MAX_TSID, TTRACE_NUM_OPENERS * num_cpus);
/* Test for too many cpus for our tsid mechanism, re-implement. */
dassert(num_cpus <= pool_size);
if (num_cpus > pool_size) {
printk("Insufficient ids for ttrace timestamp pool");
return;
}
const size_t ts_size = pool_size * sizeof(*ttdevtimestamp);
ttdevtimestamp = kmalloc(ts_size, KMALLOC_WAIT);
memset(ttdevtimestamp, 0xff, ts_size);
ttdevtspool = create_u16_pool(pool_size);
/* Always allocate 0 as a unused/NULL sentinel */
int tsidnull = get_tsid();
assert(!tsidnull);
ttdevtimestamp[tsidnull] = 1; // tsid[0] is set to timestamp of 1.
}
#define KFREE_AND_NULL(x) do { kfree(x); x = NULL; } while(false)
static void ttdevshutdown(void)
{
KFREE_AND_NULL(ttdevtspool);
KFREE_AND_NULL(ttdevtimestamp);
}
static struct chan *ttdevattach(char *spec)
{
return devattach('T', spec);
}
static struct walkqid *ttdevwalk(struct chan *c, struct chan *nc,
char **name, int nname)
{
return devwalk(c, nc, name, nname, NULL, 0, ttdevgen);
}
static int ttdevstat(struct chan *c, uint8_t *db, int n)
{
int ret = devstat(c, db, n, NULL, 0, ttdevgen);
return ret;
}
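/* Opening a cpu data file O_RDWR allocates a tsid and records it in the qid
 * path so later reads and writes can find their timestamp slot; read-only
 * opens share the sentinel tsid 0 and its fixed timestamp of 1. */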
static struct chan *ttdevopen(struct chan *c, int omode)
{
const int o = openmode(omode);
int tsid = TTTSID(c->qid);
switch(TTYPE(c->qid)) {
default:
assert(false); // How did a bad chan get to us?
case Ttdevdirqid:
dassert(c->qid.type & QTDIR);
if (o != OREAD)
error(Eperm);
break;
case Ttdevcpudataqid:
if (tsid)
break; // Already allocated, reopen
if (o == O_RDWR) {
tsid = get_tsid();
if (tsid < 0)
error(Enoenv);
else
dassert(tsid);
ttdevtimestamp[tsid] = 1;
c->qid.path = TTTSIDQID(c->qid, tsid); // Record tsid
} else if (o == O_RDONLY) {
// Nothing to do for O_RDONLY
} else
error(Eperm);
break;
case Ttdevauxqid:
case Ttdevctlqid:
break;
}
c->mode = o;
c->flag |= COPEN;
c->offset = 0;
return c;
}
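/* Closing a cpu data file marks its timestamp slot free (-1) and returns the
 * tsid to the pool. */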
static void ttdevclose(struct chan *c)
{
if (!(c->flag & COPEN))
return;
const int tsid = TTTSID(c->qid);
switch (TTYPE(c->qid)) {
case Ttdevcpudataqid:
/* Release timestamp */
if (tsid) {
ttdevtimestamp[tsid] = -1;
put_tsid(tsid);
}
break;
case Ttdevauxqid:
case Ttdevctlqid:
case Ttdevdirqid:
break;
default:
assert(false);
}
}
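/* Dispatch reads by qid type: directory listings, the current ctl mask, the
 * auxiliary string buffer, or a per-core trace dump. */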
static long ttdevread(struct chan *c, void *va, long n, int64_t offset)
{
const unsigned tsid = TTTSID(c->qid);
assert(tsid < ttdevtspool->size);
switch (TTYPE(c->qid)) {
case Ttdevdirqid: // ttrace directory read
return devdirread(c, va, n, NULL, 0, ttdevgen);
case Ttdevctlqid:
return ttdevread_ctl(va, n, offset);
case Ttdevauxqid:
return ttdevread_aux(va, n, offset);
case Ttdevcpudataqid:
return ttdevread_cpu(tsid, TTCPU(c->qid), va, n, offset);
}
return 0; // Not us
}
#define CONST_STRNCMP(vp, conststr) strncmp((vp), conststr, sizeof(conststr)-1)
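/* Writes accept textual commands: "setmask <value> <mask>", "setbits <value>"
 * and "clrbits <mask>" on ctl, and "settimestamp <minimum timestamp>" on a
 * cpu data file. For example (assuming #T is bound into the namespace):
 *	echo 'settimestamp 12345' > '#T/cpu000'
 */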
static long ttdevwrite(struct chan *c, void *a, long n, int64_t unused_off)
{
ERRSTACK(1);
struct cmdbuf *cb;
static const char ctlstring[]
= "setmask <value> <mask>|setbits <value>|clrbits <mask>";
static const char tsstring[] = "settimestamp <minimum timestamp>";
cb = parsecmd(a, n);
if (waserror()) {
kfree(cb);
nexterror();
}
const unsigned tsid = TTTSID(c->qid);
assert(tsid < ttdevtspool->size);
uint64_t mask = TTRACE_TYPE_MASK;
uint64_t value = 0;
switch(TTYPE(c->qid)) {
default:
error(Ebadusefd);
case Ttdevctlqid:
if (cb->nf == 3 && !CONST_STRNCMP(cb->f[0], "setmask")) {
value = parseul(cb->f[1], 0);
mask &= parseul(cb->f[2], 0);
} else if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "setbits"))
value = parseul(cb->f[1], 0);
else if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "clrbits"))
mask &= parseul(cb->f[1], 0);
else
error((char *) ctlstring);
/* Thread safe, but... let's face it, if we have competing controllers
 * setting and clearing mask bits then the behaviour is going to be
 * unexpected. Perhaps we could enforce exclusive open of the ctl
 * channel. */
{
uint64_t cur_mask, new_mask;
do {
cur_mask = atomic_read((atomic_t *) &ttrace_type_mask);
new_mask = (cur_mask & ~mask) | (value & mask);
} while (!atomic_cas((atomic_t *) &ttrace_type_mask,
cur_mask, new_mask));
}
break;
case Ttdevcpudataqid:
if (cb->nf == 2 && !CONST_STRNCMP(cb->f[0], "settimestamp")) {
if (!tsid)
error(Ebadfd);
const unsigned long ts = parseul(cb->f[1], /* base */ 0);
if (ts > read_tscp()) // Reject timestamps from the future
error(Ebadarg);
ttdevtimestamp[tsid] = ts;
} else
error((char *) tsstring);
break;
}
kfree(cb);
poperror();
return n;
}
struct dev ttdevdevtab __devtab = {
'T',
"ttrace",
devreset,
ttdevinit,
ttdevshutdown,
ttdevattach,
ttdevwalk,
ttdevstat,
ttdevopen,
devcreate,
ttdevclose,
ttdevread,
devbread,
ttdevwrite,
devbwrite,
devremove,
devwstat,
};