/* TODO(gvdl): Who holds the copyright?
 * Godfrey van der Linden <gvdl@google.com>
 * See LICENSE for details.
 *
 * Timed tracing.
 *
 * This file is only included if CONFIG_TTRACE=y
 * TODO(gvdl): Documentation goes here. */
#include <ttrace.h>
#include <arch/arch.h>
#include <atomic.h>
#include <env.h>
#include <page_alloc.h>
#include <rwlock.h>
#include <smp.h>
#include <syscall.h>
#include <trace.h>
#include <trap.h>
#ifndef min
#define min(a, b) ({ \
	typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
#endif
// Shared buffer must be a power of 2, static_asserted in ttrace_init.
#define TTRACE_AUX_BUFFER_MB ((size_t) 1)
#define TTRACE_AUX_BUFFER_SZ (TTRACE_AUX_BUFFER_MB * 1024 * 1024)
#define TTRACE_AUX_BUFFER_MASK (TTRACE_AUX_BUFFER_SZ - 1)
// TODO(gvdl): Need to make this a config parameter
// TODO(gvdl): Implement dynamic buffer resize, virtually contig pages
#define TTRACE_PER_CPU_BUFFER_MB ((size_t) 1) // 16K ttrace entries per cpu
#define TTRACE_PER_CPU_BUFFER_SZ (TTRACE_PER_CPU_BUFFER_MB * 1024 * 1024)
#define PER_CPU_ORDER ({ \
	const uint64_t per_cpu_pages = TTRACE_PER_CPU_BUFFER_SZ / PGSIZE; \
	(size_t) LOG2_UP(per_cpu_pages); \
})
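/* Worked example of the sizing above (a sketch, assuming PGSIZE == 4096 and
 * 64-byte cache-line entries, as asserted in ttrace_init): 1 MiB per cpu is
 * 1048576 / 4096 == 256 pages, so PER_CPU_ORDER evaluates to
 * LOG2_UP(256) == 8, and each per-cpu ring holds 1048576 / 64 == 16384
 * tracepoint entries, matching the "16K ttrace entries per cpu" note. */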
// Global data
uint64_t ttrace_type_mask;
// Local data
static rwlock_t ttrace_aux_rwlock;
static ptrdiff_t ttrace_aux_first, ttrace_aux_last; // Offsets into aux
static uint8_t ttrace_aux_buffer[TTRACE_AUX_BUFFER_SZ];
static struct trace_ring ttrace_ring[MAX_NUM_CPUS];
//
// TTRACE implementation
//
void ttrace_init()
{
	// Single tracepoint entries should be atomically written.
	static_assert(sizeof(struct ttrace_entry) == ARCH_CL_SIZE);
	static_assert(sizeof(struct ttrace_entry_continuation)
		      == sizeof(struct ttrace_entry));
	static_assert(sizeof(ttrace_type_mask) == sizeof(atomic_t));
	static_assert(IS_PWR2(TTRACE_AUX_BUFFER_MB));

	atomic_init((atomic_t *) &ttrace_type_mask, 0);
	rwinit(&ttrace_aux_rwlock);

	/* ttrace_init() is called as soon as memory is initialised so that we
	 * can profile the early boot sequence if we wish.  But that means it
	 * is called before arch_init(), and until arch_init() runs num_cpus
	 * is invalid.  So we allocate as much memory as we'll ever need here
	 * and give the memory back in ttrace_cleanup(), which is called much
	 * later in the init process. */
	const size_t per_cpu_order = PER_CPU_ORDER;
	const size_t per_cpu_bytes = (1 << per_cpu_order) * PGSIZE;

	for (int i = 0; i < MAX_NUM_CPUS; i++) {
		/* TODO(gvdl): Only need virtually contiguous pages! */
		trace_ring_init(&ttrace_ring[i],
				get_cont_pages(per_cpu_order, 0),
				TTRACE_PER_CPU_BUFFER_SZ,
				sizeof(struct ttrace_entry));
	}

	// Quickly record all of the system calls, known at compile time
	for (size_t i = 0; i < max_syscall; i++) {
		const char * const name = syscall_table[i].name;

		if (name)
			_ttrace_point_string(TTRACEH_TAG_SYSC, i, name);
	}
}
void ttrace_cleanup()
{
	/* Give back unused trace_rings; assumes core_id is contiguous from 0 */
	const size_t per_cpu_order = PER_CPU_ORDER;

	for (int i = num_cpus; i < MAX_NUM_CPUS; i++) {
		free_cont_pages(ttrace_ring[i].tr_buf, per_cpu_order);
		memset(&ttrace_ring[i], 0, sizeof(ttrace_ring[i]));
	}
}
//
// devttrace interface
//
struct trace_ring *get_ttrace_ring_for_core(uint32_t coreid)
{
	return coreid < num_cpus ? &ttrace_ring[coreid] : NULL;
}
void fill_ttrace_version(struct ttrace_version *versp)
{
	versp->magic = TTRACEVH_MAGIC;
	versp->header_version = TTRACEVH_H_V1_0;
	versp->shared_data_version = TTRACEVH_S_V1_0;
	versp->percpu_data_version = TTRACEVH_C_V1_0;
	versp->num_cpus = num_cpus;
	versp->buffer_mask = TTRACE_AUX_BUFFER_MASK;
	versp->first_offset = versp->last_offset = (ptrdiff_t) -1;
}
const uint8_t *get_ttrace_aux_buffer(void)
{
	return ttrace_aux_buffer;
}
void get_ttrace_aux_buffer_snapshot(ptrdiff_t *firstp, ptrdiff_t *lastp)
{
	rlock(&ttrace_aux_rwlock);
	*firstp = ttrace_aux_first;
	*lastp = ttrace_aux_last;
	runlock(&ttrace_aux_rwlock);
}
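/* A minimal sketch (not part of the driver, name is hypothetical) of how a
 * consumer such as devttrace might combine the two calls above: fill the
 * static version fields, then overwrite the -1 placeholders with a locked
 * snapshot.  Assumes first_offset/last_offset are ptrdiff_t, as their
 * initialisation above suggests. */
static inline void ttrace_version_snapshot_sketch(struct ttrace_version *versp)
{
	fill_ttrace_version(versp);
	/* Replaces the (ptrdiff_t) -1 placeholders with real offsets */
	get_ttrace_aux_buffer_snapshot(&versp->first_offset,
				       &versp->last_offset);
}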
//
// ttrace data collection
//
// From Barret:
// An arbitrary kernel thread, running in the kernel, that might block in
// sem_down can have a few sources:
// 1) Process-issued syscall, when it was submitted locally (trap/sysenter)
//    and before it blocks.  Then owning_proc, cur_proc, owning_vcoreid and
//    kthread->sysc are accurate.
// 2) Process-issued syscall, either submitted remotely or blocked and
//    restarted at least once.  cur_proc and kthread->sysc are accurate.
// 3) Process-issued trap (SCP page faults on files are handled by the kernel
//    with blocking kthreads; there may be others).  cur_proc is accurate.
//    kthread->sysc should be 0.  The fault might be visible in cur_ctx.
// 4) Kernel task, aka ktask.  It's a kernel thread not in a user's context,
//    and it has a name.  No cur_proc or anything like that; there is
//    kthread->name.
// 5) Unnamed routine kernel messages.  Like a ktask, there is no user
//    component, but they have no names.  Can use the kmsg function pointer.
//
// For the syscalls (cases 1 and 2), it is tempting to track the vcoreid, but
// it can be wrong in case 2.  I think cur_proc->pid and kthread->sysc->num
// will work.
//
// Case 3 is like cases 1 and 2, but there is no kthread->sysc.  Right now
// it's probably not worth it, but you can peek in cur_ctx and extract the
// trap number.  That might be arch-dependent and nasty, so probably not
// worth it; the only one that happens now is a PF.
//
// Cases 4 and 5 are basically the same; 5 just has a blank name.
// Incidentally, we can try to attach names to all RKMs.  It'll require some
// work when it comes to how kthread->name is managed.  Might not be worth it
// just for this, unless we have a lot of unnamed RKMs gumming up the works.
//
// (A decoding sketch for these cases follows _ttrace_set_type below.)
//
// TODO(gvdl): This function is a bit heavyweight for a fast path.  It would
// be better to add a ttrace_ctx to per_cpu_info and set it when we change
// contexts, which should be far less frequent than computing it for every
// record.
static inline void _ttrace_set_type(struct ttrace_type *tt, uint64_t type,
				    uint32_t coreid)
{
	const struct per_cpu_info *pcpui = &per_cpu_info[coreid];
	const struct kthread * const kt = pcpui->cur_kthread;
	const struct proc * const cp = pcpui->cur_proc;

	tt->type_id = type & ~TTRACE_CTX_MASK;
#if 0
	/* TODO(gvdl) Investigate TRAP and IRQ contexts */
	if (pcpui->__ctx_depth) {
		tt->type_id |= TTRACE_CTX_IRQTRAP;
		tt->ctx.ctx_depth = pcpui->__ctx_depth;
	}
#endif // 0
	if (!cp) {
		// cases 4-5: kernel task or routine KMSG
		tt->ctx.kthread = (uintptr_t) kt;
		tt->type_id |= (kt->name) ? TTRACE_CTX_KTASK : TTRACE_CTX_RKMSG;
	} else if (kt->sysc) {	// cases 1-2
		// TODO(gvdl) I'd like to log vcore if possible
		tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32 | kt->sysc->num;
		tt->type_id |= TTRACE_CTX_PIDSYSC;
	} else {		// case 3
		// TODO(gvdl) Trap id?
		tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32;
		tt->type_id |= TTRACE_CTX_PIDTRAP;
	}
}
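/* A hedged sketch (illustration only, not used by the kernel) of how a
 * consumer might decode the context stored by _ttrace_set_type back into the
 * five cases described above.  It uses only the constants and fields
 * referenced in this file; a real decoder would live in userspace. */
static inline void ttrace_print_ctx_sketch(const struct ttrace_type *tt)
{
	switch (tt->type_id & TTRACE_CTX_MASK) {
	case TTRACE_CTX_KTASK:		// case 4: named kernel task
	case TTRACE_CTX_RKMSG:		// case 5: unnamed routine kernel msg
		printk("kthread %p\n", (void *) tt->ctx.kthread);
		break;
	case TTRACE_CTX_PIDSYSC:	// cases 1-2: process-issued syscall
		printk("pid %d syscall %d\n",
		       (int) (tt->ctx.pid_syscall >> 32),
		       (int) (tt->ctx.pid_syscall & 0xffffffff));
		break;
	case TTRACE_CTX_PIDTRAP:	// case 3: process-issued trap
		printk("pid %d trap\n", (int) (tt->ctx.pid_syscall >> 32));
		break;
	}
}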
static inline struct ttrace_entry *_ttrace_next_line(uint32_t coreid)
{
	struct trace_ring *per_cpu_ttrace_ring = &ttrace_ring[coreid];

	return get_trace_slot_overwrite_racy(per_cpu_ttrace_ring);
}

static inline bool __attribute__((always_inline))
_ttrace_on_fast_path(uint64_t type_start_stop)
{
	(void) type_start_stop; /* unused */
	return true; // No slow path dynamic checks yet, will be in future.
}
uint64_t _ttrace_point(uint64_t type_start_stop, uintptr_t d0, uintptr_t d1,
		       uintptr_t d2, uintptr_t d3, uintptr_t d4)
{
	if (!_ttrace_on_fast_path(type_start_stop)) {
		/* Slow path, will contain a list of more selective tests than
		 * the simple TTRACE mask test in the header. */
		assert(false); // Unimplemented
		return 0;
	}

	/* record tracepoint */
	const uint32_t coreid = core_id();
	struct ttrace_entry *cur_trace_entry = _ttrace_next_line(coreid);
	/* Bottom bit of timestamp is used to indicate a continuation record */
	const uint64_t tsc = read_tscp() & ~1ULL; // Clear bottom bit

	cur_trace_entry->timestamp = -1ULL; // Invalidate entry
	wmb();
	_ttrace_set_type(&cur_trace_entry->t, type_start_stop, coreid);
	cur_trace_entry->data0 = d0;
	cur_trace_entry->data1 = d1;
	cur_trace_entry->data2 = d2;
	cur_trace_entry->data3 = d3;
	cur_trace_entry->data4 = d4;
	wmb(); // Data must be visible before the completing timestamp
	cur_trace_entry->timestamp = tsc; // Entry complete

	return tsc | 1; /* Timestamp for any required continuation */
}
void _ttrace_point_cont(uint64_t timestamp,
			uintptr_t d0, uintptr_t d1, uintptr_t d2, uintptr_t d3,
			uintptr_t d4, uintptr_t d5, uintptr_t d6)
{
	dassert(timestamp & 1);
	const uint32_t coreid = core_id();
	struct ttrace_entry_continuation *cur_trace_cont
		= (struct ttrace_entry_continuation *) _ttrace_next_line(coreid);

	cur_trace_cont->timestamp = -1ULL; // Invalidate entry
	wmb();
	cur_trace_cont->data0 = d0;
	cur_trace_cont->data1 = d1;
	cur_trace_cont->data2 = d2;
	cur_trace_cont->data3 = d3;
	cur_trace_cont->data4 = d4;
	cur_trace_cont->data5 = d5;
	cur_trace_cont->data6 = d6;
	wmb(); // Data must be visible before the completing timestamp
	cur_trace_cont->timestamp = timestamp; // Entry complete
}
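/* A usage sketch (hypothetical call site, illustration only): real callers
 * presumably go through the wrappers in ttrace.h, but this shows how the
 * token returned by _ttrace_point, with its bottom bit set, ties a
 * continuation record to its parent entry when more than five data words are
 * needed.  The type value 0 is a placeholder, not a real tracepoint type. */
static inline void ttrace_twelve_word_sketch(const uintptr_t d[12])
{
	const uint64_t cont_ts = _ttrace_point(/* placeholder type */ 0,
					       d[0], d[1], d[2], d[3], d[4]);
	/* The remaining seven words land in a continuation entry that shares
	 * the (odd) timestamp returned above. */
	_ttrace_point_cont(cont_ts, d[5], d[6], d[7], d[8], d[9], d[10], d[11]);
}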
// Auxiliary buffer handling.
// Offsets can be greater than the buffer's size.
static inline struct ttrace_aux_entry *ttrace_aux(ptrdiff_t offset)
{
	offset &= TTRACE_AUX_BUFFER_MASK;
	return (struct ttrace_aux_entry *) &ttrace_aux_buffer[offset];
}
// Copy the header concatenated with the payload data into the target entry,
// wrapping around the end of ttrace_aux_buffer if necessary.
static inline void ttrace_fill_aux(void *vtarget,
				   const void *vheader, const int header_len,
				   const void *vpaydata, const int pay_len)
{
	const uint8_t * const header = (const uint8_t *) vheader;
	const uint8_t * const paydata = (const uint8_t *) vpaydata;
	/* Destination offset within ttrace_aux_buffer; wraps via the mask. */
	size_t dst = (uint8_t *) vtarget - ttrace_aux_buffer;
	size_t offset = 0;
	size_t remaining = header_len + pay_len;

	do {
		dst &= TTRACE_AUX_BUFFER_MASK;
		/* Bytes left before the physical end of the buffer */
		const size_t buf_remaining = TTRACE_AUX_BUFFER_SZ - dst;
		size_t len = min(remaining, buf_remaining);
		const uint8_t *scp = NULL;

		if (offset < (size_t) header_len) {
			scp = &header[offset];
			if (header_len - offset < len)
				len = header_len - offset;
		} else {
			const size_t pay_offset = offset - header_len;
			scp = &paydata[pay_offset];
			if (pay_len - pay_offset < len)
				len = pay_len - pay_offset;
		}
		memcpy(&ttrace_aux_buffer[dst], scp, len);
		dst += len;
		offset += len;
		remaining -= len;
	} while (remaining > 0);
}
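/* Worked example of the wrap (numbers assume the 1 MiB buffer above): a
 * 48-byte record whose target starts at offset 0xfffe0 has
 * 0x100000 - 0xfffe0 == 32 bytes before the physical end of the buffer, so
 * 32 bytes (possibly split at the header/payload boundary) are written there
 * and the remaining 16 bytes continue at offset 0. */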
/* The ttrace_aux_buffer is copied out to userland as one large binary blob.
 * Because _ttrace_point_string is called at performance-critical times, e.g.
 * proc creation, it is important that writing data does not contend on a
 * long-held reader lock.  Hence the protocol below between
 * _ttrace_point_string and devttrace:ttdevread_aux.  The goal is to minimise
 * mutual-exclusion time, both among writers and between writers and the
 * occasional reader.
 *
 * aux write:
 *	wlock
 *	compute last entry
 *	compute end of new last entry
 *	advance first entry pointer past new last entry end
 *	mark entry path (tag, coreid, len, entry INVALID)
 *	wmb()	// Is this barrier necessary before an unlock?
 *	wunlock
 *	memcpy data to aux buffer at new entry location
 *	wmb()
 *	mark entry path (tag, coreid, len, entry VALID)
 *
 * aux_read:
 *	// Minimise time between offset copies by taking page faults now
 *	touch every page in user space
 *	rlock
 *	copy last entry offset
 *	runlock
 *	memcpy auxiliary buffer
 *	rlock
 *	copy first entry offset
 *	runlock
 *
 * On return from aux_read the copied last and first entries are probably
 * valid, and each entry can be checked by the client to determine whether it
 * is stamped as VALID.  Note that even if an entry is INVALID its length is
 * always correct.  (A reader-side sketch appears after _ttrace_point_string
 * at the end of this file.)
 */
void _ttrace_point_string(uint8_t tag, uintptr_t ident, const char *str)
{
	const int str_len = strnlen(str, TTRACE_AUX_ENTRY_MAXSTRLEN);

	assert(!str[str_len]);
	// Length of the new record, header plus string payload
	const int len = str_len + sizeof(struct ttrace_aux_entry);
	const uint64_t path = TTRACE_AUX_PATH(tag, core_id(), len, 0);
	struct ttrace_aux_entry *last_aux = NULL;

	wlock(&ttrace_aux_rwlock);
	do {
		dassert(ttrace_aux_last == ttrace_aux_align(ttrace_aux_last));
		dassert(ttrace_aux_first == ttrace_aux_align(ttrace_aux_first));
		const ptrdiff_t old_aux_last = ttrace_aux_last;
		struct ttrace_aux_entry *old_last_aux = ttrace_aux(old_aux_last);
		// Don't care about wrapping; the ttrace_aux function masks out
		// the high bits.
		ptrdiff_t new_aux_first = ttrace_aux_first;
		ptrdiff_t new_aux_last
			= old_aux_last + ttrace_aux_align(old_last_aux->len & ~1);
		dassert(new_aux_last == ttrace_aux_align(new_aux_last));
		const ptrdiff_t next_aux_last = new_aux_last + ttrace_aux_align(len);
		struct ttrace_aux_entry *first_aux = ttrace_aux(new_aux_first);

		// Evict old entries until the new record fits.
		while (new_aux_first + TTRACE_AUX_BUFFER_SZ < next_aux_last) {
			const ptrdiff_t first_len
				= ttrace_aux_align(TTRACE_AUX_LEN(first_aux->tag_len));
			new_aux_first += first_len;
			dassert(new_aux_first == ttrace_aux_align(new_aux_first));
			first_aux = ttrace_aux(new_aux_first);
		}
		/* Check first for wrap, in which case wrap both first and last */
		if (new_aux_first >= TTRACE_AUX_BUFFER_SZ) {
			new_aux_first -= TTRACE_AUX_BUFFER_SZ;
			new_aux_last -= TTRACE_AUX_BUFFER_SZ;
		}
		dassert(new_aux_first < TTRACE_AUX_BUFFER_SZ);
		dassert(new_aux_last < 2 * TTRACE_AUX_BUFFER_SZ);

		ttrace_aux_last = new_aux_last;
		ttrace_aux_first = new_aux_first;
		last_aux = ttrace_aux(new_aux_last);
		atomic_set((atomic_t *) last_aux, path | 1); // Mark entry invalid
		/* No barrier necessary before an unlock */
	} while (false);
	wunlock(&ttrace_aux_rwlock);

	struct ttrace_aux_entry header = {
		.path = path | 1,	// Still invalid until the final store
		.timestamp = read_tscp(),
		.ident = ident,
	};
	ttrace_fill_aux(last_aux, &header, sizeof(header), str, str_len);
	/* The reader is doing a memcpy, not a formal read-acquire, but I don't
	 * think that will be an issue.  It just means that a record may not be
	 * valid the first time the buffer is read; the next time the buffer is
	 * read the write will most probably be complete.  The wmb() is still
	 * important here because we can't have the payload write and the
	 * release write swapped.  Not that that would happen anyway, given we
	 * have just called a function. */
	wmb(); // Write release, record is valid now.
	atomic_set((atomic_t *) last_aux, path);
}
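/* A reader-side sketch (illustration only; the real consumer is
 * ttdevread_aux in devttrace and may differ) of the aux_read protocol
 * documented above _ttrace_point_string: snapshot the last offset, copy the
 * buffer, snapshot the first offset, and leave VALID-bit checking to the
 * client.  dst is assumed to be a staging buffer of at least
 * TTRACE_AUX_BUFFER_SZ bytes that has already been touched/faulted in. */
static inline void ttrace_aux_read_sketch(uint8_t *dst,
					  ptrdiff_t *firstp, ptrdiff_t *lastp)
{
	rlock(&ttrace_aux_rwlock);
	*lastp = ttrace_aux_last;
	runlock(&ttrace_aux_rwlock);

	memcpy(dst, ttrace_aux_buffer, TTRACE_AUX_BUFFER_SZ);

	rlock(&ttrace_aux_rwlock);
	*firstp = ttrace_aux_first;
	runlock(&ttrace_aux_rwlock);
	/* Entries in [*firstp, *lastp] may still be marked INVALID; their
	 * lengths are correct either way, so the client can skip them. */
}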