/* TODO(gvdl): Who holds the copyright?
 * Godfrey van der Linden <gvdl@google.com>
 * See LICENSE for details.
 *
 * Timed tracing.
 *
 * This file is only included if CONFIG_TTRACE=y
 * TODO(gvdl): Documentation goes here. */

#include <ttrace.h>

#include <arch/arch.h>
#include <atomic.h>
#include <env.h>
#include <page_alloc.h>
#include <rwlock.h>
#include <smp.h>
#include <syscall.h>
#include <trace.h>
#include <trap.h>

#ifndef min
#define min(a, b) ({ \
	typeof(a) _a = (a); \
	typeof(b) _b = (b); \
	_a < _b ? _a : _b; })
#endif

// Shared buffer must be a power of 2 in size; static_assert'd in
// ttrace_init().
#define TTRACE_AUX_BUFFER_MB   ((size_t) 1)
#define TTRACE_AUX_BUFFER_SZ   (TTRACE_AUX_BUFFER_MB * 1024 * 1024)
#define TTRACE_AUX_BUFFER_MASK (TTRACE_AUX_BUFFER_SZ - 1)

// TODO(gvdl): Need to make this a config parameter
// TODO(gvdl): Implement dynamic buffer resize, virtually contig pages
#define TTRACE_PER_CPU_BUFFER_MB ((size_t) 1) // 16K ttrace entries per cpu

#define TTRACE_PER_CPU_BUFFER_SZ (TTRACE_PER_CPU_BUFFER_MB * 1024 * 1024)
#define PER_CPU_ORDER ({ \
	const uint64_t per_cpu_pages = TTRACE_PER_CPU_BUFFER_SZ / PGSIZE; \
	(size_t) LOG2_UP(per_cpu_pages); \
})
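
/* For example, assuming PGSIZE == 4096: 1 MiB per cpu / 4 KiB pages = 256
 * pages, so PER_CPU_ORDER evaluates to LOG2_UP(256) == 8, i.e. each cpu's
 * ring is backed by an order-8 (256-page) contiguous allocation. */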

// Global data
uint64_t ttrace_type_mask;

// Local data
static rwlock_t ttrace_aux_rwlock;
static ptrdiff_t ttrace_aux_first, ttrace_aux_last; // Offsets into aux buffer
static uint8_t ttrace_aux_buffer[TTRACE_AUX_BUFFER_SZ];
static struct trace_ring ttrace_ring[MAX_NUM_CPUS];

//
// TTRACE implementation
//
void ttrace_init(void)
{
	// Single tracepoint entries must be written atomically.
	static_assert(sizeof(struct ttrace_entry) == ARCH_CL_SIZE);
	static_assert(sizeof(struct ttrace_entry_continuation)
		      == sizeof(struct ttrace_entry));
	static_assert(sizeof(ttrace_type_mask) == sizeof(atomic_t));
	static_assert(IS_PWR2(TTRACE_AUX_BUFFER_MB));

	atomic_init((atomic_t *) &ttrace_type_mask, 0);
	rwinit(&ttrace_aux_rwlock);

	/* ttrace_init() is called as soon as memory is initialised so that we
	 * can profile the early boot sequence if we wish.  But that means it
	 * is called before arch_init(), and until arch_init() runs num_cpus
	 * is invalid.  So we allocate as much memory as we could ever need
	 * here and give the excess back in ttrace_cleanup(), which is called
	 * much later in the init process. */
	const size_t per_cpu_order = PER_CPU_ORDER;
	// Whole pages backing each ring; equal to TTRACE_PER_CPU_BUFFER_SZ
	// whenever the per-cpu page count is a power of 2.
	const size_t per_cpu_bytes = (1 << per_cpu_order) * PGSIZE;

	for (int i = 0; i < MAX_NUM_CPUS; i++) {
		/* TODO(gvdl): Only need virtually contiguous pages! */
		trace_ring_init(&ttrace_ring[i],
				get_cont_pages(per_cpu_order, 0),
				per_cpu_bytes, sizeof(struct ttrace_entry));
	}

	// Record all of the system calls known at compile time.
	for (size_t i = 0; i < max_syscall; i++) {
		const char * const name = syscall_table[i].name;

		if (name)
			_ttrace_point_string(TTRACEH_TAG_SYSC, i, name);
	}
}

void ttrace_cleanup(void)
{
	/* Give back unused trace_rings; assumes core ids are contiguous from
	 * 0. */
	const size_t per_cpu_order = PER_CPU_ORDER;

	for (int i = num_cpus; i < MAX_NUM_CPUS; i++) {
		free_cont_pages(ttrace_ring[i].tr_buf, per_cpu_order);
		memset(&ttrace_ring[i], 0, sizeof(ttrace_ring[i]));
	}
}

//
// devttrace interface
//
struct trace_ring *get_ttrace_ring_for_core(uint32_t coreid)
{
	return coreid < num_cpus ? &ttrace_ring[coreid] : NULL;
}

void fill_ttrace_version(struct ttrace_version *versp)
{
	versp->magic = TTRACEVH_MAGIC;
	versp->header_version = TTRACEVH_H_V1_0;
	versp->shared_data_version = TTRACEVH_S_V1_0;
	versp->percpu_data_version = TTRACEVH_C_V1_0;
	versp->num_cpus = num_cpus;
	versp->buffer_mask = TTRACE_AUX_BUFFER_MASK;
	versp->first_offset = versp->last_offset = (ptrdiff_t) -1;
}

const uint8_t *get_ttrace_aux_buffer(void)
{
	return ttrace_aux_buffer;
}

void get_ttrace_aux_buffer_snapshot(ptrdiff_t *firstp, ptrdiff_t *lastp)
{
	rlock(&ttrace_aux_rwlock);
	*firstp = ttrace_aux_first;
	*lastp = ttrace_aux_last;
	runlock(&ttrace_aux_rwlock);
}
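
/* Illustrative read-side sequence (a sketch only; the real consumer is
 * devttrace:ttdevread_aux, and the copy out to user space is elided):
 *
 *	struct ttrace_version vers;
 *	fill_ttrace_version(&vers);
 *	get_ttrace_aux_buffer_snapshot(&vers.first_offset, &vers.last_offset);
 *	const uint8_t *aux = get_ttrace_aux_buffer();
 *	// Walk entries from first_offset to last_offset, masking each offset
 *	// with vers.buffer_mask; an entry is VALID iff (path & 1) == 0.
 */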

//
// ttrace data collection
//

// From Barret:
// An arbitrary kernel thread, running in the kernel, that might block in
// sem_down can have a few sources:
//   1) Process-issued syscall, when it was submitted locally (trap/sysenter)
//      and before it blocks.  Then owning_proc, cur_proc, owning_vcoreid and
//      kthread->sysc are accurate.
//   2) Process-issued syscall, either submitted remotely or blocked and
//      restarted at least once.  cur_proc and kthread->sysc are accurate.
//   3) Process-issued trap (SCP page faults on files are handled by the
//      kernel with blocking kthreads; there may be others).  cur_proc is
//      accurate, and kthread->sysc should be 0.  The fault info might be in
//      cur_ctx.
//   4) Kernel task, aka ktask.  A kernel thread not running in a user's
//      context; it has a name.  No cur_proc or anything like that, but there
//      is kthread->name.
//   5) Unnamed routine kernel messages.  Like a ktask there is no user
//      component, but they have no names.  Can use the kmsg function pointer.
//
// For the syscalls (cases 1 and 2), it is tempting to track the vcoreid, but
// it can be wrong in case 2.  I think cur_proc->pid and kthread->sysc->num
// will work.
//
// Case 3 is like cases 1 and 2, but there is no kthread->sysc.  Right now it
// is probably not worth it, but you can peek in cur_ctx and extract the trap
// number.  That might be arch-dependent and nasty, so probably not worth it;
// the only one that happens now is a PF.
//
// Cases 4 and 5 are basically the same; 5 just has a blank name.
// Incidentally, we can try to attach names to all RKMs.  It'll require some
// work in how kthread->name is managed.  Might not be worth it just for this,
// unless we have a lot of unnamed RKMs gumming up the works.

// TODO(gvdl): This function is a bit heavyweight for a fast path.  It would
// be better to add a ttrace_ctx to per_cpu_info and set it when we change
// contexts, which should happen far less frequently than computing it for
// every record.
static inline void _ttrace_set_type(struct ttrace_type *tt, uint64_t type,
				    uint32_t coreid)
{
	const struct per_cpu_info *pcpui = &per_cpu_info[coreid];
	const struct kthread * const kt = pcpui->cur_kthread;
	const struct proc * const cp = pcpui->cur_proc;

	tt->type_id = type & ~TTRACE_CTX_MASK;
#if 0
	/* TODO(gvdl): Investigate TRAP and IRQ contexts */
	if (pcpui->__ctx_depth) {
		tt->type_id |= TTRACE_CTX_IRQTRAP;
		tt->ctx.ctx_depth = pcpui->__ctx_depth;
	}
#endif // 0
	if (!cp) {
		// Cases 4-5: kernel task or routine KMSG
		tt->ctx.kthread = (uintptr_t) kt;
		tt->type_id |= (kt->name) ? TTRACE_CTX_KTASK : TTRACE_CTX_RKMSG;
	} else if (kt->sysc) {
		// Cases 1-2
		// TODO(gvdl): I'd like to log the vcore if possible.
		tt->ctx.pid_syscall = ((uintptr_t) cp->pid << 32) | kt->sysc->num;
		tt->type_id |= TTRACE_CTX_PIDSYSC;
	} else {
		// Case 3
		// TODO(gvdl): Trap id?
		tt->ctx.pid_syscall = (uintptr_t) cp->pid << 32;
		tt->type_id |= TTRACE_CTX_PIDTRAP;
	}
}
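
/* For example, a tracepoint hit while a core runs syscall 5 on behalf of
 * pid 12 (case 1 or 2 above) is tagged TTRACE_CTX_PIDSYSC with
 * ctx.pid_syscall == ((uintptr_t) 12 << 32) | 5. */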

static inline struct ttrace_entry *_ttrace_next_line(uint32_t coreid)
{
	struct trace_ring *per_cpu_ttrace_ring = &ttrace_ring[coreid];

	return get_trace_slot_overwrite_racy(per_cpu_ttrace_ring);
}

static inline bool __attribute__((always_inline))
_ttrace_on_fast_path(uint64_t type_start_stop)
{
	(void) type_start_stop; /* unused */
	return true;  // No slow-path dynamic checks yet; they will come later.
}

uint64_t _ttrace_point(uint64_t type_start_stop, uintptr_t d0, uintptr_t d1,
		       uintptr_t d2, uintptr_t d3, uintptr_t d4)
{
	if (!_ttrace_on_fast_path(type_start_stop)) {
		/* Slow path, will contain a list of more selective tests than
		 * the simple TTRACE mask test in the header. */
		assert(false);  // Unimplemented
		return 0;
	}

	/* Record tracepoint */
	const uint32_t coreid = core_id();
	struct ttrace_entry *cur_trace_entry = _ttrace_next_line(coreid);

	/* The bottom bit of the timestamp is used to indicate a continuation
	 * record. */
	const uint64_t tsc = read_tscp() & ~1ULL; // Clear bottom bit
	cur_trace_entry->timestamp = -1ULL; // Invalidate entry
	wmb();

	_ttrace_set_type(&cur_trace_entry->t, type_start_stop, coreid);
	cur_trace_entry->data0 = d0;
	cur_trace_entry->data1 = d1;
	cur_trace_entry->data2 = d2;
	cur_trace_entry->data3 = d3;
	cur_trace_entry->data4 = d4;
	cur_trace_entry->timestamp = tsc; // Entry complete
	wmb();

	return tsc | 1;  /* Timestamp for any required continuation */
}
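
/* Illustrative usage (a sketch; TTRACE_TYPE_SAMPLE and TTRACE_START are
 * hypothetical names, not taken from ttrace.h):
 *
 *	uint64_t ts = _ttrace_point(TTRACE_TYPE_SAMPLE | TTRACE_START,
 *				    d0, d1, d2, d3, d4);
 *	// ts has its bottom bit set, marking the next line a continuation.
 *	_ttrace_point_cont(ts, d5, d6, d7, d8, d9, d10, d11);
 */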

void _ttrace_point_cont(uint64_t timestamp,
			uintptr_t d0, uintptr_t d1, uintptr_t d2, uintptr_t d3,
			uintptr_t d4, uintptr_t d5, uintptr_t d6)
{
	dassert(timestamp & 1);
	const uint32_t coreid = core_id();
	struct ttrace_entry_continuation *cur_trace_cont
		= (struct ttrace_entry_continuation *) _ttrace_next_line(coreid);

	cur_trace_cont->timestamp = -1ULL; // Invalidate entry
	wmb();

	cur_trace_cont->data0 = d0;
	cur_trace_cont->data1 = d1;
	cur_trace_cont->data2 = d2;
	cur_trace_cont->data3 = d3;
	cur_trace_cont->data4 = d4;
	cur_trace_cont->data5 = d5;
	cur_trace_cont->data6 = d6;
	cur_trace_cont->timestamp = timestamp; // Entry complete
	wmb();
}

// Auxiliary buffer handling.
// Offsets can be greater than the buffer's size.
static inline struct ttrace_aux_entry *ttrace_aux(ptrdiff_t offset)
{
	offset &= TTRACE_AUX_BUFFER_MASK;
	return (struct ttrace_aux_entry *) &ttrace_aux_buffer[offset];
}
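
/* For example, with the 1 MiB buffer TTRACE_AUX_BUFFER_MASK == 0xfffff, so
 * ttrace_aux(0x100008) == ttrace_aux(0x8): offsets one whole buffer apart
 * alias the same entry, which is why first/last offsets may exceed the
 * buffer's size until they are explicitly wrapped. */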

// Copy header concatenated with payload data into target; the copy may wrap
// around the end of ttrace_aux_buffer.
static inline void ttrace_fill_aux(void *vtarget,
				   const void *vheader, const int header_len,
				   const void *vpaydata, const int pay_len)
{
	uint8_t * const target = (uint8_t *) vtarget;
	const uint8_t * const header = (const uint8_t *) vheader;
	const uint8_t * const paydata = (const uint8_t *) vpaydata;
	const uint8_t * const buf_end
		= &ttrace_aux_buffer[sizeof(ttrace_aux_buffer)];
	size_t offset = 0;
	size_t remaining = header_len + pay_len;

	do {
		uint8_t *dcp = &target[offset];

		if (dcp >= buf_end)
			dcp -= sizeof(ttrace_aux_buffer); // Wrap to buffer start
		// Copy at most to the end of the buffer in one pass.
		size_t len = min(remaining, (size_t) (buf_end - dcp));
		const uint8_t *scp;
		if (offset < (size_t) header_len) {
			scp = &header[offset];
			if (header_len - offset < len)
				len = header_len - offset;
		} else {
			const size_t pay_offset = offset - header_len;
			scp = &paydata[pay_offset];
			if (pay_len - pay_offset < len)
				len = pay_len - pay_offset;
		}
		memcpy(dcp, scp, len);
		offset += len;
		remaining -= len;
	} while (remaining > 0);
}
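
/* For example, if target sits 8 bytes before the end of ttrace_aux_buffer and
 * header_len == 24, the first memcpy writes 8 header bytes up to the buffer's
 * end, then the loop wraps dcp to ttrace_aux_buffer[0] and copies the
 * remaining 16 header bytes followed by the payload. */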

/* The ttrace_aux_buffer is copied out to user land as one large binary blob.
 * Because _ttrace_point_string is called at performance-critical times, e.g.
 * proc creation, it is important that writing data does not contend on a
 * long-held reader lock.  Hence the complicated protocol between
 * _ttrace_point_string and devttrace:ttdevread_aux.  The goal is to minimise
 * mutual-exclusion time, both between writers and with the occasional reader.
 *
 * aux write:
 *   wlock
 *     compute last entry
 *     compute end of new last entry
 *     advance first entry pointer past new last entry's end
 *     mark entry path (tag, coreid, len, entry INVALID)
 *     wmb()  // Is this barrier necessary before an unlock?
 *   wunlock
 *   memcpy data to aux buffer at new entry location
 *   wmb()
 *   mark entry path (tag, coreid, len, entry VALID)
 *
 * aux read:
 *   // Minimise time between offset copies by taking page faults now
 *   touch every page in user space
 *   rlock
 *     copy last entry offset
 *   runlock
 *   memcpy auxiliary buffer
 *   rlock
 *     copy first entry offset
 *   runlock
 *
 * On return from aux read the copied last and first entries are probably
 * valid, and each entry can be checked by the client to determine whether it
 * is stamped as VALID.  Note that even if an entry is INVALID its length is
 * always ok. */
void _ttrace_point_string(uint8_t tag, uintptr_t ident, const char *str)
{
	int len = strnlen(str, TTRACE_AUX_ENTRY_MAXSTRLEN);
	assert(!str[len]);
	len += sizeof(struct ttrace_aux_entry);  // Length of new record
	const uint64_t path = TTRACE_AUX_PATH(tag, core_id(), len, 0);
	struct ttrace_aux_entry *last_aux = NULL;

	wlock(&ttrace_aux_rwlock);
	do {
		dassert(ttrace_aux_last == ttrace_aux_align(ttrace_aux_last));
		dassert(ttrace_aux_first == ttrace_aux_align(ttrace_aux_first));

		const ptrdiff_t old_aux_last = ttrace_aux_last;
		struct ttrace_aux_entry *old_last_aux = ttrace_aux(old_aux_last);

		// Wrapping is fine; ttrace_aux() masks out the high bits.
		ptrdiff_t new_aux_first = ttrace_aux_first;
		ptrdiff_t new_aux_last = old_aux_last
			+ ttrace_aux_align(TTRACE_AUX_LEN(old_last_aux->tag_len));
		dassert(new_aux_last == ttrace_aux_align(new_aux_last));

		const ptrdiff_t next_aux_last = new_aux_last + ttrace_aux_align(len);
		struct ttrace_aux_entry *first_aux = ttrace_aux(new_aux_first);
		while (new_aux_first + TTRACE_AUX_BUFFER_SZ < next_aux_last) {
			const ptrdiff_t first_len
				= ttrace_aux_align(TTRACE_AUX_LEN(first_aux->tag_len));
			new_aux_first += first_len;
			dassert(new_aux_first == ttrace_aux_align(new_aux_first));
			first_aux = ttrace_aux(new_aux_first);
		}
		/* Check first for wrap, in which case wrap both first and last */
		if (new_aux_first >= TTRACE_AUX_BUFFER_SZ) {
			new_aux_first -= TTRACE_AUX_BUFFER_SZ;
			new_aux_last -= TTRACE_AUX_BUFFER_SZ;
		}
		dassert(new_aux_first < TTRACE_AUX_BUFFER_SZ);
		dassert(new_aux_last < 2 * TTRACE_AUX_BUFFER_SZ);

		ttrace_aux_last = new_aux_last;
		ttrace_aux_first = new_aux_first;
		last_aux = ttrace_aux(new_aux_last);
		atomic_set((atomic_t *) last_aux, path | 1);  // Mark entry as invalid
		/* No barrier necessary before an unlock */
	} while (false);
	wunlock(&ttrace_aux_rwlock);

	struct ttrace_aux_entry header = {
		.path = path | 1,
		.timestamp = read_tscp(),
		.ident = ident,
	};
	ttrace_fill_aux(last_aux, &header, sizeof(header),
			str, len - sizeof(header));

	/* The reader does a memcpy, not a formal read-acquire, but I don't
	 * think that will be an issue.  It just means that a record may not
	 * be valid the first time the buffer is read; by the next read the
	 * write will most probably be complete.  The wmb is still important
	 * here because we can't have the payload write and the release write
	 * reordered, not that that would happen anyway given we have just
	 * called a function. */
	wmb(); // Write release, record is valid now.
	atomic_set((atomic_t *) last_aux, path);
}