/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * See LICENSE for details.
 *
 * This controls the emitting, collecting, and exporting of samples for perf
 * events.  Examples of events are PMU counter overflows, mmaps, and process
 * creation.
 *
 * Events are collected in a central qio queue.  High-frequency events (e.g.
 * IRQ backtraces()) are collected in per-core buffers, which are flushed to the
 * central queue when they fill up or on command.  Lower-frequency events (e.g.
 * profiler_notify_mmap()) just go straight to the central queue.
 *
 * Currently there is one global profiler.  Kprof is careful to only have one
 * open profiler at a time.  We assert that this is true.  TODO: stop using the
 * global profiler!
 *
 * A few other notes:
 * - profiler_control_trace() controls the per-core trace collection.  When it
 *   is disabled, it also flushes the per-core blocks to the central queue.
 * - The collection of mmap and comm samples is independent of trace collection.
 *   Those will occur whenever the profiler is open (refcnt check, for now). */

#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <kmalloc.h>
#include <pmap.h>
#include <kref.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"

/* Size, in bytes, of the on-stack buffer that profiler_notify_mmap() uses to
 * hold the absolute path of a mapped file. */
#define PROFILER_MAX_PRG_PATH	256

/* Worst-case number of bytes needed to variable-byte encode a value of type
 * @t: one output byte per 7 bits of input, rounded up (10 for uint64_t). */
#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

/* Per-core profiler state; one entry per core lives in profiler_percpu_ctx.
 *
 * Do not rely on the contents of the PCPU ctx with IRQs enabled. */
struct profiler_cpu_context {
	/* Block currently being filled with trace records on this core, or
	 * NULL if none has been allocated yet / it was just flushed. */
	struct block *block;
	/* Index of the core this context belongs to; copied into each trace
	 * record. */
	int cpu;
	/* Non-zero while trace collection is enabled on this core (see
	 * profiler_core_trace_enable()). */
	int tracing;
	/* Number of full blocks that qpass() refused to take into the central
	 * queue. */
	size_t dropped_data_cnt;
};

/* Limit handed to qopen() for the central queue; set via "prof_qlimit". */
static int profiler_queue_limit = 64 * 1024 * 1024;
/* Size requested for each per-core block; set via "prof_cpubufsz". */
static size_t profiler_cpu_buffer_size = 65536;
/* Serializes profiler_setup() against profiler_release(). */
static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
/* Reference count of the (single, global) profiler; the emit paths only run
 * while they can take a reference on it. */
static struct kref profiler_kref;
/* Array of num_cores per-core contexts, or NULL when the profiler is closed. */
static struct profiler_cpu_context *profiler_percpu_ctx;
/* Central queue that all records end up in, or NULL when the profiler is
 * closed. */
static struct queue *profiler_queue;

/* Returns the per-core context slot for core @cpu.  No bounds or NULL checks
 * are made; the result is only meaningful while profiler_percpu_ctx is set. */
static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
{
	return &profiler_percpu_ctx[cpu];
}

/* Variable-byte encodes @n into @data, least-significant 7-bit group first.
 * Every byte but the last has bit 7 set; the last byte has it clear, marking
 * the end of the sequence.  At most VBE_MAX_SIZE(uint64_t) bytes are written.
 *
 * Returns a pointer to the byte just past the encoded value. */
static inline char *vb_encode_uint64(char *data, uint64_t n)
{
	while (n > 0x7f) {
		/* More groups follow: emit the low 7 bits with the
		 * continuation bit set. */
		*data = (char) ((n & 0x7f) | 0x80);
		data++;
		n >>= 7;
	}
	/* Final group: fits in 7 bits, so bit 7 is clear. */
	*data = (char) n;

	return data + 1;
}

/* Hands the (possibly NULL) full block @b to the central queue and returns a
 * freshly allocated replacement of profiler_cpu_buffer_size bytes, or whatever
 * block_alloc() returns on failure.
 *
 * A negative return from qpass() is counted in @cpu_buf->dropped_data_cnt.
 * Per the original note: qpass will drop b if the queue is over its limit;
 * we're willing to lose traces, but we won't lose 'control' events, such as
 * MMAP and PID. */
static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
                                           struct block *b)
{
	if (b && qpass(profiler_queue, b) < 0)
		cpu_buf->dropped_data_cnt++;

	return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}

/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
 * enough room in the pcpu block for a write of @size bytes.  If the current
 * block is missing or too small, it is passed to the central queue and a new
 * one is allocated.
 *
 * On success, stores the block in @pb and returns the position to write at.
 * Returns NULL (leaving @pb untouched) if no block could be allocated, or if
 * @size does not fit even in a freshly allocated block; the caller must then
 * skip the record.
 *
 * IRQs must be disabled before calling, until after write_commit. */
static char *profiler_cpu_buffer_write_reserve(
	struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
	struct block *b = cpu_buf->block;

	if (unlikely((!b) || (size_t) (b->lim - b->wp) < size)) {
		cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
		if (unlikely(!b))
			return NULL;
		/* The new block is sized by profiler_cpu_buffer_size, not by
		 * @size.  Refuse oversized reservations instead of letting the
		 * caller write past b->lim.  The empty block stays installed
		 * for later, smaller records. */
		if (unlikely((size_t) (b->lim - b->wp) < size))
			return NULL;
	}
	*pb = b;

	return (char *) b->wp;
}

/* Helper, paired with write_reserve.  Publishes @size bytes that the caller
 * has written at the reserved position by advancing the write pointer of @b.
 * @cpu_buf is not used.  IRQs must be disabled until after this is called. */
static inline void profiler_cpu_buffer_write_commit(
	struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
	b->wp = b->wp + size;
}

/* Upper bound on the bytes taken by a record's envelope: two variable-byte
 * encoded uint64_t values (the record type and the record size). */
static inline size_t profiler_max_envelope_size(void)
{
	const size_t per_field = VBE_MAX_SIZE(uint64_t);

	return per_field + per_field;
}

/* Appends a PROFTYPE_KERN_TRACE64 record holding @count program counters from
 * @trace to this core's block.  The record is silently skipped if no room can
 * be reserved.
 *
 * The pid is -1 when the current kthread is a ktask or there is no current
 * process.  Must be called with IRQs disabled. */
static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                                         const uintptr_t *trace, size_t count,
                                         uint64_t info)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
	const size_t rec_sz = sizeof(struct proftype_kern_trace64) +
		count * sizeof(uint64_t);
	struct proftype_kern_trace64 *record;
	struct block *blk;
	char *start, *cursor;
	size_t idx = 0;

	assert(!irq_is_enabled());
	start = profiler_cpu_buffer_write_reserve(
	    cpu_buf, rec_sz + profiler_max_envelope_size(), &blk);
	if (unlikely(!start))
		return;

	/* Envelope: record type, then payload size. */
	cursor = vb_encode_uint64(start, PROFTYPE_KERN_TRACE64);
	cursor = vb_encode_uint64(cursor, rec_sz);

	/* Payload sits right after the envelope. */
	record = (struct proftype_kern_trace64 *) cursor;
	cursor += rec_sz;

	record->info = info;
	record->tstamp = nsec();
	if (!is_ktask(pcpui->cur_kthread) && pcpui->cur_proc)
		record->pid = pcpui->cur_proc->pid;
	else
		record->pid = -1;
	record->cpu = cpu_buf->cpu;
	record->num_traces = count;
	while (idx < count) {
		record->trace[idx] = (uint64_t) trace[idx];
		idx++;
	}

	profiler_cpu_buffer_write_commit(cpu_buf, blk, cursor - start);
}

/* Appends a PROFTYPE_USER_TRACE64 record for process @p, holding @count
 * program counters from @trace, to this core's block.  The record is silently
 * skipped if no room can be reserved.  Must be called with IRQs disabled. */
static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                       struct proc *p, const uintptr_t *trace,
                                       size_t count, uint64_t info)
{
	const size_t rec_sz = sizeof(struct proftype_user_trace64) +
		count * sizeof(uint64_t);
	struct proftype_user_trace64 *record;
	struct block *blk;
	char *start, *cursor;
	size_t idx = 0;

	assert(!irq_is_enabled());
	start = profiler_cpu_buffer_write_reserve(
	    cpu_buf, rec_sz + profiler_max_envelope_size(), &blk);
	if (unlikely(!start))
		return;

	/* Envelope: record type, then payload size. */
	cursor = vb_encode_uint64(start, PROFTYPE_USER_TRACE64);
	cursor = vb_encode_uint64(cursor, rec_sz);

	/* Payload sits right after the envelope. */
	record = (struct proftype_user_trace64 *) cursor;
	cursor += rec_sz;

	record->info = info;
	record->tstamp = nsec();
	record->pid = p->pid;
	record->cpu = cpu_buf->cpu;
	record->num_traces = count;
	while (idx < count) {
		record->trace[idx] = (uint64_t) trace[idx];
		idx++;
	}

	profiler_cpu_buffer_write_commit(cpu_buf, blk, cursor - start);
}

/* Builds a PROFTYPE_PID_MMAP64 record (envelope + fixed fields + the
 * NUL-terminated @path) in a temporary buffer and writes it straight to the
 * central queue with qiwrite().  Does nothing if the temporary buffer cannot
 * be allocated; the qiwrite() result is not checked. */
static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
                                   size_t offset, const char *path)
{
	const size_t path_sz = strlen(path) + 1;
	const size_t rec_sz = sizeof(struct proftype_pid_mmap64) + path_sz;
	struct proftype_pid_mmap64 *record;
	char *buf, *cursor;

	buf = kmalloc(rec_sz + profiler_max_envelope_size(), 0);
	if (unlikely(!buf))
		return;

	/* Envelope: record type, then payload size. */
	cursor = vb_encode_uint64(buf, PROFTYPE_PID_MMAP64);
	cursor = vb_encode_uint64(cursor, rec_sz);

	record = (struct proftype_pid_mmap64 *) cursor;
	cursor += rec_sz;

	record->tstamp = nsec();
	record->pid = p->pid;
	record->addr = addr;
	record->size = msize;
	record->offset = offset;
	memcpy(record->path, path, path_sz);

	qiwrite(profiler_queue, buf, (int) (cursor - buf));

	kfree(buf);
}

/* Builds a PROFTYPE_NEW_PROCESS record (envelope + timestamp + pid + the
 * NUL-terminated p->binary_path) in a temporary buffer and writes it straight
 * to the central queue with qiwrite().  Does nothing if the temporary buffer
 * cannot be allocated.  @p->binary_path must be non-NULL. */
static void profiler_push_new_process(struct proc *p)
{
	const size_t path_sz = strlen(p->binary_path) + 1;
	const size_t rec_sz = sizeof(struct proftype_new_process) + path_sz;
	struct proftype_new_process *record;
	char *buf, *cursor;

	buf = kmalloc(rec_sz + profiler_max_envelope_size(), 0);
	if (unlikely(!buf))
		return;

	/* Envelope: record type, then payload size. */
	cursor = vb_encode_uint64(buf, PROFTYPE_NEW_PROCESS);
	cursor = vb_encode_uint64(cursor, rec_sz);

	record = (struct proftype_new_process *) cursor;
	cursor += rec_sz;

	record->tstamp = nsec();
	record->pid = p->pid;
	memcpy(record->path, p->binary_path, path_sz);

	qiwrite(profiler_queue, buf, (int) (cursor - buf));

	kfree(buf);
}

/* enumerate_vmrs() callback: reports one VM region of the process passed in
 * @opaque through profiler_notify_mmap(). */
static void profiler_emit_vmr_mmap(struct vm_region *vmr, void *opaque)
{
	struct proc *owner = (struct proc *) opaque;

	profiler_notify_mmap(owner, vmr->vm_base, vmr->vm_end - vmr->vm_base,
	                     vmr->vm_prot, vmr->vm_flags, vmr->__vm_foc,
	                     vmr->vm_foff);
}

/* Emits a new-process notification, followed by an mmap notification for
 * every VM region, for each process returned by proc_get_set().  The process
 * set is released on both the normal and the error path; errors are
 * rethrown. */
static void profiler_emit_current_system_status(void)
{
	ERRSTACK(1);
	struct process_set pset;
	size_t idx;

	proc_get_set(&pset);
	if (waserror()) {
		proc_free_set(&pset);
		nexterror();
	}

	for (idx = 0; idx < pset.num_processes; idx++) {
		struct proc *proc = pset.procs[idx];

		profiler_notify_new_process(proc);
		enumerate_vmrs(proc, profiler_emit_vmr_mmap, proc);
	}

	poperror();
	proc_free_set(&pset);
}

/* Releases the per-core context array and, if present, the central queue, and
 * clears both globals.  Safe to call when either is already NULL-ed by an
 * earlier call.
 *
 * NOTE(review): any block still attached to a per-core context is not
 * released here - confirm callers always flush first. */
static void free_cpu_buffers(void)
{
	kfree(profiler_percpu_ctx);
	profiler_percpu_ctx = NULL;

	if (!profiler_queue)
		return;
	qfree(profiler_queue);
	profiler_queue = NULL;
}

/* Opens the central queue and allocates the zeroed per-core context array,
 * recording each core's index in its context.
 *
 * Throws ENOMEM if the queue cannot be opened.  If an error is thrown after
 * the queue is open, everything allocated so far is freed before rethrowing. */
static void alloc_cpu_buffers(void)
{
	ERRSTACK(1);

	/* It is very important that we enqueue and dequeue entire records at
	 * once.  If we leave partial records, the entire stream will be
	 * corrupt.  Our reader does its best to make sure it has room for
	 * complete records (checks qlen()).
	 *
	 * If we ever get corrupt streams, try making this a Qmsg.  Though it
	 * doesn't help every situation - we have issues with writes greater
	 * than Maxatomic regardless. */
	profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
	if (!profiler_queue)
		error(ENOMEM, ERROR_FIXME);
	if (waserror()) {
		free_cpu_buffers();
		nexterror();
	}

	profiler_percpu_ctx =
	    kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);

	for (int i = 0; i < num_cores; i++) {
		struct profiler_cpu_context *b = &profiler_percpu_ctx[i];

		b->cpu = i;
	}

	/* Pair the waserror() above.  Without this, the error stack keeps
	 * pointing at this function's errbuf after we return, and a later
	 * error() in the caller would jump into a dead stack frame. */
	poperror();
}

static long profiler_get_checked_value(const char *value, long k, long minval,
                                       long maxval)
{
	long lvalue = strtol(value, NULL, 0) * k;

	if (lvalue < minval)
		error(EFAIL, "Value should be greater than %ld", minval);
	if (lvalue > maxval)
		error(EFAIL, "Value should be lower than %ld", maxval);

	return lvalue;
}

/* Handles the profiler's configuration commands from a parsed command buffer.
 *
 *   prof_qlimit KB    - central queue limit; refused while the profiler has
 *                       references.  Range: 1 MiB to max_pmem / 32 bytes.
 *   prof_cpubufsz KB  - per-core block size.  Range: 16 KiB to 1 MiB.
 *
 * Returns 1 if cb->f[0] was one of the commands above, 0 otherwise.  Throws
 * EFAIL on a missing argument, an out-of-range value, or (prof_qlimit only) a
 * running profiler.
 *
 * NOTE(review): the prof_qlimit result is narrowed from long to int - confirm
 * max_pmem / 32 always fits. */
int profiler_configure(struct cmdbuf *cb)
{
	const char *cmd = cb->f[0];
	long val;

	if (strcmp(cmd, "prof_qlimit") == 0) {
		if (cb->nf < 2)
			error(EFAIL, "prof_qlimit KB");
		if (kref_refcnt(&profiler_kref) > 0)
			error(EFAIL, "Profiler already running");
		val = profiler_get_checked_value(cb->f[1], 1024, 1024 * 1024,
		                                 max_pmem / 32);
		profiler_queue_limit = (int) val;
		return 1;
	}

	if (strcmp(cmd, "prof_cpubufsz") == 0) {
		if (cb->nf < 2)
			error(EFAIL, "prof_cpubufsz KB");
		val = profiler_get_checked_value(cb->f[1], 1024, 16 * 1024,
		                                 1024 * 1024);
		profiler_cpu_buffer_size = (size_t) val;
		return 1;
	}

	return 0;
}

/* Appends "|<command>" for every profiler configuration command to the
 * NUL-terminated string in @msgbuf, whose total capacity is @buflen bytes
 * (passed through to strlcat()). */
void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
	const char * const cmds[] = {
		"prof_qlimit",
		"prof_cpubufsz",
	};
	size_t idx = 0;

	while (idx < ARRAY_SIZE(cmds)) {
		strlcat(msgbuf, "|", buflen);
		strlcat(msgbuf, cmds[idx], buflen);
		idx++;
	}
}

/* Release callback registered on profiler_kref by profiler_init().  Frees the
 * per-core contexts and the central queue under profiler_mtx, unless the
 * profiler has gained a reference again in the meantime. */
static void profiler_release(struct kref *kref)
{
	bool got_reference = FALSE;

	assert(kref == &profiler_kref);
	qlock(&profiler_mtx);
	/* Make sure we did not race with profiler_setup(), that got the
	 * profiler_mtx lock just before us, and re-initialized the profiler
	 * for a new user.
	 * If we race here from another profiler_release() (user did a
	 * profiler_setup() immediately followed by a profiler_cleanup()) we are
	 * fine because free_cpu_buffers() can be called multiple times.
	 */
	if (!kref_get_not_zero(kref, 1))
		free_cpu_buffers();
	else
		got_reference = TRUE;
	qunlock(&profiler_mtx);
	/* We cannot call kref_put() while holding profiler_mtx, as such a call
	 * might trigger another call to profiler_release(), which takes the
	 * same lock.
	 */
	if (got_reference)
		kref_put(kref);
}

/* One-time initialization of the global profiler refcount: asserts it is
 * still zero, then initializes it with profiler_release() as the release
 * callback and an initial count of 0. */
void profiler_init(void)
{
	struct kref *ref = &profiler_kref;

	assert(kref_refcnt(ref) == 0);
	kref_init(ref, profiler_release, 0);
}

/* Opens the global profiler: under profiler_mtx, allocates the central queue
 * and per-core contexts, takes the profiler reference, and then emits the
 * current system status (existing processes and their mappings).
 *
 * Asserts that no profiler queue exists yet.  Errors thrown while the lock is
 * held are rethrown after unlocking.
 *
 * NOTE(review): if profiler_emit_current_system_status() throws, the reference
 * taken by __kref_get() is not dropped here - confirm the caller handles
 * that. */
void profiler_setup(void)
{
	ERRSTACK(1);

	qlock(&profiler_mtx);
	if (waserror()) {
		qunlock(&profiler_mtx);
		nexterror();
	}
	assert(!profiler_queue);
	alloc_cpu_buffers();

	/* Do this only when everything is initialized (as last init operation).
	 * The emit paths take a reference with kref_get_not_zero() before
	 * touching the buffers, so they stay inert until this point.
	 */
	__kref_get(&profiler_kref, 1);

	/* Must come after the reference is taken: the notify helpers it uses
	 * do nothing while the refcount is zero. */
	profiler_emit_current_system_status();

	poperror();
	qunlock(&profiler_mtx);
}

/* Drops one reference on the global profiler.  The buffers are freed by the
 * profiler_release() callback registered in profiler_init(). */
void profiler_cleanup(void)
{
	kref_put(&profiler_kref);
}

/* Moves @cpu_buf's current block, if any, to the central queue with
 * qibwrite() and detaches it from the context.  Does nothing if there is no
 * block or no central queue.  Runs with IRQs disabled; the previous IRQ state
 * is restored on return. */
static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
{
	int8_t irq_state = 0;

	disable_irqsave(&irq_state);
	if (!cpu_buf->block || !profiler_queue)
		goto out;

	qibwrite(profiler_queue, cpu_buf->block);
	cpu_buf->block = NULL;
out:
	enable_irqsave(&irq_state);
}

/* Per-core callback for profiler_control_trace().  Enables tracing on the
 * calling core when @opaque is non-NULL; otherwise disables it and flushes the
 * core's block to the central queue. */
static void profiler_core_trace_enable(void *opaque)
{
	struct profiler_cpu_context *ctx = profiler_get_cpu_ctx(core_id());
	int enable = (opaque != NULL);

	ctx->tracing = enable;
	if (enable)
		return;
	profiler_cpu_flush(ctx);
}

static void profiler_control_trace(int onoff)
{
	struct core_set cset;

	error_assert(EINVAL, profiler_percpu_ctx);

	core_set_init(&cset);
	core_set_fill_available(&cset);
	smp_do_in_cores(&cset, profiler_core_trace_enable,
	                (void *) (uintptr_t) onoff);
}

/* Starts trace collection: enables tracing on all available cores, then calls
 * qreopen() on the central queue (presumably undoing the qhangup() done by
 * profiler_stop() - confirm against qio).  Asserts the profiler queue
 * exists. */
void profiler_start(void)
{
	assert(profiler_queue);
	profiler_control_trace(1);
	qreopen(profiler_queue);
}

/* Stops trace collection: disables tracing on all available cores (which also
 * flushes each core's block to the central queue), then hangs up the queue.
 * Asserts the profiler queue exists. */
void profiler_stop(void)
{
	assert(profiler_queue);
	/* Flush first, so the per-core data is queued before the hangup. */
	profiler_control_trace(0);
	qhangup(profiler_queue, 0);
}

/* Per-core callback for profiler_trace_data_flush(): flushes the calling
 * core's block to the central queue.  @opaque is unused. */
static void profiler_core_flush(void *opaque)
{
	profiler_cpu_flush(profiler_get_cpu_ctx(core_id()));
}

void profiler_trace_data_flush(void)
{
	struct core_set cset;

	error_assert(EINVAL, profiler_percpu_ctx);

	core_set_init(&cset);
	core_set_fill_available(&cset);
	smp_do_in_cores(&cset, profiler_core_flush, NULL);
}

/* Records a kernel backtrace of @nr_pcs program counters from @pc_list on the
 * calling core.  Nothing is recorded unless a profiler reference can be
 * taken, the per-core contexts exist, and tracing is enabled on this core.
 * The reference is held only for the duration of the call. */
void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                    uint64_t info)
{
	struct profiler_cpu_context *ctx;

	if (!kref_get_not_zero(&profiler_kref, 1))
		return;

	if (profiler_percpu_ctx) {
		ctx = profiler_get_cpu_ctx(core_id());
		if (ctx->tracing)
			profiler_push_kernel_trace64(ctx, pc_list, nr_pcs,
			                             info);
	}
	kref_put(&profiler_kref);
}

/* Records a user backtrace of @nr_pcs program counters from @pc_list,
 * attributed to the current process, on the calling core.  Nothing is
 * recorded unless a profiler reference can be taken, the per-core contexts
 * exist, and tracing is enabled on this core.  The reference is held only for
 * the duration of the call. */
void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                  uint64_t info)
{
	struct profiler_cpu_context *ctx;
	struct proc *p;

	if (!kref_get_not_zero(&profiler_kref, 1))
		return;

	p = current;
	if (profiler_percpu_ctx) {
		ctx = profiler_get_cpu_ctx(core_id());
		if (ctx->tracing)
			profiler_push_user_trace64(ctx, p, pc_list, nr_pcs,
			                           info);
	}
	kref_put(&profiler_kref);
}

/* Returns qlen() of the central queue, or 0 if the profiler has no queue. */
int profiler_size(void)
{
	if (!profiler_queue)
		return 0;
	return qlen(profiler_queue);
}

/* Reads up to @n bytes from the central queue into @va via qread() and
 * returns its result, or 0 if the profiler has no queue. */
int profiler_read(void *va, int n)
{
	if (!profiler_queue)
		return 0;
	return qread(profiler_queue, va, n);
}

/* Reports a memory mapping of process @p to the profiler.  Only file-backed
 * (@foc non-NULL), executable (PROT_EXEC in @prot) mappings are reported, and
 * only while a profiler reference can be taken and the per-core contexts
 * exist.  The mapping is skipped if foc_abs_path() yields no path.  @flags is
 * unused. */
void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                          int flags, struct file_or_chan *foc, size_t offset)
{
	char path_buf[PROFILER_MAX_PRG_PATH];
	char *path;

	if (!kref_get_not_zero(&profiler_kref, 1))
		return;
	if (!foc || !(prot & PROT_EXEC) || !profiler_percpu_ctx)
		goto out;

	path = foc_abs_path(foc, path_buf, sizeof(path_buf));
	if (likely(path))
		profiler_push_pid_mmap(p, addr, size, offset, path);
out:
	kref_put(&profiler_kref);
}

/* Reports process @p to the profiler as a new process.  Nothing is emitted
 * unless a profiler reference can be taken, the per-core contexts exist, and
 * @p has a binary path. */
void profiler_notify_new_process(struct proc *p)
{
	if (!kref_get_not_zero(&profiler_kref, 1))
		return;

	if (profiler_percpu_ctx && p->binary_path)
		profiler_push_new_process(p);
	kref_put(&profiler_kref);
}
