| /* Copyright (c) 2015 Google Inc | 
 |  * Davide Libenzi <dlibenzi@google.com> | 
 |  * See LICENSE for details. | 
 |  * | 
 |  * This controls the emitting, collecting, and exporting of samples for perf | 
 |  * events.  Examples of events are PMU counter overflows, mmaps, and process | 
 |  * creation. | 
 |  * | 
 |  * Events are collected in a central qio queue.  High-frequency events (e.g. | 
 * IRQ backtraces) are collected in per-core buffers, which are flushed to the
 |  * central queue when they fill up or on command.  Lower-frequency events (e.g. | 
 |  * profiler_notify_mmap()) just go straight to the central queue. | 
 |  * | 
 |  * Currently there is one global profiler.  Kprof is careful to only have one | 
 |  * open profiler at a time.  See profiler.h for more info.  The only sync we do | 
 |  * in this file is for the functions that are not called while holding the kprof | 
 |  * mutex - specifically the RCU-protected backtrace sampling code. | 
 |  * | 
 |  * A few other notes: | 
 |  * - profiler_control_trace() controls the per-core trace collection.  When it | 
 |  *   is disabled, it also flushes the per-core blocks to the central queue. | 
 |  * - The collection of mmap and comm samples is independent of trace collection. | 
 |  *   Those will occur whenever the profiler is open, even if it is not started. | 
 * - Looks like we don't bother with munmap records.  Not sure if perf can
 *   handle them or not. */
 |  | 
 | #include <ros/common.h> | 
 | #include <ros/mman.h> | 
 | #include <sys/types.h> | 
 | #include <smp.h> | 
 | #include <trap.h> | 
 | #include <kthread.h> | 
 | #include <env.h> | 
 | #include <process.h> | 
 | #include <mm.h> | 
 | #include <kmalloc.h> | 
 | #include <pmap.h> | 
 | #include <atomic.h> | 
 | #include <umem.h> | 
 | #include <elf.h> | 
 | #include <ns.h> | 
 | #include <err.h> | 
 | #include <core_set.h> | 
 | #include <string.h> | 
 | #include "profiler.h" | 
 |  | 
 | #define PROFILER_MAX_PRG_PATH	256 | 
 |  | 
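/* Worst-case size, in bytes, of the variable-byte encoding of a value of type
 * t: one output byte per 7 bits of input, rounded up. */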
 | #define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7) | 
 |  | 
 | /* Do not rely on the contents of the PCPU ctx with IRQs enabled. */ | 
 | struct profiler_cpu_context { | 
 | 	struct block *block; | 
 | 	int cpu; | 
 | 	bool tracing; | 
 | 	size_t dropped_data_cnt; | 
 | }; | 
 |  | 
 | /* These are a little hokey, and are currently global vars */ | 
 | static int profiler_queue_limit = 64 * 1024 * 1024; | 
 | static size_t profiler_cpu_buffer_size = 65536; | 
 |  | 
 | struct profiler { | 
 | 	struct profiler_cpu_context *pcpu_ctx; | 
 | 	struct queue *qio; | 
 | 	bool tracing; | 
 | }; | 
 |  | 
 | static struct profiler __rcu *gbl_prof; | 
 |  | 
 | static struct profiler_cpu_context *profiler_get_cpu_ctx(struct profiler *prof, | 
 | 							 int cpu) | 
 | { | 
 | 	return prof->pcpu_ctx + cpu; | 
 | } | 
 |  | 
 | static inline char *vb_encode_uint64(char *data, uint64_t n) | 
 | { | 
	/* Classic variable-byte encoding: emit 7 bits at a time, low bits
	 * first, using bit 7 of each byte as a continuation flag (clear on
	 * the final byte). */
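	/* For example, 300 (0x12c) encodes as 0xac 0x02. */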
 | 	for (; n >= 0x80; n >>= 7) | 
 | 		*data++ = (char) (n | 0x80); | 
 | 	*data++ = (char) n; | 
 |  | 
 | 	return data; | 
 | } | 
 |  | 
 | static struct block *profiler_buffer_write(struct profiler *prof, | 
 | 					   struct profiler_cpu_context *cpu_buf, | 
 |                                            struct block *b) | 
 | { | 
	/* qpass will drop b if the queue is over its limit.  We're willing to
	 * lose traces, but we won't lose 'control' events, such as MMAP and
	 * PID. */
 | 	if (b) { | 
 | 		if (qpass(prof->qio, b) < 0) | 
 | 			cpu_buf->dropped_data_cnt++; | 
 | 	} | 
 | 	return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC); | 
 | } | 
 |  | 
 | /* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is | 
 |  * enough room in the pcpu block for our write.  May alloc a new one. | 
 |  * | 
 |  * IRQs must be disabled before calling, until after write_commit. */ | 
 | static char *profiler_cpu_buffer_write_reserve(struct profiler *prof, | 
 | 	struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb) | 
 | { | 
 | 	struct block *b = cpu_buf->block; | 
 |  | 
 | 	if (unlikely((!b) || (b->lim - b->wp) < size)) { | 
 | 		cpu_buf->block = b = profiler_buffer_write(prof, cpu_buf, b); | 
 | 		if (unlikely(!b)) | 
 | 			return NULL; | 
 | 	} | 
 | 	*pb = b; | 
 |  | 
 | 	return (char *) b->wp; | 
 | } | 
 |  | 
 | /* Helper, paired with write_reserve.  Finalizes the writing into the block's | 
 |  * main body of @size bytes.  IRQs must be disabled until after this is called. | 
 |  */ | 
 | static inline void profiler_cpu_buffer_write_commit( | 
 | 	struct profiler_cpu_context *cpu_buf, struct block *b, size_t size) | 
 | { | 
 | 	b->wp += size; | 
 | } | 
 |  | 
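/* Each record is framed by an 'envelope': a VB-encoded type tag followed by a
 * VB-encoded payload size.  This is the worst-case size of that envelope. */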
 | static inline size_t profiler_max_envelope_size(void) | 
 | { | 
 | 	return 2 * VBE_MAX_SIZE(uint64_t); | 
 | } | 
 |  | 
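/* Emits a PROFTYPE_KERN_TRACE64 record (envelope + payload) into the per-core
 * buffer.  Must be called with IRQs disabled. */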
 | static void profiler_push_kernel_trace64(struct profiler *prof, | 
 | 					 struct profiler_cpu_context *cpu_buf, | 
 |                                          const uintptr_t *trace, size_t count, | 
 |                                          uint64_t info) | 
 | { | 
 | 	struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; | 
 | 	size_t size = sizeof(struct proftype_kern_trace64) + | 
 | 		count * sizeof(uint64_t); | 
 | 	struct block *b; | 
 | 	void *resptr, *ptr; | 
 |  | 
 | 	assert(!irq_is_enabled()); | 
 | 	resptr = profiler_cpu_buffer_write_reserve(prof, | 
 | 	    cpu_buf, size + profiler_max_envelope_size(), &b); | 
 | 	ptr = resptr; | 
 |  | 
 | 	if (likely(ptr)) { | 
 | 		struct proftype_kern_trace64 *record; | 
 |  | 
 | 		ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64); | 
 | 		ptr = vb_encode_uint64(ptr, size); | 
 |  | 
 | 		record = (struct proftype_kern_trace64 *) ptr; | 
 | 		ptr += size; | 
 |  | 
 | 		record->info = info; | 
 | 		record->tstamp = nsec(); | 
 | 		if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc) | 
 | 			record->pid = -1; | 
 | 		else | 
 | 			record->pid = pcpui->cur_proc->pid; | 
 | 		record->cpu = cpu_buf->cpu; | 
 | 		record->num_traces = count; | 
 | 		for (size_t i = 0; i < count; i++) | 
 | 			record->trace[i] = (uint64_t) trace[i]; | 
 |  | 
 | 		profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr); | 
 | 	} | 
 | } | 
 |  | 
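/* Emits a PROFTYPE_USER_TRACE64 record for @p into the per-core buffer.  Must
 * be called with IRQs disabled. */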
 | static void profiler_push_user_trace64(struct profiler *prof, | 
 | 				       struct profiler_cpu_context *cpu_buf, | 
 |                                        struct proc *p, const uintptr_t *trace, | 
 |                                        size_t count, uint64_t info) | 
 | { | 
 | 	size_t size = sizeof(struct proftype_user_trace64) + | 
 | 		count * sizeof(uint64_t); | 
 | 	struct block *b; | 
 | 	void *resptr, *ptr; | 
 |  | 
 | 	assert(!irq_is_enabled()); | 
 | 	resptr = profiler_cpu_buffer_write_reserve(prof, | 
 | 	    cpu_buf, size + profiler_max_envelope_size(), &b); | 
 | 	ptr = resptr; | 
 |  | 
 | 	if (likely(ptr)) { | 
 | 		struct proftype_user_trace64 *record; | 
 |  | 
 | 		ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64); | 
 | 		ptr = vb_encode_uint64(ptr, size); | 
 |  | 
 | 		record = (struct proftype_user_trace64 *) ptr; | 
 | 		ptr += size; | 
 |  | 
 | 		record->info = info; | 
 | 		record->tstamp = nsec(); | 
 | 		record->pid = p->pid; | 
 | 		record->cpu = cpu_buf->cpu; | 
 | 		record->num_traces = count; | 
 | 		for (size_t i = 0; i < count; i++) | 
 | 			record->trace[i] = (uint64_t) trace[i]; | 
 |  | 
 | 		profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr); | 
 | 	} | 
 | } | 
 |  | 
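/* Emits a PROFTYPE_PID_MMAP64 record straight to the central queue.  The
 * record is built in a temporary buffer; qiwrite() copies it, so we free it
 * afterwards. */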
 | static void profiler_push_pid_mmap(struct profiler *prof, struct proc *p, | 
 | 				   uintptr_t addr, size_t msize, size_t offset, | 
 | 				   const char *path) | 
 | { | 
 | 	size_t plen = strlen(path) + 1; | 
 | 	size_t size = sizeof(struct proftype_pid_mmap64) + plen; | 
 | 	void *resptr = kmalloc(size + profiler_max_envelope_size(), MEM_ATOMIC); | 
 |  | 
 | 	if (likely(resptr)) { | 
 | 		void *ptr = resptr; | 
 | 		struct proftype_pid_mmap64 *record; | 
 |  | 
 | 		ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64); | 
 | 		ptr = vb_encode_uint64(ptr, size); | 
 |  | 
 | 		record = (struct proftype_pid_mmap64 *) ptr; | 
 | 		ptr += size; | 
 |  | 
 | 		record->tstamp = nsec(); | 
 | 		record->pid = p->pid; | 
 | 		record->addr = addr; | 
 | 		record->size = msize; | 
 | 		record->offset = offset; | 
 | 		memcpy(record->path, path, plen); | 
 |  | 
 | 		qiwrite(prof->qio, resptr, (int) (ptr - resptr)); | 
 |  | 
 | 		kfree(resptr); | 
 | 	} | 
 | } | 
 |  | 
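/* Emits a PROFTYPE_NEW_PROCESS record (pid and binary path) straight to the
 * central queue. */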
 | static void profiler_push_new_process(struct profiler *prof, struct proc *p) | 
 | { | 
 | 	size_t plen = strlen(p->binary_path) + 1; | 
 | 	size_t size = sizeof(struct proftype_new_process) + plen; | 
 | 	void *resptr = kmalloc(size + profiler_max_envelope_size(), MEM_ATOMIC); | 
 |  | 
 | 	if (likely(resptr)) { | 
 | 		void *ptr = resptr; | 
 | 		struct proftype_new_process *record; | 
 |  | 
 | 		ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS); | 
 | 		ptr = vb_encode_uint64(ptr, size); | 
 |  | 
 | 		record = (struct proftype_new_process *) ptr; | 
 | 		ptr += size; | 
 |  | 
 | 		record->tstamp = nsec(); | 
 | 		record->pid = p->pid; | 
 | 		memcpy(record->path, p->binary_path, plen); | 
 |  | 
 | 		qiwrite(prof->qio, resptr, (int) (ptr - resptr)); | 
 |  | 
 | 		kfree(resptr); | 
 | 	} | 
 | } | 
 |  | 
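/* Pushes new-process and executable-mmap records for every existing process,
 * so the profile can decode addresses from code that was mapped before the
 * profiler was opened. */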
 | static void profiler_emit_current_system_status(void) | 
 | { | 
 | 	void enum_proc(struct vm_region *vmr, void *opaque) | 
 | 	{ | 
 | 		struct proc *p = (struct proc *) opaque; | 
 |  | 
 | 		profiler_notify_mmap(p, vmr->vm_base, | 
 | 				     vmr->vm_end - vmr->vm_base, | 
 | 		                     vmr->vm_prot, vmr->vm_flags, vmr->__vm_foc, | 
 | 		                     vmr->vm_foff); | 
 | 	} | 
 |  | 
 | 	struct process_set pset; | 
 |  | 
 | 	proc_get_set(&pset); | 
 |  | 
 | 	for (size_t i = 0; i < pset.num_processes; i++) { | 
 | 		profiler_notify_new_process(pset.procs[i]); | 
 | 		enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]); | 
 | 	} | 
 |  | 
 | 	proc_free_set(&pset); | 
 | } | 
 |  | 
 | static long profiler_get_checked_value(const char *value, long k, long minval, | 
 |                                        long maxval) | 
 | { | 
 | 	long lvalue = strtol(value, NULL, 0) * k; | 
 |  | 
	if (lvalue < minval)
		error(EFAIL, "Value must be at least %ld", minval);
	if (lvalue > maxval)
		error(EFAIL, "Value must be at most %ld", maxval);
 |  | 
 | 	return lvalue; | 
 | } | 
 |  | 
 | /* TODO: This configure stuff is a little hokey.  You have to configure before | 
 |  * it's been opened, meaning before you have the kprofctlqid, but you can't | 
 |  * configure until you have the chan.  To use this, you'd need to open, then | 
 |  * config, then close, then hope that the global settings stick around, then | 
 |  * open and run it. | 
 |  * | 
 |  * Also note that no one uses this. */ | 
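/* Usage sketch (values are in KB; both are scaled by 1024 below), assuming the
 * ctl write reaches us as "command value":
 *   prof_qlimit 4096   -> 4 MB central queue limit
 *   prof_cpubufsz 64   -> 64 KB per-core buffers */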
 | int profiler_configure(struct cmdbuf *cb) | 
 | { | 
 | 	if (!strcmp(cb->f[0], "prof_qlimit")) { | 
 | 		if (cb->nf < 2) | 
 | 			error(EFAIL, "prof_qlimit KB"); | 
 | 		/* If the profiler is already running, this won't take effect | 
 | 		 * until the next open.  Feel free to change this. */ | 
 | 		WRITE_ONCE(profiler_queue_limit, | 
 | 			   profiler_get_checked_value(cb->f[1], 1024, | 
 | 						      1024 * 1024, | 
 | 						      max_pmem / 32)); | 
 | 		return 1; | 
 | 	} | 
 | 	if (!strcmp(cb->f[0], "prof_cpubufsz")) { | 
 | 		if (cb->nf < 2) | 
 | 			error(EFAIL, "prof_cpubufsz KB"); | 
 | 		WRITE_ONCE(profiler_cpu_buffer_size, | 
 | 			   profiler_get_checked_value(cb->f[1], 1024, | 
 | 						      16 * 1024, | 
 | 						      1024 * 1024)); | 
 | 		return 1; | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | void profiler_append_configure_usage(char *msgbuf, size_t buflen) | 
 | { | 
 | 	const char * const cmds[] = { | 
 | 		"prof_qlimit", | 
 | 		"prof_cpubufsz", | 
 | 	}; | 
 |  | 
 | 	for (int i = 0; i < ARRAY_SIZE(cmds); i++) { | 
 | 		strlcat(msgbuf, "|", buflen); | 
 | 		strlcat(msgbuf, cmds[i], buflen); | 
 | 	} | 
 | } | 
 |  | 
 | int profiler_setup(void) | 
 | { | 
 | 	struct profiler *prof; | 
 |  | 
 | 	assert(!rcu_dereference_check(gbl_prof, true)); | 
 | 	prof = kzmalloc(sizeof(struct profiler), MEM_WAIT); | 
 | 	/* It is very important that we enqueue and dequeue entire records at | 
 | 	 * once.  If we leave partial records, the entire stream will be | 
 | 	 * corrupt.  Our reader does its best to make sure it has room for | 
 | 	 * complete records (checks qlen()). | 
 | 	 * | 
 | 	 * If we ever get corrupt streams, try making this a Qmsg.  Though it | 
 | 	 * doesn't help every situation - we have issues with writes greater | 
 | 	 * than Maxatomic regardless. */ | 
 | 	prof->qio = qopen(profiler_queue_limit, 0, NULL, NULL); | 
 | 	if (!prof->qio) { | 
 | 		kfree(prof); | 
 | 		return -1; | 
 | 	} | 
 | 	prof->pcpu_ctx = kzmalloc(sizeof(struct profiler_cpu_context) | 
 | 				  * num_cores, MEM_WAIT); | 
 | 	for (int i = 0; i < num_cores; i++) { | 
 | 		struct profiler_cpu_context *b = &prof->pcpu_ctx[i]; | 
 |  | 
 | 		b->cpu = i; | 
 | 	} | 
 | 	rcu_assign_pointer(gbl_prof, prof); | 
 | 	profiler_emit_current_system_status(); | 
 | 	return 0; | 
 | } | 
 |  | 
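/* Tears down the global profiler.  Clearing gbl_prof and waiting out a grace
 * period guarantees that no RCU reader (the backtrace and notify hooks) still
 * holds a reference before we free everything. */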
 | void profiler_cleanup(void) | 
 | { | 
 | 	struct profiler *prof = rcu_dereference_protected(gbl_prof, true); | 
 |  | 
 | 	RCU_INIT_POINTER(gbl_prof, NULL); | 
 | 	synchronize_rcu(); | 
 | 	kfree(prof->pcpu_ctx); | 
 | 	qfree(prof->qio); | 
 | 	kfree(prof); | 
 | } | 
 |  | 
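/* Hands the core's current block, if any, to the central queue.  IRQs are
 * disabled so we don't race with the backtrace hooks running on this core. */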
 | static void profiler_cpu_flush(struct profiler *prof, | 
 | 			       struct profiler_cpu_context *cpu_buf) | 
 | { | 
 | 	int8_t irq_state = 0; | 
 |  | 
 | 	disable_irqsave(&irq_state); | 
 | 	if (cpu_buf->block) { | 
 | 		qibwrite(prof->qio, cpu_buf->block); | 
 |  | 
 | 		cpu_buf->block = NULL; | 
 | 	} | 
 | 	enable_irqsave(&irq_state); | 
 | } | 
 |  | 
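/* Runs on each core: latches the global tracing setting into the per-core ctx
 * and flushes any buffered trace data when tracing is turned off. */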
 | static void __profiler_core_trace_enable(void *opaque) | 
 | { | 
 | 	struct profiler *prof = opaque; | 
 | 	struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(prof, | 
 | 								    core_id()); | 
 |  | 
 | 	cpu_buf->tracing = prof->tracing; | 
 | 	if (!cpu_buf->tracing) | 
 | 		profiler_cpu_flush(prof, cpu_buf); | 
 | } | 
 |  | 
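/* Turns per-core trace collection on or off on all available cores, flushing
 * the per-core buffers to the central queue when turning it off. */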
 | static void profiler_control_trace(struct profiler *prof, int onoff) | 
 | { | 
 | 	struct core_set cset; | 
 |  | 
 | 	assert(prof); | 
 |  | 
 | 	core_set_init(&cset); | 
 | 	core_set_fill_available(&cset); | 
 | 	prof->tracing = onoff; | 
 | 	/* Note this blocks until all cores have run the function. */ | 
 | 	smp_do_in_cores(&cset, __profiler_core_trace_enable, prof); | 
 | } | 
 |  | 
 | /* This must only be called by the Kprofctlqid FD holder, ensuring that the | 
 |  * profiler exists.  Not thread-safe. */ | 
 | void profiler_start(void) | 
 | { | 
 | 	struct profiler *prof = rcu_dereference_protected(gbl_prof, true); | 
 |  | 
 | 	profiler_control_trace(prof, 1); | 
 | 	qreopen(prof->qio); | 
 | } | 
 |  | 
 | /* This must only be called by the Kprofctlqid FD holder, ensuring that the | 
 |  * profiler exists.  Not thread-safe. */ | 
 | void profiler_stop(void) | 
 | { | 
 | 	struct profiler *prof = rcu_dereference_protected(gbl_prof, true); | 
 |  | 
 | 	profiler_control_trace(prof, 0); | 
 | 	qhangup(prof->qio, 0); | 
 | } | 
 |  | 
 | static void __profiler_core_flush(void *opaque) | 
 | { | 
 | 	struct profiler *prof = opaque; | 
 | 	struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(prof, | 
 | 								    core_id()); | 
 |  | 
 | 	profiler_cpu_flush(prof, cpu_buf); | 
 | } | 
 |  | 
 | /* This must only be called by the Kprofctlqid FD holder, ensuring that the | 
 |  * profiler exists. */ | 
 | void profiler_trace_data_flush(void) | 
 | { | 
	struct profiler *prof = rcu_dereference_protected(gbl_prof, true);
	struct core_set cset;
 |  | 
 | 	core_set_init(&cset); | 
 | 	core_set_fill_available(&cset); | 
	smp_do_in_cores(&cset, __profiler_core_flush, prof);
 | } | 
 |  | 
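/* Backtrace sampling hooks.  These can fire on any core (e.g. on a PMU counter
 * overflow), possibly with no profiler open, so they use RCU to safely peek at
 * gbl_prof. */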
 | void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs, | 
 |                                     uint64_t info) | 
 | { | 
 | 	struct profiler *prof; | 
 |  | 
 | 	rcu_read_lock(); | 
 | 	prof = rcu_dereference(gbl_prof); | 
 | 	if (prof) { | 
 | 		struct profiler_cpu_context *cpu_buf = | 
 | 			profiler_get_cpu_ctx(prof, core_id()); | 
 |  | 
 | 		if (cpu_buf->tracing) | 
 | 			profiler_push_kernel_trace64(prof, cpu_buf, pc_list, | 
 | 						     nr_pcs, info); | 
 | 	} | 
 | 	rcu_read_unlock(); | 
 | } | 
 |  | 
 | void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs, | 
 |                                   uint64_t info) | 
 | { | 
 | 	struct profiler *prof; | 
 |  | 
 | 	rcu_read_lock(); | 
 | 	prof = rcu_dereference(gbl_prof); | 
 | 	if (prof) { | 
 | 		struct profiler_cpu_context *cpu_buf = | 
 | 			profiler_get_cpu_ctx(prof, core_id()); | 
 |  | 
 | 		if (cpu_buf->tracing) | 
 | 			profiler_push_user_trace64(prof, cpu_buf, current, | 
 | 						   pc_list, nr_pcs, info); | 
 | 	} | 
 | 	rcu_read_unlock(); | 
 | } | 
 |  | 
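/* Returns the number of bytes of profiler data currently sitting in the
 * central queue. */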
 | size_t profiler_size(void) | 
 | { | 
 | 	struct profiler *prof; | 
 | 	size_t ret; | 
 |  | 
 | 	rcu_read_lock(); | 
 | 	prof = rcu_dereference(gbl_prof); | 
 | 	ret = prof ? qlen(prof->qio) : 0; | 
 | 	rcu_read_unlock(); | 
 | 	return ret; | 
 | } | 
 |  | 
 | size_t profiler_read(void *va, size_t n) | 
 | { | 
 | 	struct profiler *prof; | 
 | 	size_t ret; | 
 |  | 
 | 	rcu_read_lock(); | 
 | 	prof = rcu_dereference(gbl_prof); | 
 | 	ret = prof ? qread(prof->qio, va, n) : 0; | 
 | 	rcu_read_unlock(); | 
 | 	return ret; | 
 | } | 
 |  | 
 | void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot, | 
 |                           int flags, struct file_or_chan *foc, size_t offset) | 
 | { | 
 | 	struct profiler *prof; | 
 | 	char *path; | 
 |  | 
 | 	rcu_read_lock(); | 
 | 	prof = rcu_dereference(gbl_prof); | 
 | 	if (prof && foc && (prot & PROT_EXEC)) | 
 | 		profiler_push_pid_mmap(prof, p, addr, size, offset, | 
 | 				       foc_abs_path(foc)); | 
 | 	rcu_read_unlock(); | 
 | } | 
 |  | 
 | void profiler_notify_new_process(struct proc *p) | 
 | { | 
 | 	struct profiler *prof; | 
 |  | 
 | 	rcu_read_lock(); | 
 | 	prof = rcu_dereference(gbl_prof); | 
 | 	if (prof && p->binary_path) | 
 | 		profiler_push_new_process(prof, p); | 
 | 	rcu_read_unlock(); | 
 | } |