/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Manages the setting and reading of hardware perf counters across all cores,
 * including generating samples in response to counter overflow interrupts.
 *
 * The hardware interface is pretty straightforward - it's mostly setting and
 * unsetting fixed and unfixed events, sometimes with interrupts and trigger
 * counts.
 *
 * The 'command' to the cores is a struct perfmon_alloc.  This tells the core
 * which event to set up (this is the perfmon_event).  The cores respond in
 * counters[], saying which of their counters it is using for that event.  If
 * the cores are given different alloc requests, it is possible that they might
 * choose different counters[] for the same event.
 *
 * These perfmon_allocs are collected in a perfmon_session.  The session is just
 * a bunch of allocs, which are referred to by index (the 'ped').  Currently,
 * the session is grabbed by whoever opens the perf FD in devarch, and closed
 * when that FD is closed.  They are 1:1 with devarch's perf_contexts.
 *
 * The values for the counters are extracted with perfmon_get_event_status(),
 * which uses a struct perfmon_status to collect the results.  We pass the
 * perfmon_alloc as part of the perfmon_status_env, since we need to tell the
 * core which counter we're talking about.
 *
 * You can have multiple sessions, but if you try to install the same counter
 * in multiple, concurrent sessions, the hardware might complain (it definitely
 * will if it is a fixed event). */
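
/* A sketch of the typical call sequence, pieced together from the public API
 * below (devarch, which owns the perf FD, is the in-tree caller):
 *
 *	ps = perfmon_create_session();
 *	ped = perfmon_open_event(cset, ps, pev);   (programs the counters)
 *	pef = perfmon_get_event_status(ps, ped);   (reads cores_values[])
 *	perfmon_free_event_status(pef);
 *	perfmon_close_event(ps, ped);              (releases the counters)
 *	perfmon_close_session(ps);
 */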

#include <sys/types.h>
#include <arch/ros/msr-index.h>
#include <arch/x86.h>
#include <arch/msr.h>
#include <arch/uaccess.h>
#include <ros/errno.h>
#include <assert.h>
#include <trap.h>
#include <smp.h>
#include <atomic.h>
#include <core_set.h>
#include <percpu.h>
#include <kmalloc.h>
#include <err.h>
#include <string.h>
#include <profiler.h>
#include <arch/perfmon.h>

#define FIXCNTR_NBITS 4
#define FIXCNTR_MASK (((uint64_t) 1 << FIXCNTR_NBITS) - 1)
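
/* Each fixed counter owns a 4-bit field in MSR_CORE_PERF_FIXED_CTR_CTRL:
 * counter idx is controlled by bits [idx * 4 + 3 : idx * 4], so e.g.
 * FIXCNTR_MASK << (1 * FIXCNTR_NBITS) selects fixed counter 1's field
 * (bits 7:4). */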

struct perfmon_cpu_context {
        spinlock_t lock;
        struct perfmon_event counters[MAX_VAR_COUNTERS];
        struct perfmon_event fixed_counters[MAX_FIX_COUNTERS];
};

struct perfmon_status_env {
        struct perfmon_alloc *pa;
        struct perfmon_status *pef;
};

static struct perfmon_cpu_caps cpu_caps;
static DEFINE_PERCPU(struct perfmon_cpu_context, counters_env);
DEFINE_PERCPU_INIT(perfmon_counters_env_init);

#define PROFILER_BT_DEPTH 16

struct sample_snapshot {
        struct user_context     ctx;
        uintptr_t               pc_list[PROFILER_BT_DEPTH];
        size_t                  nr_pcs;
};
static DEFINE_PERCPU(struct sample_snapshot, sample_snapshots);

static void perfmon_counters_env_init(void)
{
        for (int i = 0; i < num_cores; i++) {
                struct perfmon_cpu_context *cctx =
                        _PERCPU_VARPTR(counters_env, i);

                spinlock_init_irqsave(&cctx->lock);
        }
}

static void perfmon_read_cpu_caps(struct perfmon_cpu_caps *pcc)
{
        uint32_t a, b, c, d;

        cpuid(0x0a, 0, &a, &b, &c, &d);

        pcc->proc_arch_events = a >> 24;
        pcc->bits_x_counter = (a >> 16) & 0xff;
        pcc->counters_x_proc = (a >> 8) & 0xff;
        pcc->bits_x_fix_counter = (d >> 5) & 0xff;
        pcc->fix_counters_x_proc = d & 0x1f;
        pcc->perfmon_version = a & 0xff;
}
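
/* Field layout of CPUID leaf 0x0a (architectural perfmon), per the Intel SDM:
 * EAX[7:0] version, EAX[15:8] GP counters per core, EAX[23:16] GP counter
 * width in bits, EAX[31:24] number of architectural events; EDX[4:0] fixed
 * counters per core, EDX[12:5] fixed counter width in bits.  The extraction
 * above mirrors this layout. */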

static void perfmon_enable_event(int idx, uint64_t event)
{
        uint64_t gctrl;

        /* Events need to be enabled in both MSRs */
        write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx, event);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl | (1 << idx));
}

static void perfmon_disable_event(int idx)
{
        uint64_t gctrl;

        /* Events can be disabled in either location.  We could just clear the
         * global ctrl, but we use the contents of EVENTSEL to say if the
         * counter is available or not. */
        write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx, 0);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl & ~(1 << idx));
}

static bool perfmon_event_available(uint32_t idx)
{
        return read_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx) == 0;
}

/* Helper.  Given an event, a fixed counter index, and the contents of the
 * fixed counter ctl MSR, output the value for the fixed counter ctl that will
 * enable the event at idx. */
static uint64_t perfmon_apply_fixevent_mask(uint64_t event, int idx,
                                            uint64_t base)
{
        uint64_t m = 0;

        if (PMEV_GET_OS(event))
                m |= (1 << 0);
        if (PMEV_GET_USR(event))
                m |= (1 << 1);
        if (PMEV_GET_ANYTH(event))
                m |= (1 << 2);
        if (PMEV_GET_INTEN(event))
                m |= (1 << 3);
        /* Enable enforcement: we need at least one bit set so that this fixed
         * counter appears to be in use. */
        if (PMEV_GET_EN(event) && !PMEV_GET_OS(event) && !PMEV_GET_USR(event))
                m |= (1 << 0) | (1 << 1);

        m <<= idx * FIXCNTR_NBITS;
        m |= base & ~(FIXCNTR_MASK << (idx * FIXCNTR_NBITS));

        return m;
}
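
/* A worked example with assumed inputs: an event with OS, USR, and INTEN set,
 * destined for fixed counter idx 1, builds m = 0b1011 = 0xb, which the shift
 * moves into bits 7:4 (0xb0).  The final OR preserves every other counter's
 * 4-bit field from base and replaces only counter 1's. */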

/* These helpers take the fxctrl_value to save on a rdmsr. */
static void perfmon_enable_fix_event(int idx, uint64_t event,
                                     uint64_t fxctrl_value)
{
        uint64_t gctrl, fx;

        /* Enable in both locations: the bits in FIXED and the bit in GLOBAL. */
        fx = perfmon_apply_fixevent_mask(event, idx, fxctrl_value);
        write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL, fx);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL,
                  gctrl | ((uint64_t) 1 << (32 + idx)));
}

static void perfmon_disable_fix_event(int idx, uint64_t fxctrl_value)
{
        uint64_t gctrl;

        /* Events can be disabled in either location.  We could just clear the
         * global ctrl, but we use the bits of fxctrl to say if the counter is
         * available or not. */
        write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL,
                  fxctrl_value & ~(FIXCNTR_MASK << (idx * FIXCNTR_NBITS)));
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL,
                  gctrl & ~((uint64_t) 1 << (32 + idx)));
}

static bool perfmon_fix_event_available(uint32_t idx, uint64_t fxctrl_value)
{
        return (fxctrl_value & (FIXCNTR_MASK << (idx * FIXCNTR_NBITS))) == 0;
}

/* Helper to set a fixed perfcounter to trigger/overflow after count events.
 * Anytime you set a perfcounter to something non-zero, you need to use this
 * helper. */
static void perfmon_set_fixed_trigger(unsigned int idx, uint64_t count)
{
        int64_t write_val = -(int64_t)count;

        write_val &= (1ULL << cpu_caps.bits_x_fix_counter) - 1;
        write_msr(MSR_CORE_PERF_FIXED_CTR0 + idx, write_val);
}
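
/* Why the negation works, sketched with assumed numbers: the counters count
 * up and interrupt on overflow, so writing -count (masked to the counter
 * width) overflows after exactly count more events.  E.g., with 48-bit
 * counters and count = 1000, the value written is 2^48 - 1000 =
 * 0xfffffffffc18. */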

/* Helper to set a regular perfcounter to trigger/overflow after count events.
 * Anytime you set a perfcounter to something non-zero, you ought to use this
 * helper. */
static void perfmon_set_unfixed_trigger(unsigned int idx, uint64_t count)
{
        int64_t write_val = -(int64_t)count;

        write_val &= (1ULL << cpu_caps.bits_x_counter) - 1;
        write_msr(MSR_IA32_PERFCTR0 + idx, write_val);
}

/* Helper: sets errno/errstr based on the error code returned from the core.
 * We don't have a great way to get errors back from smp_do_in_cores()
 * commands.  We use negative counter values (e.g. i = -EBUSY) to signal an
 * error of a certain type.  This converts that to something useful for
 * userspace. */
static void perfmon_convert_error(int err_code, int core_id)
{
        switch (err_code) {
        case EBUSY:
                set_error(err_code, "Fixed perf counter is busy on core %d",
                          core_id);
                break;
        case ENOSPC:
                set_error(err_code, "Perf counter idx out of range on core %d",
                          core_id);
                break;
        case ENOENT:
                set_error(err_code, "Perf counter not set on core %d", core_id);
                break;
        default:
                set_error(err_code, "Unknown perf counter error on core %d",
                          core_id);
                break;
        }
}

static void perfmon_do_cores_alloc(void *opaque)
{
        struct perfmon_alloc *pa = (struct perfmon_alloc *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int i;
        struct perfmon_event *pev;

        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&pa->ev)) {
                uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL);

                i = PMEV_GET_EVENT(pa->ev.event);
                if (i >= (int) cpu_caps.fix_counters_x_proc) {
                        i = -ENOSPC;
                } else if (!perfmon_fix_event_available(i, fxctrl_value)) {
                        i = -EBUSY;
                } else {
                        /* Keep a copy of pa->ev for later.  pa is read-only
                         * and shared. */
                        cctx->fixed_counters[i] = pa->ev;
                        pev = &cctx->fixed_counters[i];
                        if (PMEV_GET_INTEN(pev->event))
                                perfmon_set_fixed_trigger(i,
                                                          pev->trigger_count);
                        else
                                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
                        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL,
                                  1ULL << (32 + i));
                        perfmon_enable_fix_event(i, pev->event, fxctrl_value);
                }
        } else {
                for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                        if (cctx->counters[i].event == 0) {
                                /* kernel bug if the MSRs don't agree with our
                                 * bookkeeping */
                                assert(perfmon_event_available(i));
                                break;
                        }
                }
                if (i < (int) cpu_caps.counters_x_proc) {
                        cctx->counters[i] = pa->ev;
                        pev = &cctx->counters[i];
                        if (PMEV_GET_INTEN(pev->event))
                                perfmon_set_unfixed_trigger(i,
                                                            pev->trigger_count);
                        else
                                write_msr(MSR_IA32_PERFCTR0 + i, 0);
                        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1ULL << i);
                        perfmon_enable_event(i, pev->event);
                } else {
                        i = -ENOSPC;
                }
        }
        spin_unlock_irqsave(&cctx->lock);

        pa->cores_counters[core_id()] = (counter_t) i;
}

static void perfmon_do_cores_free(void *opaque)
{
        struct perfmon_alloc *pa = (struct perfmon_alloc *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int err = 0, coreno = core_id();
        counter_t ccno = pa->cores_counters[coreno];

        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&pa->ev)) {
                uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL);

                if ((ccno >= cpu_caps.fix_counters_x_proc) ||
                    perfmon_fix_event_available(ccno, fxctrl_value)) {
                        err = -ENOENT;
                } else {
                        perfmon_init_event(&cctx->fixed_counters[ccno]);
                        perfmon_disable_fix_event((int) ccno, fxctrl_value);
                        write_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno, 0);
                }
        } else {
                if (ccno < (int) cpu_caps.counters_x_proc) {
                        perfmon_init_event(&cctx->counters[ccno]);
                        perfmon_disable_event((int) ccno);
                        write_msr(MSR_IA32_PERFCTR0 + ccno, 0);
                } else {
                        err = -ENOENT;
                }
        }
        spin_unlock_irqsave(&cctx->lock);

        pa->cores_counters[coreno] = (counter_t) err;
}

/* Helper: Reads a fixed counter's value.  Returns the max amount possible if
 * the counter overflowed. */
static uint64_t perfmon_read_fixed_counter(int ccno)
{
        uint64_t overflow_status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);

        if (overflow_status & (1ULL << (32 + ccno)))
                return (1ULL << cpu_caps.bits_x_fix_counter) - 1;
        else
                return read_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno);
}

/* Helper: Reads an unfixed counter's value.  Returns the max amount possible
 * if the counter overflowed. */
static uint64_t perfmon_read_unfixed_counter(int ccno)
{
        uint64_t overflow_status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);

        if (overflow_status & (1ULL << ccno))
                return (1ULL << cpu_caps.bits_x_counter) - 1;
        else
                return read_msr(MSR_IA32_PERFCTR0 + ccno);
}

static void perfmon_do_cores_status(void *opaque)
{
        struct perfmon_status_env *env = (struct perfmon_status_env *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int coreno = core_id();
        counter_t ccno = env->pa->cores_counters[coreno];

        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&env->pa->ev))
                env->pef->cores_values[coreno] =
                        perfmon_read_fixed_counter(ccno);
        else
                env->pef->cores_values[coreno] =
                        perfmon_read_unfixed_counter(ccno);
        spin_unlock_irqsave(&cctx->lock);
}

static void perfmon_setup_alloc_core_set(const struct perfmon_alloc *pa,
                                         struct core_set *cset)
{
        int i;

        core_set_init(cset);
        for (i = 0; i < num_cores; i++) {
                if (pa->cores_counters[i] >= 0)
                        core_set_setcpu(cset, i);
        }
}

static void perfmon_cleanup_cores_alloc(struct perfmon_alloc *pa)
{
        struct core_set cset;

        perfmon_setup_alloc_core_set(pa, &cset);
        smp_do_in_cores(&cset, perfmon_do_cores_free, pa);
}

static void perfmon_free_alloc(struct perfmon_alloc *pa)
{
        kfree(pa);
}

static void perfmon_destroy_alloc(struct perfmon_alloc *pa)
{
        perfmon_cleanup_cores_alloc(pa);
        perfmon_free_alloc(pa);
}

static struct perfmon_alloc *perfmon_create_alloc(const struct perfmon_event *pev)
{
        int i;
        struct perfmon_alloc *pa = kzmalloc(sizeof(struct perfmon_alloc) +
                                            num_cores * sizeof(counter_t),
                                            MEM_WAIT);

        pa->ev = *pev;
        for (i = 0; i < num_cores; i++)
                pa->cores_counters[i] = INVALID_COUNTER;

        return pa;
}

static struct perfmon_status *perfmon_status_alloc(void)
{
        struct perfmon_status *pef = kzmalloc(sizeof(struct perfmon_status) +
                                              num_cores * sizeof(uint64_t),
                                              MEM_WAIT);

        return pef;
}

static void perfmon_arm_irq(void)
{
        /* The vector written here is ignored when the delivery mode is NMI;
         * we use T_NMI just to avoid confusion.  The important part is the
         * delivery mode field (0x4 == NMI) in bits 10:8 of the LVT entry. */
        apicrput(MSR_LAPIC_LVT_PERFMON, (0x4 << 8) | T_NMI);
}

bool perfmon_supported(void)
{
        return cpu_caps.perfmon_version >= 2;
}

void perfmon_global_init(void)
{
        perfmon_read_cpu_caps(&cpu_caps);
}

void perfmon_pcpu_init(void)
{
        int i;

        if (!perfmon_supported())
                return;
        /* Enable user level access to the performance counters */
        lcr4(rcr4() | CR4_PCE);

        /* Reset all the counters and selectors to zero. */
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
        for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
                write_msr(MSR_IA32_PERFCTR0 + i, 0);
        }
        write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++)
                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);

        perfmon_arm_irq();
}
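
/* The 'info' attached to each profiler sample is just the event's user_data:
 * an opaque cookie that arrives from userspace with the perfmon_event and is
 * echoed back with every sample, presumably so perf can match samples to
 * events. */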

static uint64_t perfmon_make_sample_event(const struct perfmon_event *pev)
{
        return pev->user_data;
}

/* Called from NMI context! */
void perfmon_snapshot_hwtf(struct hw_trapframe *hw_tf)
{
        struct sample_snapshot *sample = PERCPU_VARPTR(sample_snapshots);
        uintptr_t pc = get_hwtf_pc(hw_tf);
        uintptr_t fp = get_hwtf_fp(hw_tf);

        sample->ctx.type = ROS_HW_CTX;
        sample->ctx.tf.hw_tf = *hw_tf;
        if (in_kernel(hw_tf)) {
                sample->nr_pcs = backtrace_list(pc, fp, sample->pc_list,
                                                PROFILER_BT_DEPTH);
        } else {
                sample->nr_pcs = backtrace_user_list(pc, fp, sample->pc_list,
                                                     PROFILER_BT_DEPTH);
        }
}

/* Called from NMI context, *and* this cannot fault (e.g. breakpoint tracing)!
 * The latter restriction is due to the vmexit NMI handler not being
 * interruptible.  Because of this, we just copy out the VM TF. */
void perfmon_snapshot_vmtf(struct vm_trapframe *vm_tf)
{
        struct sample_snapshot *sample = PERCPU_VARPTR(sample_snapshots);

        sample->ctx.type = ROS_VM_CTX;
        sample->ctx.tf.vm_tf = *vm_tf;
        sample->nr_pcs = 1;
        sample->pc_list[0] = get_vmtf_pc(vm_tf);
}

static void profiler_add_sample(uint64_t info)
{
        struct sample_snapshot *sample = PERCPU_VARPTR(sample_snapshots);

        /* We shouldn't need to worry about another NMI that concurrently mucks
         * with the sample.  The PMU won't rearm the interrupt until we're done
         * here.  In the event that we do get another NMI from another source,
         * we may get a weird backtrace in the perf output. */
        switch (sample->ctx.type) {
        case ROS_HW_CTX:
                if (in_kernel(&sample->ctx.tf.hw_tf)) {
                        profiler_push_kernel_backtrace(sample->pc_list,
                                                       sample->nr_pcs, info);
                } else {
                        profiler_push_user_backtrace(sample->pc_list,
                                                     sample->nr_pcs, info);
                }
                break;
        case ROS_VM_CTX:
                /* TODO: add VM support to perf.  For now, just treat it like a
                 * user addr.  Note that the address is a guest-virtual address,
                 * not guest-physical (which would be host virtual), and our
                 * VM_CTXs don't make a distinction between user and kernel TFs
                 * (yet). */
                profiler_push_user_backtrace(sample->pc_list, sample->nr_pcs,
                                             info);
                break;
        default:
                warn("Bad perf sample type %d!", sample->ctx.type);
        }
}

void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
{
        int i;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        uint64_t gctrl, status;

        spin_lock_irqsave(&cctx->lock);
        /* We need to save the global control status, because we need to
         * disable counters in order to be able to reset their values.  We
         * will restore the global control status on exit. */
        status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
        for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << i)) {
                        if (cctx->counters[i].event) {
                                profiler_add_sample(
                                        perfmon_make_sample_event(
                                                cctx->counters + i));
                                perfmon_set_unfixed_trigger(i,
                                        cctx->counters[i].trigger_count);
                        }
                }
        }
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << (32 + i))) {
                        if (cctx->fixed_counters[i].event) {
                                profiler_add_sample(
                                        perfmon_make_sample_event(
                                                cctx->fixed_counters + i));
                                perfmon_set_fixed_trigger(i,
                                        cctx->fixed_counters[i].trigger_count);
                        }
                }
        }
        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl);
        spin_unlock_irqsave(&cctx->lock);

        /* We need to re-arm the IRQ as the PFM IRQ gets masked on trigger.
         * Note that KVM and real HW seem to do two different things WRT
         * re-arming the IRQ: KVM does not mask the IRQ, while real HW does. */
        perfmon_arm_irq();
}

void perfmon_get_cpu_caps(struct perfmon_cpu_caps *pcc)
{
        memcpy(pcc, &cpu_caps, sizeof(*pcc));
}

static int perfmon_install_session_alloc(struct perfmon_session *ps,
                                         struct perfmon_alloc *pa)
{
        qlock(&ps->qlock);
        for (int i = 0; i < ARRAY_SIZE(ps->allocs); i++) {
                if (!ps->allocs[i]) {
                        ps->allocs[i] = pa;
                        qunlock(&ps->qlock);
                        return i;
                }
        }
        qunlock(&ps->qlock);
        error(ENFILE, "Too many perf allocs in the session");
}

int perfmon_open_event(const struct core_set *cset, struct perfmon_session *ps,
                       const struct perfmon_event *pev)
{
        ERRSTACK(1);
        int i;
        struct perfmon_alloc *pa = perfmon_create_alloc(pev);

        if (waserror()) {
                perfmon_destroy_alloc(pa);
                nexterror();
        }
        /* Ensure the user did not set reserved bits or otherwise give us a bad
         * event.  pev (now pa->ev) must be a valid IA32_PERFEVTSEL MSR. */
        pa->ev.event &= 0xffffffff;
        if (cpu_caps.perfmon_version < 3)
                PMEV_SET_ANYTH(pa->ev.event, 0);
        /* Ensure we're turning on the event.  The user could have forgotten to
         * set it.  Our tracking of whether or not a counter is in use depends
         * on it being enabled, or at least that some bit is set. */
        PMEV_SET_EN(pa->ev.event, 1);
        smp_do_in_cores(cset, perfmon_do_cores_alloc, pa);

        for (i = 0; i < num_cores; i++) {
                if (core_set_getcpu(cset, i)) {
                        counter_t ccno = pa->cores_counters[i];

                        if (unlikely(ccno < 0)) {
                                perfmon_destroy_alloc(pa);
                                perfmon_convert_error(-(int)ccno, i);
                                return -1;
                        }
                }
        }
        /* The perfmon_alloc will not be visible to userspace until
         * perfmon_install_session_alloc() completes, and by that time the
         * smp_do_in_cores(perfmon_do_cores_alloc) will have run on all cores.
         * The perfmon_alloc will never be changed once published. */
        i = perfmon_install_session_alloc(ps, pa);
        poperror();

        return i;
}

/* Helper, looks up a pa, given ped.  Hold the qlock. */
static struct perfmon_alloc *__lookup_pa(struct perfmon_session *ps, int ped)
{
        struct perfmon_alloc *pa;

        if (unlikely((ped < 0) || (ped >= ARRAY_SIZE(ps->allocs))))
                error(EBADFD, "Perf event %d out of range", ped);
        pa = ps->allocs[ped];
        if (!pa)
                error(ENOENT, "No perf alloc for event %d", ped);
        return pa;
}

void perfmon_close_event(struct perfmon_session *ps, int ped)
{
        ERRSTACK(1);
        struct perfmon_alloc *pa;

        qlock(&ps->qlock);
        if (waserror()) {
                qunlock(&ps->qlock);
                nexterror();
        }
        /* lookup does the error checking */
        pa = __lookup_pa(ps, ped);
        ps->allocs[ped] = NULL;
        poperror();
        qunlock(&ps->qlock);
        perfmon_destroy_alloc(pa);
}

/* Fetches the status (i.e. PMU counters) of event ped from all applicable
 * cores.  Returns a perfmon_status, which the caller should free. */
struct perfmon_status *perfmon_get_event_status(struct perfmon_session *ps,
                                                int ped)
{
        ERRSTACK(1);
        struct core_set cset;
        struct perfmon_status_env env;

        /* qlock keeps the PA alive.  We don't want to spin, since the spinners
         * might prevent the smp_do_in_cores(), resulting in a deadlock. */
        qlock(&ps->qlock);
        if (waserror()) {
                qunlock(&ps->qlock);
                nexterror();
        }
        env.pa = __lookup_pa(ps, ped);
        env.pef = perfmon_status_alloc();

        perfmon_setup_alloc_core_set(env.pa, &cset);
        smp_do_in_cores(&cset, perfmon_do_cores_status, &env);

        poperror();
        qunlock(&ps->qlock);

        return env.pef;
}

void perfmon_free_event_status(struct perfmon_status *pef)
{
        kfree(pef);
}

struct perfmon_session *perfmon_create_session(void)
{
        struct perfmon_session *ps = kzmalloc(sizeof(struct perfmon_session),
                                              MEM_WAIT);

        qlock_init(&ps->qlock);
        return ps;
}

void perfmon_close_session(struct perfmon_session *ps)
{
        struct perfmon_alloc *pa;

        for (int i = 0; i < ARRAY_SIZE(ps->allocs); i++) {
                pa = ps->allocs[i];
                if (pa)
                        perfmon_destroy_alloc(pa);
        }
        kfree(ps);
}