/*
 * Copyright (c) 2009 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 */

#include <arch/arch.h>
#include <atomic.h>
#include <smp.h>
#include <error.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pmap.h>
#include <process.h>
#include <schedule.h>
#include <trap.h>
#include <trace.h>
#include <kdebug.h>
#include <kmalloc.h>
#include <core_set.h>
#include <completion.h>
#include <rcu.h>

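/* Work descriptor for smp_do_in_cores(): the function to run on each core, its
 * argument, and a completion the caller waits on until every remote core in
 * the set has run it. */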
struct all_cpu_work {
	struct completion comp;
	void (*func)(void *);
	void *opaque;
};

struct per_cpu_info per_cpu_info[MAX_NUM_CORES];

/* Tracks the number of global waits on smp_calls; must be <=
 * NUM_HANDLER_WRAPPERS. */
atomic_t outstanding_calls = 0;

/* Helper for running a proc (if we should). Lots of repetition with
 * proc_restartcore */
static void try_run_proc(void)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

	/* There was a process running here, and we should return to it. */
	if (pcpui->owning_proc) {
		assert(!pcpui->cur_kthread->sysc);
		assert(pcpui->cur_ctx);
		__proc_startcore(pcpui->owning_proc, pcpui->cur_ctx);
		assert(0);
	} else {
		/* Make sure we have abandoned core. It's possible to have an
		 * owner without a current (smp_idle, __startcore, __death).
		 *
		 * If we had a current process, we might trigger __proc_free,
		 * which could send us a KMSG. Since we're called after PRKM,
		 * let's just restart the idle loop. */
		if (abandon_core())
			smp_idle();
	}
}

/* All cores end up calling this whenever there is nothing left to do or they
 * don't know explicitly what to do. Non-zero cores call it when they are done
 * booting. Other cases include after getting a DEATH IPI.
 *
 * All cores attempt to run the context of any owning proc. Barring that, they
 * halt and wake up when interrupted, do any work on their work queue, then halt
 * again. In between, the ksched gets a chance to tell them to do something
 * else, or perhaps to halt in another manner. */
static void __attribute__((noreturn)) __smp_idle(void *arg)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

	pcpui->cur_kthread->flags = KTH_DEFAULT_FLAGS;
	while (1) {
		/* This might wake a kthread (the gp ktask), so be sure to run
		 * PRKM after reporting the quiescent state. */
		rcu_report_qs();
		/* If this runs an RKM, we'll call smp_idle from the top. */
		process_routine_kmsg();
		try_run_proc();
		cpu_bored(); /* call out to the ksched */
		/* cpu_halt() atomically turns on interrupts and halts the
		 * core. Important to do this, since we could have an RKM come
		 * in via an interrupt right while PRKM is returning, and we
		 * wouldn't catch it. When it returns, IRQs are back off. */
		__set_cpu_state(pcpui, CPU_STATE_IDLE);
		cpu_halt();
		__set_cpu_state(pcpui, CPU_STATE_KERNEL);
	}
	assert(0);
}

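/* Enters the idle loop: disables IRQs and restarts execution at the top of
 * this core's kernel stack in __smp_idle(), which never returns. */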
void smp_idle(void)
{
	disable_irq();
	__reset_stack_pointer(0, get_stack_top(), __smp_idle);
}

/* Arch-independent per-cpu initialization. This will call the arch-dependent
 * init first. */
void smp_percpu_init(void)
{
	uint32_t coreid = core_id();
	struct per_cpu_info *pcpui = &per_cpu_info[coreid];
	void *trace_buf;
	struct kthread *kthread;

	/* Don't initialize __ctx_depth here; it is already 1 (at least on
	 * x86), since this runs in IRQ context. */
	/* Do this first */
	__arch_pcpu_init(coreid);
	/* init our kthread (tracks our currently running context) */
	kthread = __kthread_zalloc();
	/* assumes we're on the 1st page */
	kthread->stacktop = get_stack_top();
	pcpui->cur_kthread = kthread;
	/* Treat the startup threads as ktasks. This will last until smp_idle
	 * when they clear it, either in anticipation of being a user-backing
	 * kthread or to handle an RKM. */
	kthread->flags = KTH_KTASK_FLAGS;
	per_cpu_info[coreid].spare = 0;
	/* Init relevant lists */
	spinlock_init_irqsave(&per_cpu_info[coreid].immed_amsg_lock);
	STAILQ_INIT(&per_cpu_info[coreid].immed_amsgs);
	spinlock_init_irqsave(&per_cpu_info[coreid].routine_amsg_lock);
	STAILQ_INIT(&per_cpu_info[coreid].routine_amsgs);
	init_timer_chain(&this_pcpui_var(tchain), set_pcpu_alarm_interrupt);
	/* Init generic tracing ring */
	trace_buf = kpage_alloc_addr();
	assert(trace_buf);
	trace_ring_init(&pcpui->traces, trace_buf, PGSIZE,
			sizeof(struct pcpu_trace_event));
	for (int i = 0; i < NR_CPU_STATES; i++)
		pcpui->state_ticks[i] = 0;
	pcpui->last_tick_cnt = read_tsc();
	/* Core 0 is in the KERNEL state, called from smp_boot. The other cores
	 * are too, at least on x86, where we were called from asm (woken by
	 * POKE). */
	pcpui->cpu_state = CPU_STATE_KERNEL;
	/* Enable full lock debugging, after all pcpui work is done */
	pcpui->__lock_checking_enabled = 1;
}

/* It's actually okay to set the state to the existing state. Originally this
 * was a bug in the state tracking, but it is possible, at least on x86, for a
 * halted core (state IDLE) to be woken by an IRQ that does not trigger the
 * IRQ-handling state, e.g. the I_POKE_CORE IPI. smp_idle will just sleep
 * again, resetting the state from IDLE to IDLE. */
void __set_cpu_state(struct per_cpu_info *pcpui, int state)
{
	uint64_t now_ticks;

	assert(!irq_is_enabled());
	/* TODO: could put in an option to enable/disable state tracking. */
	now_ticks = read_tsc();
	pcpui->state_ticks[pcpui->cpu_state] += now_ticks -
						pcpui->last_tick_cnt;
	/* TODO: if the state was user, we could account for the vcore's time,
	 * similar to the total_ticks in struct vcore. The difference is that
	 * total_ticks tracks the vcore's virtual time, while this tracks user
	 * time. Something like vcore->user_ticks. */
	pcpui->cpu_state = state;
	pcpui->last_tick_cnt = now_ticks;
}

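/* Zeroes a core's accumulated per-state tick counts and restarts the interval
 * from the current TSC, so time spent before the reset isn't charged to the
 * next state transition. */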
void reset_cpu_state_ticks(int coreid)
{
	struct per_cpu_info *pcpui = &per_cpu_info[coreid];
	uint64_t now_ticks;

	if (coreid >= num_cores)
		return;
	/* Need to update last_tick_cnt so the time since the last update
	 * doesn't get added in the next time we update. */
	now_ticks = read_tsc();
	for (int i = 0; i < NR_CPU_STATES; i++)
		pcpui->state_ticks[i] = 0;
	pcpui->last_tick_cnt = now_ticks;
}

/* PCPUI Trace Rings: */
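/* Per-type handlers for events in the trace ring. pcpui_trace_fn() dispatches
 * to these based on the event's type; each handler gets the raw event and the
 * 'data' argument passed to trace_ring_foreach(). */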
static void pcpui_trace_kmsg_handler(void *event, void *data)
{
	struct pcpu_trace_event *te = (struct pcpu_trace_event*)event;
	uintptr_t addr;

	addr = te->arg1;
	printk("\tKMSG %p: %s\n", addr, get_fn_name(addr));
}

static void pcpui_trace_locks_handler(void *event, void *data)
{
	struct pcpu_trace_event *te = (struct pcpu_trace_event*)event;
	const char *func_name;
	uintptr_t lock_addr = te->arg1;

	if (lock_addr > KERN_LOAD_ADDR)
		func_name = get_fn_name(lock_addr);
	else
		func_name = "Dynamic lock";
	print_lock();
	printk("Time %uus, lock %p (%s)\n", te->arg0, lock_addr, func_name);
	printk("\t");
	spinlock_debug((spinlock_t*)lock_addr);
	print_unlock();
}

/* Add specific trace handlers here: */
trace_handler_t pcpui_tr_handlers[PCPUI_NR_TYPES] = {
	0,
	pcpui_trace_kmsg_handler,
	pcpui_trace_locks_handler,
};

/* Generic handler for the pcpui ring. Will switch out to the appropriate
 * type's handler. */
static void pcpui_trace_fn(void *event, void *data)
{
	struct pcpu_trace_event *te = (struct pcpu_trace_event*)event;
	int desired_type = (int)(long)data;

	if (te->type >= PCPUI_NR_TYPES) {
		printk("Bad trace type %d\n", te->type);
		/* Don't index the handler table with a bogus type */
		return;
	}
	/* desired_type == 0 means all types */
	if (desired_type && desired_type != te->type)
		return;
	if (pcpui_tr_handlers[te->type])
		pcpui_tr_handlers[te->type](event, data);
}

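/* Dumps a core's trace ring to the console, filtered by event type (type 0
 * dumps every type). */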
void pcpui_tr_foreach(int coreid, int type)
{
	struct trace_ring *tr = &per_cpu_info[coreid].traces;

	assert(tr);
	printk("\n\nTrace Ring on Core %d\n--------------\n", coreid);
	trace_ring_foreach(tr, pcpui_trace_fn, (void*)(long)type);
}

void pcpui_tr_foreach_all(int type)
{
	for (int i = 0; i < num_cores; i++)
		pcpui_tr_foreach(i, type);
}

void pcpui_tr_reset_all(void)
{
	for (int i = 0; i < num_cores; i++)
		trace_ring_reset(&per_cpu_info[i].traces);
}

void pcpui_tr_reset_and_clear_all(void)
{
	for (int i = 0; i < num_cores; i++)
		trace_ring_reset_and_clear(&per_cpu_info[i].traces);
}

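/* Routine kernel message handler for smp_do_in_cores(): runs the requested
 * function on this core, then signals the sender's completion. */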
static void smp_do_core_work(uint32_t srcid, long a0, long a1, long a2)
{
	struct all_cpu_work *acw = (struct all_cpu_work *) a0;

	acw->func(acw->opaque);
	completion_complete(&acw->comp, 1);
}

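/* Runs func(opaque) once on every core in cset and blocks until all of them
 * have finished. The calling core, if it is in the set, runs func directly;
 * every other core in the set runs it from a routine kernel message.
 *
 * Usage sketch, assuming the core_set helpers declared in core_set.h (e.g.
 * core_set_init() and core_set_fill_available(); my_func/my_arg are
 * placeholders):
 *
 *	struct core_set cset;
 *
 *	core_set_init(&cset);
 *	core_set_fill_available(&cset);
 *	smp_do_in_cores(&cset, my_func, my_arg);
 */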
void smp_do_in_cores(const struct core_set *cset, void (*func)(void *),
		     void *opaque)
{
	int cpu = core_id();
	struct all_cpu_work acw;

	memset(&acw, 0, sizeof(acw));
	completion_init(&acw.comp, core_set_remote_count(cset));
	acw.func = func;
	acw.opaque = opaque;
	for (int i = 0; i < num_cores; i++) {
		if (core_set_getcpu(cset, i)) {
			if (i == cpu)
				func(opaque);
			else
				send_kernel_message(i, smp_do_core_work,
						    (long)&acw, 0, 0,
						    KMSG_ROUTINE);
		}
	}
	completion_wait(&acw.comp);
}