/*
 * Copyright (c) 2009 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 */

#include <arch/x86.h>
#include <arch/arch.h>
#include <smp.h>
#include <arch/console.h>
#include <arch/apic.h>
#include <arch/perfmon.h>
#include <time.h>

#include <bitmask.h>
#include <atomic.h>
#include <error.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pmap.h>
#include <env.h>
#include <trap.h>
#include <kmalloc.h>
#include <cpu_feat.h>
#include <arch/fsgsbase.h>
#include <ros/procinfo.h>

#include "vmm/vmm.h"

extern handler_wrapper_t handler_wrappers[NUM_HANDLER_WRAPPERS];
int x86_num_cores_booted = 1;
uintptr_t smp_stack_top;
barrier_t generic_barrier;

#define DECLARE_HANDLER_CHECKLISTS(vector)                          \
        INIT_CHECKLIST(f##vector##_cpu_list, MAX_NUM_CORES);

#define INIT_HANDLER_WRAPPER(v)                                     \
{                                                                   \
        handler_wrappers[(v)].vector = 0xe##v;                      \
        handler_wrappers[(v)].cpu_list = &f##v##_cpu_list;          \
        handler_wrappers[(v)].cpu_list->mask.size = num_cores;      \
}
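
/* The five wrapper slots initialized below pair IPI vectors 0xe0 - 0xe4 with
 * the f0 - f4 per-core checklists declared next; each checklist's mask is
 * sized to num_cores.  These back the generic remote function call facility
 * set up in init_smp_call_function(). */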

DECLARE_HANDLER_CHECKLISTS(0);
DECLARE_HANDLER_CHECKLISTS(1);
DECLARE_HANDLER_CHECKLISTS(2);
DECLARE_HANDLER_CHECKLISTS(3);
DECLARE_HANDLER_CHECKLISTS(4);

static void init_smp_call_function(void)
{
        INIT_HANDLER_WRAPPER(0);
        INIT_HANDLER_WRAPPER(1);
        INIT_HANDLER_WRAPPER(2);
        INIT_HANDLER_WRAPPER(3);
        INIT_HANDLER_WRAPPER(4);
}

/******************************************************************************/

bool core_id_ready = FALSE;

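/* CPUID leaf 0x80000001, EDX bit 27 advertises RDTSCP.  When available, we
 * stash the OS coreid in MSR_TSC_AUX; rdtscp hands that value back in ECX,
 * giving userspace a cheap way to ask which physical core it is on. */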
static void setup_rdtscp(int coreid)
{
        uint32_t edx;
        int rdtscp_ecx;

        /* TODO: have some sort of 'cpu info structure' with flags */
        cpuid(0x80000001, 0x0, 0, 0, 0, &edx);
        if (edx & (1 << 27)) {
                write_msr(MSR_TSC_AUX, coreid);
                /* Busted versions of qemu bug out here (32 bit) */
                asm volatile ("rdtscp" : "=c"(rdtscp_ecx) : : "eax", "edx");
                if (!coreid && (read_msr(MSR_TSC_AUX) != rdtscp_ecx))
                        printk("\nBroken rdtscp detected, don't trust it for pcoreid!\n\n");
        }
}

/* TODO: consider merging __arch_pcpu with parts of this (sync with RISCV) */
void smp_final_core_init(void)
{
        /* Set the coreid in pcpui for fast access to it through TLS. */
        int coreid = get_os_coreid(hw_core_id());
        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
        pcpui->coreid = coreid;
        write_msr(MSR_GS_BASE, (uintptr_t)pcpui); /* our cr4 isn't set yet */
        write_msr(MSR_KERN_GS_BASE, (uint64_t)pcpui);
        /* don't need this for the kernel anymore, but userspace can still use
         * it */
        setup_rdtscp(coreid);
        /* After this point, all cores have set up their segmentation and
         * whatnot to be able to do a proper core_id(). */
        waiton_barrier(&generic_barrier);
        if (coreid == 0)
                core_id_ready = TRUE;
        /* being paranoid with this, it's all a bit ugly */
        waiton_barrier(&generic_barrier);
        setup_default_mtrrs(&generic_barrier);
        smp_percpu_init();
        waiton_barrier(&generic_barrier);
}

// this needs to be set in smp_entry too...
#define trampoline_pg 0x00001000UL
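/* Physical page 1 (0x1000).  The SIPI vector 0x01 sent in smp_boot() points
 * the APs at physical address 0x1000, so they begin executing, in real mode,
 * at the start of this trampoline copy. */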
extern char smp_entry[];
extern char smp_entry_end[];
extern char smp_boot_lock[];
extern char smp_semaphore[];

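/* These symbols are linked at smp_entry's location in the kernel image, but
 * the AP boot code actually runs from the copy placed at trampoline_pg.  To
 * find a symbol inside that copy, rebase it: take its offset from smp_entry
 * and add trampoline_pg. */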
static inline uint16_t *get_smp_semaphore()
{
        return (uint16_t *)(smp_semaphore - smp_entry + trampoline_pg);
}

static void __spin_bootlock_raw(void)
{
        uint16_t *bootlock = (uint16_t*)(smp_boot_lock - smp_entry +
                                         trampoline_pg);

        /* Same lock code as in smp_entry */
        asm volatile ("movw $1, %%ax;   "
                      "1:               "
                      "xchgw %%ax, %0;  "
                      "test %%ax, %%ax; "
                      "jne 1b;" : : "m"(*bootlock) : "eax", "cc", "memory");
}

void smp_boot(void)
{
        struct per_cpu_info *pcpui0 = &per_cpu_info[0];
        page_t *smp_stack;

        // NEED TO GRAB A LOWMEM FREE PAGE FOR AP BOOTUP CODE
        // page1 (2nd page) is reserved, hardcoded in pmap.c
        memset(KADDR(trampoline_pg), 0, PGSIZE);
        memcpy(KADDR(trampoline_pg), (void *)smp_entry,
               smp_entry_end - smp_entry);

        /* Make sure the trampoline page is mapped.  64 bit already has the
         * tramp pg mapped (1 GB of lowmem), so this is a nop. */

        // Allocate a stack for the cores starting up.  One for all, must share
        if (kpage_alloc(&smp_stack))
                panic("No memory for SMP boot stack!");
        smp_stack_top = (uintptr_t)(page2kva(smp_stack) + PGSIZE);

        /* During SMP boot, core_id_early() returns 0, so all of the cores,
         * which grab locks concurrently, share the same pcpui and thus the
         * same lock_depth.  We need to disable checking until core_id works
         * properly. */
        pcpui0->__lock_checking_enabled = 0;
        // Start the IPI process (INIT, wait, SIPI, wait, SIPI, wait)
        send_init_ipi();
        // SDM 3A is a little wonky wrt the proper delays.  These are my best
        // guess.
        udelay(10000);
        // first SIPI
        send_startup_ipi(0x01);
        /* BOCHS does not like this second SIPI.
        // second SIPI
        udelay(200);
        send_startup_ipi(0x01);
        */
        udelay(500000);

        // Each core will also increment smp_semaphore, and decrement when it
        // is done, all in smp_entry.  Its purpose is to keep core 0 from
        // competing for the smp_boot_lock.  So long as one AP increments the
        // sem before the final LAPIC timer goes off, all available cores will
        // be initialized.
        while (*get_smp_semaphore())
                cpu_relax();

        // From here on, no other cores are coming up.  Grab the lock to
        // ensure it.  Another core could be in its prelock phase and be
        // trying to grab the lock forever....
        // The lock exists on the trampoline, so it can be grabbed right away
        // in real mode.  If core 0 wins the race, it blocks any straggler
        // cores from coming up; letting them proceed with booting would crash
        // the machine.  Specifically, they would turn on paging and have the
        // temp mapping pulled out from under them.  A core that loses the
        // race just spins on the trampoline (which we must be careful not to
        // deallocate).
        __spin_bootlock_raw();
|  | printk("Number of Cores Detected: %d\n", x86_num_cores_booted); | 
|  | #ifdef CONFIG_DISABLE_SMT | 
|  | assert(!(num_cores % 2)); | 
|  | printk("Using only %d Idlecores (SMT Disabled)\n", num_cores >> 1); | 
|  | #endif /* CONFIG_DISABLE_SMT */ | 

        /* cleans up the trampoline page, and any other low boot mem mappings */
        x86_cleanup_bootmem();
        /* trampoline_pg had a refcount of 2 earlier, so we need to dec once
         * more to free it but only if all cores are in (or we reset / reinit
         * those that failed) */
        if (x86_num_cores_booted == num_cores) {
                /* TODO: if we ever alloc the trampoline_pg or something, we
                 * can free it here. */
        } else {
                warn("ACPI/MP found %d cores, smp_boot initialized %d, using %d\n",
                     num_cores, x86_num_cores_booted, x86_num_cores_booted);
                num_cores = x86_num_cores_booted;
        }
        // Dealloc the temp shared stack
        page_decref(smp_stack);

        // Set up the generic remote function call facility
        init_smp_call_function();

        /* Final core initialization.  generic_barrier is used by
         * smp_final_core_init() (and setup_default_mtrrs()) on every core, so
         * it must be initialized for num_cores before the APs are woken. */
        init_barrier(&generic_barrier, num_cores);
        /* This will break the cores out of their hlt in smp_entry.S */
        send_broadcast_ipi(I_POKE_CORE);
        smp_final_core_init();  /* need to init ourselves as well */
}

/* This is called from smp_entry by each core to finish the core bootstrapping.
 * There is a spinlock around this entire function in smp_entry, for a few
 * reasons, the most important being that all cores use the same stack when
 * entering here.
 *
 * Do not use per_cpu_info in here.  Do whatever you need in smp_percpu_init().
 */
uintptr_t smp_main(void)
{
        /* We need to fake being core 0 for our memory allocations to work
         * nicely.  This is safe since the entire machine is single threaded
         * while we are in this function. */
        write_msr(MSR_GS_BASE, (uintptr_t)&per_cpu_info[0]);

        // Get a per-core kernel stack
        uintptr_t my_stack_top = get_kstack();

        /* This blob is the GDT, the GDT PD, and the TSS. */
        unsigned int blob_size = sizeof(segdesc_t) * SEG_COUNT +
                                 sizeof(pseudodesc_t) + sizeof(taskstate_t);
        /* TODO: don't use kmalloc - might have issues in the future */
        void *gdt_etc = kmalloc(blob_size, 0);  /* we'll never free this btw */
        taskstate_t *my_ts = gdt_etc;
        pseudodesc_t *my_gdt_pd = (void*)my_ts + sizeof(taskstate_t);
        segdesc_t *my_gdt = (void*)my_gdt_pd + sizeof(pseudodesc_t);
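        /* Blob layout in memory: the taskstate_t first, then the
         * pseudodesc_t, then the SEG_COUNT segment descriptors.
         * __arch_pcpu_init() later recovers the TSS and GDT pointers from
         * this same blob via the address stashed below. */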

        /* This is a bit ghetto: we need to communicate our GDT and TSS's
         * location to smp_percpu_init(), but we can't trust our coreid (since
         * they haven't been remapped yet (so we can't write it directly to
         * per_cpu_info)).  So we use the bottom of the stack page... */
        *kstack_bottom_addr(my_stack_top) = (uintptr_t)gdt_etc;

        // Build and load the gdt / gdt_pd
        memcpy(my_gdt, gdt, sizeof(segdesc_t)*SEG_COUNT);
        *my_gdt_pd = (pseudodesc_t) {
                sizeof(segdesc_t)*SEG_COUNT - 1, (uintptr_t) my_gdt };
        asm volatile("lgdt %0" : : "m"(*my_gdt_pd));

        /* Set up our kernel stack when changing rings */
        x86_set_stacktop_tss(my_ts, my_stack_top);
        // Initialize the TSS slot of my_gdt.  GD_TSS is a segment selector;
        // >> 3 converts it to a GDT descriptor index.
        syssegdesc_t *ts_slot = (syssegdesc_t*)&my_gdt[GD_TSS >> 3];
        *ts_slot = (syssegdesc_t)SEG_SYS_SMALL(STS_T32A, (uintptr_t)my_ts,
                                               sizeof(taskstate_t), 0);
        // Load the TSS
        ltr(GD_TSS);

        // Loads the same IDT used by the other cores
        asm volatile("lidt %0" : : "m"(idt_pd));

        apiconline();

        /* Stop pretending to be core 0.  We'll get our own coreid shortly and
         * set gs properly (smp_final_core_init()) */
        write_msr(MSR_GS_BASE, 0);

        return my_stack_top;    // will be loaded in smp_entry.S
}

static void pcpu_init_nmi(struct per_cpu_info *pcpui)
{
        uintptr_t nmi_entry_stacktop = get_kstack();

        /* NMI handlers can't use swapgs for kernel TFs, so we need to
         * bootstrap a bit.  We'll use a little bit of space above the actual
         * NMI stacktop for storage for the pcpui pointer.  But we need to be
         * careful: the HW will align RSP to 16 bytes on entry. */
        nmi_entry_stacktop -= 16;
        *(uintptr_t*)nmi_entry_stacktop = (uintptr_t)pcpui;
        pcpui->tss->ts_ist1 = nmi_entry_stacktop;
        /* Our actual NMI work is done on yet another stack, to avoid the
         * "iret cancelling NMI protections" problem.  All problems can be
         * solved with another layer of indirection! */
        pcpui->nmi_worker_stacktop = get_kstack();
}

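/* Double faults also get their own IST stack, so a #DF taken on a corrupted
 * or overflowed kernel stack still has a known-good stack to run on.  (This
 * assumes the #DF IDT gate is pointed at IST2 where the IDT is set up.) */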
static void pcpu_init_doublefault(struct per_cpu_info *pcpui)
{
        pcpui->tss->ts_ist2 = get_kstack();
}

/* Perform any initialization needed by per_cpu_info.  Make sure every core
 * calls this at some point in the smp_boot process.  If you don't smp_boot,
 * you must still call this for core 0.  This must NOT be called from smp_main,
 * since it relies on the kernel stack pointer to find the gdt.  Be careful not
 * to call it on too deep of a stack frame. */
void __arch_pcpu_init(uint32_t coreid)
{
        uintptr_t *my_stack_bot;
        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
        uint32_t eax, edx;

        /* Flushes any potentially old mappings from smp_boot() (note the page
         * table removal) */
        tlbflush();

        /* CR4.FSGSBASE enables the rdfsbase/wrfsbase/rdgsbase/wrgsbase
         * instructions */
        if (cpu_has_feat(CPU_FEAT_X86_FSGSBASE))
                lcr4(rcr4() | CR4_FSGSBASE);

        /*
         * Enable SSE instructions.
         * CR4.OSFXSR enables SSE and ensures that MXCSR/XMM gets saved with
         *            FXSAVE
         * CR4.OSXSAVE enables XSAVE instructions. Only set if XSAVE supported.
         * CR4.OSXMME indicates OS support for software exception handlers for
         * SIMD floating-point exceptions (turn it on to get #XM exceptions
         * in the event of a SIMD error instead of #UD exceptions).
         */
        lcr4(rcr4() | CR4_OSFXSR | CR4_OSXMME);

        if (cpu_has_feat(CPU_FEAT_X86_XSAVE)) {
                // You MUST set CR4.OSXSAVE before loading xcr0
                lcr4(rcr4() | CR4_OSXSAVE);
                // Set xcr0 to the Akaros-wide default
                lxcr0(__proc_global_info.x86_default_xcr0);
        }

        // Initialize fpu and extended state by restoring our default XSAVE
        // area.
        init_fp_state();

        /* core 0 set up earlier in idt_init().  Other cores pull their TSS
         * and GDT out of the blob whose address smp_main() stashed at the
         * bottom of their kernel stack. */
        if (coreid) {
                my_stack_bot = kstack_bottom_addr(ROUNDUP(read_sp() - 1,
                                                          PGSIZE));
                pcpui->tss = (taskstate_t*)(*my_stack_bot);
                pcpui->gdt = (segdesc_t*)(*my_stack_bot +
                                          sizeof(taskstate_t) +
                                          sizeof(pseudodesc_t));
        }
        assert(read_gsbase() == (uintptr_t)pcpui);
        assert(read_msr(MSR_KERN_GS_BASE) == (uint64_t)pcpui);
        /* Don't try setting up til after setting GS */
        x86_sysenter_init();
        x86_set_sysenter_stacktop(x86_get_stacktop_tss(pcpui->tss));
        pcpu_init_nmi(pcpui);
        pcpu_init_doublefault(pcpui);
        /* need to init perfctr before potentially using it in timer handler */
        perfmon_pcpu_init();
        vmm_pcpu_init();
        /* Clear CR4.TSD so userspace can execute rdtsc / rdtscp */
        lcr4(rcr4() & ~CR4_TSD);

        /* This should allow turbo mode.  I haven't found a doc that says how
         * deep we need to sleep.  At a minimum on some machines, it's C2.
         * Given that "C2 or deeper" pops up in a few other areas as a deeper
         * sleep (e.g. mwaits on memory accesses from outside the processor
         * won't wake >= C2), this might be deep enough for turbo mode to kick
         * in. */
        set_fastest_pstate();
        set_cstate(X86_MWAIT_C2);
}