/*
 * Copyright (c) 2009 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 */

#include <arch/x86.h>
#include <arch/arch.h>
#include <smp.h>
#include <arch/console.h>
#include <arch/apic.h>
#include <arch/perfmon.h>
#include <time.h>

#include <bitmask.h>
#include <atomic.h>
#include <error.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pmap.h>
#include <env.h>
#include <trap.h>
#include <kmalloc.h>
#include <cpu_feat.h>
#include <arch/fsgsbase.h>
#include <ros/procinfo.h>

#include "vmm/vmm.h"

extern handler_wrapper_t handler_wrappers[NUM_HANDLER_WRAPPERS];
int x86_num_cores_booted = 1;
uintptr_t smp_stack_top;
barrier_t generic_barrier;

#define DECLARE_HANDLER_CHECKLISTS(vector)                          \
        INIT_CHECKLIST(f##vector##_cpu_list, MAX_NUM_CORES);

#define INIT_HANDLER_WRAPPER(v)                                     \
{                                                                   \
        handler_wrappers[(v)].vector = 0xe##v;                      \
        handler_wrappers[(v)].cpu_list = &f##v##_cpu_list;          \
        handler_wrappers[(v)].cpu_list->mask.size = num_cores;      \
}
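
/* The five wrapper slots initialized below pair IPI vectors 0xe0 - 0xe4 with
 * the f0 - f4 per-core checklists declared next; each checklist's mask is
 * sized to num_cores.  These back the generic remote function call facility
 * set up in init_smp_call_function(). */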

DECLARE_HANDLER_CHECKLISTS(0);
DECLARE_HANDLER_CHECKLISTS(1);
DECLARE_HANDLER_CHECKLISTS(2);
DECLARE_HANDLER_CHECKLISTS(3);
DECLARE_HANDLER_CHECKLISTS(4);

static void init_smp_call_function(void)
{
        INIT_HANDLER_WRAPPER(0);
        INIT_HANDLER_WRAPPER(1);
        INIT_HANDLER_WRAPPER(2);
        INIT_HANDLER_WRAPPER(3);
        INIT_HANDLER_WRAPPER(4);
}

/******************************************************************************/

bool core_id_ready = FALSE;

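/* CPUID leaf 0x80000001, EDX bit 27 advertises RDTSCP.  When available, we
 * stash the OS coreid in MSR_TSC_AUX; rdtscp hands that value back in ECX,
 * giving userspace a cheap way to ask which physical core it is on. */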
static void setup_rdtscp(int coreid)
{
        uint32_t edx;
        int rdtscp_ecx;

        /* TODO: have some sort of 'cpu info structure' with flags */
        cpuid(0x80000001, 0x0, 0, 0, 0, &edx);
        if (edx & (1 << 27)) {
                write_msr(MSR_TSC_AUX, coreid);
                /* Busted versions of qemu bug out here (32 bit) */
                asm volatile ("rdtscp" : "=c"(rdtscp_ecx) : : "eax", "edx");
                if (!coreid && (read_msr(MSR_TSC_AUX) != rdtscp_ecx))
                        printk("\nBroken rdtscp detected, don't trust it for pcoreid!\n\n");
        }
}

/* TODO: consider merging __arch_pcpu with parts of this (sync with RISCV) */
void smp_final_core_init(void)
{
        /* Set the coreid in pcpui for fast access to it through TLS. */
        int coreid = get_os_coreid(hw_core_id());
        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
        pcpui->coreid = coreid;
        write_msr(MSR_GS_BASE, (uintptr_t)pcpui); /* our cr4 isn't set yet */
        write_msr(MSR_KERN_GS_BASE, (uint64_t)pcpui);
        /* don't need this for the kernel anymore, but userspace can still use
         * it */
        setup_rdtscp(coreid);
        /* After this point, all cores have set up their segmentation and
         * whatnot to be able to do a proper core_id(). */
        waiton_barrier(&generic_barrier);
        if (coreid == 0)
                core_id_ready = TRUE;
        /* being paranoid with this, it's all a bit ugly */
        waiton_barrier(&generic_barrier);
        setup_default_mtrrs(&generic_barrier);
        smp_percpu_init();
        waiton_barrier(&generic_barrier);
}

// this needs to be set in smp_entry too...
#define trampoline_pg 0x00001000UL
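/* Physical page 1 (0x1000).  The SIPI vector 0x01 sent in smp_boot() points
 * the APs at physical address 0x1000, so they begin executing, in real mode,
 * at the start of this trampoline copy. */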
extern char smp_entry[];
extern char smp_entry_end[];
extern char smp_boot_lock[];
extern char smp_semaphore[];

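/* These symbols are linked at smp_entry's location in the kernel image, but
 * the AP boot code actually runs from the copy placed at trampoline_pg.  To
 * find a symbol inside that copy, rebase it: take its offset from smp_entry
 * and add trampoline_pg. */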
static inline uint16_t *get_smp_semaphore()
{
        return (uint16_t *)(smp_semaphore - smp_entry + trampoline_pg);
}

static void __spin_bootlock_raw(void)
{
        uint16_t *bootlock = (uint16_t*)(smp_boot_lock - smp_entry +
                                         trampoline_pg);

        /* Same lock code as in smp_entry */
        asm volatile ("movw $1, %%ax;   "
                      "1:               "
                      "xchgw %%ax, %0;  "
                      "test %%ax, %%ax; "
                      "jne 1b;" : : "m"(*bootlock) : "eax", "cc", "memory");
}

void smp_boot(void)
{
        struct per_cpu_info *pcpui0 = &per_cpu_info[0];
        page_t *smp_stack;

        // NEED TO GRAB A LOWMEM FREE PAGE FOR AP BOOTUP CODE
        // page1 (2nd page) is reserved, hardcoded in pmap.c
        memset(KADDR(trampoline_pg), 0, PGSIZE);
        memcpy(KADDR(trampoline_pg), (void *)smp_entry,
               smp_entry_end - smp_entry);

        /* Make sure the trampoline page is mapped.  64 bit already has the
         * tramp pg mapped (1 GB of lowmem), so this is a nop. */

        // Allocate a stack for the cores starting up.  One for all, must share
        if (kpage_alloc(&smp_stack))
                panic("No memory for SMP boot stack!");
        smp_stack_top = (uintptr_t)(page2kva(smp_stack) + PGSIZE);

        /* During SMP boot, core_id_early() returns 0, so all of the cores,
         * which grab locks concurrently, share the same pcpui and thus the
         * same lock_depth.  We need to disable checking until core_id works
         * properly. */
        pcpui0->__lock_checking_enabled = 0;
        // Start the IPI process (INIT, wait, SIPI, wait, SIPI, wait)
        send_init_ipi();
        // SDM 3A is a little wonky wrt the proper delays.  These are my best
        // guess.
        udelay(10000);
        // first SIPI
        send_startup_ipi(0x01);
        /* BOCHS does not like this second SIPI.
        // second SIPI
        udelay(200);
        send_startup_ipi(0x01);
        */
        udelay(500000);

        // Each core will also increment smp_semaphore, and decrement when it
        // is done, all in smp_entry.  Its purpose is to keep core 0 from
        // competing for the smp_boot_lock.  So long as one AP increments the
        // sem before the final LAPIC timer goes off, all available cores will
        // be initialized.
        while (*get_smp_semaphore())
                cpu_relax();

        // From here on, no other cores are coming up.  Grab the lock to
        // ensure it.  Another core could be in its prelock phase and be
        // trying to grab the lock forever....
        // The lock exists on the trampoline, so it can be grabbed right away
        // in real mode.  If core 0 wins the race, it blocks any straggler
        // cores from coming up; letting them proceed with booting would crash
        // the machine.  Specifically, they would turn on paging and have the
        // temp mapping pulled out from under them.  A core that loses the
        // race just spins on the trampoline (which we must be careful not to
        // deallocate).
        __spin_bootlock_raw();
|  | printk("Number of Cores Detected: %d\n", x86_num_cores_booted); | 
|  | #ifdef CONFIG_DISABLE_SMT | 
|  | assert(!(num_cores % 2)); | 
|  | printk("Using only %d Idlecores (SMT Disabled)\n", num_cores >> 1); | 
|  | #endif /* CONFIG_DISABLE_SMT */ | 

        /* cleans up the trampoline page, and any other low boot mem mappings */
        x86_cleanup_bootmem();
        /* trampoline_pg had a refcount of 2 earlier, so we need to dec once
         * more to free it but only if all cores are in (or we reset / reinit
         * those that failed) */
        if (x86_num_cores_booted == num_cores) {
                /* TODO: if we ever alloc the trampoline_pg or something, we
                 * can free it here. */
        } else {
                warn("ACPI/MP found %d cores, smp_boot initialized %d, using %d\n",
                     num_cores, x86_num_cores_booted, x86_num_cores_booted);
                num_cores = x86_num_cores_booted;
        }
        // Dealloc the temp shared stack
        page_decref(smp_stack);

        // Set up the generic remote function call facility
        init_smp_call_function();

        /* Final core initialization.  generic_barrier is used by
         * smp_final_core_init() (and setup_default_mtrrs()) on every core, so
         * it must be initialized for num_cores before the APs are woken. */
        init_barrier(&generic_barrier, num_cores);
        /* This will break the cores out of their hlt in smp_entry.S */
        send_broadcast_ipi(I_POKE_CORE);
        smp_final_core_init();  /* need to init ourselves as well */
}

/* This is called from smp_entry by each core to finish the core bootstrapping.
 * There is a spinlock around this entire function in smp_entry, for a few
 * reasons, the most important being that all cores use the same stack when
 * entering here.
 *
 * Do not use per_cpu_info in here.  Do whatever you need in smp_percpu_init().
 */
uintptr_t smp_main(void)
{
        /* We need to fake being core 0 for our memory allocations to work
         * nicely.  This is safe since the entire machine is single threaded
         * while we are in this function. */
        write_msr(MSR_GS_BASE, (uintptr_t)&per_cpu_info[0]);

        // Get a per-core kernel stack
        uintptr_t my_stack_top = get_kstack();

        /* This blob is the GDT, the GDT PD, and the TSS. */
        unsigned int blob_size = sizeof(segdesc_t) * SEG_COUNT +
                                 sizeof(pseudodesc_t) + sizeof(taskstate_t);
        /* TODO: don't use kmalloc - might have issues in the future */
        void *gdt_etc = kmalloc(blob_size, 0);  /* we'll never free this btw */
        taskstate_t *my_ts = gdt_etc;
        pseudodesc_t *my_gdt_pd = (void*)my_ts + sizeof(taskstate_t);
        segdesc_t *my_gdt = (void*)my_gdt_pd + sizeof(pseudodesc_t);
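        /* Blob layout in memory: the taskstate_t first, then the
         * pseudodesc_t, then the SEG_COUNT segment descriptors.
         * __arch_pcpu_init() later recovers the TSS and GDT pointers from
         * this same blob via the address stashed below. */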

        /* This is a bit ghetto: we need to communicate our GDT and TSS's
         * location to smp_percpu_init(), but we can't trust our coreid (since
         * they haven't been remapped yet (so we can't write it directly to
         * per_cpu_info)).  So we use the bottom of the stack page... */
        *kstack_bottom_addr(my_stack_top) = (uintptr_t)gdt_etc;

        // Build and load the gdt / gdt_pd
        memcpy(my_gdt, gdt, sizeof(segdesc_t)*SEG_COUNT);
        *my_gdt_pd = (pseudodesc_t) {
                sizeof(segdesc_t)*SEG_COUNT - 1, (uintptr_t) my_gdt };
        asm volatile("lgdt %0" : : "m"(*my_gdt_pd));

        /* Set up our kernel stack when changing rings */
        x86_set_stacktop_tss(my_ts, my_stack_top);
        // Initialize the TSS slot of my_gdt.  GD_TSS is a segment selector;
        // >> 3 converts it to a GDT descriptor index.
        syssegdesc_t *ts_slot = (syssegdesc_t*)&my_gdt[GD_TSS >> 3];
        *ts_slot = (syssegdesc_t)SEG_SYS_SMALL(STS_T32A, (uintptr_t)my_ts,
                                               sizeof(taskstate_t), 0);
        // Load the TSS
        ltr(GD_TSS);

        // Loads the same IDT used by the other cores
        asm volatile("lidt %0" : : "m"(idt_pd));

        apiconline();

        /* Stop pretending to be core 0.  We'll get our own coreid shortly and
         * set gs properly (smp_final_core_init()) */
        write_msr(MSR_GS_BASE, 0);

        return my_stack_top;    // will be loaded in smp_entry.S
}

static void pcpu_init_nmi(struct per_cpu_info *pcpui)
{
        uintptr_t nmi_entry_stacktop = get_kstack();

        /* NMI handlers can't use swapgs for kernel TFs, so we need to
         * bootstrap a bit.  We'll use a little bit of space above the actual
         * NMI stacktop for storage for the pcpui pointer.  But we need to be
         * careful: the HW will align RSP to 16 bytes on entry. */
        nmi_entry_stacktop -= 16;
        *(uintptr_t*)nmi_entry_stacktop = (uintptr_t)pcpui;
        pcpui->tss->ts_ist1 = nmi_entry_stacktop;
        /* Our actual NMI work is done on yet another stack, to avoid the
         * "iret cancelling NMI protections" problem.  All problems can be
         * solved with another layer of indirection! */
        pcpui->nmi_worker_stacktop = get_kstack();
}

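/* Double faults also get their own IST stack, so a #DF taken on a corrupted
 * or overflowed kernel stack still has a known-good stack to run on.  (This
 * assumes the #DF IDT gate is pointed at IST2 where the IDT is set up.) */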
static void pcpu_init_doublefault(struct per_cpu_info *pcpui)
{
        pcpui->tss->ts_ist2 = get_kstack();
}

/* Perform any initialization needed by per_cpu_info.  Make sure every core
 * calls this at some point in the smp_boot process.  If you don't smp_boot,
 * you must still call this for core 0.  This must NOT be called from smp_main,
 * since it relies on the kernel stack pointer to find the gdt.  Be careful not
 * to call it on too deep of a stack frame. */
void __arch_pcpu_init(uint32_t coreid)
{
        uintptr_t *my_stack_bot;
        struct per_cpu_info *pcpui = &per_cpu_info[coreid];
        uint32_t eax, edx;

        /* Flushes any potentially old mappings from smp_boot() (note the page
         * table removal) */
        tlbflush();

        /* CR4.FSGSBASE enables the rdfsbase/wrfsbase/rdgsbase/wrgsbase
         * instructions */
        if (cpu_has_feat(CPU_FEAT_X86_FSGSBASE))
                lcr4(rcr4() | CR4_FSGSBASE);

        /*
         * Enable SSE instructions.
         * CR4.OSFXSR enables SSE and ensures that MXCSR/XMM gets saved with
         *            FXSAVE
         * CR4.OSXSAVE enables XSAVE instructions. Only set if XSAVE supported.
         * CR4.OSXMME indicates OS support for software exception handlers for
         * SIMD floating-point exceptions (turn it on to get #XM exceptions
         * in the event of a SIMD error instead of #UD exceptions).
         */
        lcr4(rcr4() | CR4_OSFXSR | CR4_OSXMME);

        if (cpu_has_feat(CPU_FEAT_X86_XSAVE)) {
                // You MUST set CR4.OSXSAVE before loading xcr0
                lcr4(rcr4() | CR4_OSXSAVE);
                // Set xcr0 to the Akaros-wide default
                lxcr0(__proc_global_info.x86_default_xcr0);
        }

        // Initialize fpu and extended state by restoring our default XSAVE
        // area.
        init_fp_state();

        /* core 0 set up earlier in idt_init().  Other cores pull their TSS
         * and GDT out of the blob whose address smp_main() stashed at the
         * bottom of their kernel stack. */
        if (coreid) {
                my_stack_bot = kstack_bottom_addr(ROUNDUP(read_sp() - 1,
                                                          PGSIZE));
                pcpui->tss = (taskstate_t*)(*my_stack_bot);
                pcpui->gdt = (segdesc_t*)(*my_stack_bot +
                                          sizeof(taskstate_t) +
                                          sizeof(pseudodesc_t));
        }
        assert(read_gsbase() == (uintptr_t)pcpui);
        assert(read_msr(MSR_KERN_GS_BASE) == (uint64_t)pcpui);
        /* Don't try setting up til after setting GS */
        x86_sysenter_init();
        x86_set_sysenter_stacktop(x86_get_stacktop_tss(pcpui->tss));
        pcpu_init_nmi(pcpui);
        pcpu_init_doublefault(pcpui);
        /* need to init perfctr before potentially using it in timer handler */
        perfmon_pcpu_init();
        vmm_pcpu_init();
        /* Clear CR4.TSD so userspace can execute rdtsc / rdtscp */
        lcr4(rcr4() & ~CR4_TSD);

        /* This should allow turbo mode.  I haven't found a doc that says how
         * deep we need to sleep.  At a minimum on some machines, it's C2.
         * Given that "C2 or deeper" pops up in a few other areas as a deeper
         * sleep (e.g. mwaits on memory accesses from outside the processor
         * won't wake >= C2), this might be deep enough for turbo mode to kick
         * in. */
        set_fastest_pstate();
        set_cstate(X86_MWAIT_C2);
}