| #include <arch/arch.h> |
| #include <trap.h> |
| #include <process.h> |
| #include <pmap.h> |
| #include <smp.h> |
| #include <arch/fsgsbase.h> |
| |
| #include <string.h> |
| #include <assert.h> |
| #include <stdio.h> |
| |
| static void __attribute__((noreturn)) proc_pop_hwtf(struct hw_trapframe *tf) |
| { |
| /* For both HW and SW TFs, note the asm below is passed an offset into the |
| * TF that is past the fs and gs bases; those are restored here instead. */ |
| if (x86_hwtf_is_partial(tf)) { |
| swap_gs(); |
| } else { |
| write_gsbase(tf->tf_gsbase); |
| write_fsbase(tf->tf_fsbase); |
| } |
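| /* Point rsp at the TF's saved GPRs and pop them in hw_trapframe layout |
| * order, skip the trapno/err words (the 0x10 bytes), and let iretq |
| * consume the rip/cs/rflags/rsp/ss frame at the end of the TF. */ |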
| asm volatile (".globl __asm_pop_hwtf_start;" |
| "__asm_pop_hwtf_start: " |
| "movq %0, %%rsp; " |
| "popq %%rax; " |
| "popq %%rbx; " |
| "popq %%rcx; " |
| "popq %%rdx; " |
| "popq %%rbp; " |
| "popq %%rsi; " |
| "popq %%rdi; " |
| "popq %%r8; " |
| "popq %%r9; " |
| "popq %%r10; " |
| "popq %%r11; " |
| "popq %%r12; " |
| "popq %%r13; " |
| "popq %%r14; " |
| "popq %%r15; " |
| "addq $0x10, %%rsp; " |
| "iretq; " |
| ".globl __asm_pop_hwtf_end;" |
| "__asm_pop_hwtf_end: " |
| : : "g" (&tf->tf_rax) : "memory"); |
| panic("iretq failed"); |
| } |
| |
| static void __attribute__((noreturn)) proc_pop_swtf(struct sw_trapframe *tf) |
| { |
| if (x86_swtf_is_partial(tf)) { |
| swap_gs(); |
| } else { |
| write_gsbase(tf->tf_gsbase); |
| write_fsbase(tf->tf_fsbase); |
| } |
| /* We need to zero any registers that aren't part of the sw_tf and that we |
| * don't otherwise use/clobber on the way out. Even though the user never |
| * saved them, we must not leak kernel register contents. */ |
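| /* On the way out, rbx, rbp, and r12-r15 are restored from the sw_tf. rcx |
| * and r11 are consumed by sysret itself: it loads rip from rcx and rflags |
| * from r11 (set to FL_IF), and rex.w makes it a 64-bit return. The |
| * remaining GPRs are zeroed, and rsp is popped right before the sysret. */ |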
| asm volatile (".globl __asm_pop_swtf_start;" |
| "__asm_pop_swtf_start: " |
| "movq %0, %%rsp; " |
| "movq $0, %%rax; " |
| "movq $0, %%rdx; " |
| "movq $0, %%rsi; " |
| "movq $0, %%rdi; " |
| "movq $0, %%r8; " |
| "movq $0, %%r9; " |
| "movq $0, %%r10; " |
| "popq %%rbx; " |
| "popq %%rbp; " |
| "popq %%r12; " |
| "popq %%r13; " |
| "popq %%r14; " |
| "popq %%r15; " |
| "movq %1, %%r11; " |
| "popq %%rcx; " |
| "popq %%rsp; " |
| "rex.w sysret; " |
| ".globl __asm_pop_swtf_end;" |
| "__asm_pop_swtf_end: " |
| : : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory"); |
| panic("sysret failed"); |
| } |
| |
| /* If popping a VM TF fails for some reason, we need to reflect it back to |
| * the user. It is possible that the reflection fails. We still need to run |
| * something, and it's a lousy time to try something else, so we'll give |
| * them a TF that will probably fault right away and kill them. */ |
| static void __attribute__((noreturn)) handle_bad_vm_tf(struct vm_trapframe *tf) |
| { |
| struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; |
| |
| tf->tf_exit_reason |= VMX_EXIT_REASONS_FAILED_VMENTRY; |
| tf->tf_flags |= VMCTX_FL_HAS_FAULT; |
| if (reflect_current_context()) { |
| printk("[kernel] Unable to reflect after a bad VM enter\n"); |
| proc_init_ctx(pcpui->cur_ctx, 0, 0xcafebabe, 0, 0); |
| } |
| proc_pop_ctx(pcpui->cur_ctx); |
| } |
| |
| static void __attribute__((noreturn)) proc_pop_vmtf(struct vm_trapframe *tf) |
| { |
| struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; |
| struct proc *p = pcpui->cur_proc; |
| struct guest_pcore *gpc; |
| |
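| /* A partial VM TF means the guest pcore is still loaded on this core; we |
| * just sanity check it. Otherwise we need to load the GPC now. */ |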
| if (x86_vmtf_is_partial(tf)) { |
| gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid); |
| assert(gpc); |
| assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid); |
| assert(gpc->should_vmresume); |
| } else { |
| gpc = load_guest_pcore(p, tf->tf_guest_pcoreid); |
| if (!gpc) { |
| tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE; |
| handle_bad_vm_tf(tf); |
| } |
| } |
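| /* The guest's rip, rsp, rflags, and cr3 are restored via the VMCS on |
| * vmenter; its GPRs are not, so the asm below loads those by hand just |
| * before the vmlaunch/vmresume. */ |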
| vmcs_write(GUEST_RSP, tf->tf_rsp); |
| vmcs_write(GUEST_CR3, tf->tf_cr3); |
| vmcs_write(GUEST_RIP, tf->tf_rip); |
| vmcs_write(GUEST_RFLAGS, tf->tf_rflags); |
| /* The host stacktop could have changed, even if we are still a partial |
| * context. Consider a vmcall that blocks. We'll restart the partial |
| * context, but be on a new stack. set_stack_top() doesn't really know |
| * about the VMCS. */ |
| vmcs_write(HOST_RSP, pcpui->stacktop); |
| /* cr2 is not part of the VMCS state; we need to save/restore it |
| * manually */ |
| lcr2(tf->tf_cr2); |
| vmcs_write(VM_ENTRY_INTR_INFO_FIELD, tf->tf_trap_inject); |
| /* Someone may have tried poking the guest and posting an IRQ, but the |
| * IPI missed (concurrent vmexit). In these cases, the 'outstanding |
| * notification' bit should still be set, and we can resend the IPI. |
| * This will arrive after we vmenter, since IRQs are currently |
| * disabled. */ |
| if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpc->posted_irq_desc)) |
| send_self_ipi(I_POKE_GUEST); |
| /* The first time a VMCS is started after being loaded, it must be |
| * launched. Subsequent starts must be resumes. Once the VMCS is |
| * cleared, we start with a launch again. Note this is the VMCS, not |
| * the GPC unload. */ |
| if (gpc->should_vmresume) { |
| tf->tf_flags |= VMCTX_FL_VMRESUME; |
| } else { |
| tf->tf_flags &= ~VMCTX_FL_VMRESUME; |
| gpc->should_vmresume = TRUE; |
| } |
| /* vmlaunch/resume can fail, so we need to be able to return from this. |
| * Thus we can't clobber rsp via the popq style of setting the |
| * registers. Likewise, we don't want to lose rbp via the clobber list. |
| * |
| * Partial contexts have already been launched, so we resume them. */ |
| asm volatile (".globl __asm_pop_vmtf_start;" |
| "__asm_pop_vmtf_start: " |
| "testl $"STRINGIFY(VMCTX_FL_VMRESUME)", %c[flags](%0);" |
| "pushq %%rbp; " /* save in case we fail */ |
| "movq %c[rbx](%0), %%rbx; " |
| "movq %c[rcx](%0), %%rcx; " |
| "movq %c[rdx](%0), %%rdx; " |
| "movq %c[rbp](%0), %%rbp; " |
| "movq %c[rsi](%0), %%rsi; " |
| "movq %c[rdi](%0), %%rdi; " |
| "movq %c[r8](%0), %%r8; " |
| "movq %c[r9](%0), %%r9; " |
| "movq %c[r10](%0), %%r10; " |
| "movq %c[r11](%0), %%r11; " |
| "movq %c[r12](%0), %%r12; " |
| "movq %c[r13](%0), %%r13; " |
| "movq %c[r14](%0), %%r14; " |
| "movq %c[r15](%0), %%r15; " |
| "movq %c[rax](%0), %%rax; " /* clobber our *tf last */ |
| "jnz 1f; " /* jump if resume */ |
| ASM_VMX_VMLAUNCH"; " /* non-resume gets launched */ |
| "jmp 2f; " |
| "1: "ASM_VMX_VMRESUME"; " |
| "2: popq %%rbp; " /* vmlaunch failed */ |
| ".globl __asm_pop_vmtf_end;" |
| "__asm_pop_vmtf_end: " |
| : |
| : "a" (tf), |
| [rax]"i"(offsetof(struct vm_trapframe, tf_rax)), |
| [rbx]"i"(offsetof(struct vm_trapframe, tf_rbx)), |
| [rcx]"i"(offsetof(struct vm_trapframe, tf_rcx)), |
| [rdx]"i"(offsetof(struct vm_trapframe, tf_rdx)), |
| [rbp]"i"(offsetof(struct vm_trapframe, tf_rbp)), |
| [rsi]"i"(offsetof(struct vm_trapframe, tf_rsi)), |
| [rdi]"i"(offsetof(struct vm_trapframe, tf_rdi)), |
| [r8]"i"(offsetof(struct vm_trapframe, tf_r8)), |
| [r9]"i"(offsetof(struct vm_trapframe, tf_r9)), |
| [r10]"i"(offsetof(struct vm_trapframe, tf_r10)), |
| [r11]"i"(offsetof(struct vm_trapframe, tf_r11)), |
| [r12]"i"(offsetof(struct vm_trapframe, tf_r12)), |
| [r13]"i"(offsetof(struct vm_trapframe, tf_r13)), |
| [r14]"i"(offsetof(struct vm_trapframe, tf_r14)), |
| [r15]"i"(offsetof(struct vm_trapframe, tf_r15)), |
| [flags]"i"(offsetof(struct vm_trapframe, tf_flags)) |
| : "cc", "memory", "rbx", "rcx", "rdx", "rsi", "rdi", |
| "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"); |
| /* vmlaunch/resume failed. It could be for a few reasons, including |
| * things like launching instead of resuming, not having a VMCS loaded, |
| * failing a host-state area check, etc. Those are kernel problems. |
| * |
| * The user should not be able to trigger these problems. The user |
| * could trigger a problem loading the guest-state area, such as a |
| * non-canonical address for RIP. Those sorts of errors should appear |
| * to be a normal vmexit with some flags set. |
| * |
| * Any failed vmlaunch/resume is likely a kernel bug, but we'll still |
| * reflect it to the user for debuggability. |
| * |
| * Also we should always have a non-shadow VMCS, so ZF should be 1 and |
| * we can read the error register. */ |
| assert(read_flags() & FL_ZF); |
| tf->tf_exit_reason = EXIT_REASON_VMENTER_FAILED; |
| tf->tf_exit_qual = vmcs_read(VM_INSTRUCTION_ERROR); |
| tf->tf_flags |= VMCTX_FL_PARTIAL; |
| warn("vmlaunch/vmresume failed, check userspace's reflected fault"); |
| handle_bad_vm_tf(tf); |
| } |
| |
| void proc_pop_ctx(struct user_context *ctx) |
| { |
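| /* No interrupts from here on out: these paths never return, and once we |
| * start popping we may have the user's gsbase loaded or be partway |
| * through clobbering our own kernel state. */ |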
| disable_irq(); |
| switch (ctx->type) { |
| case ROS_HW_CTX: |
| proc_pop_hwtf(&ctx->tf.hw_tf); |
| break; |
| case ROS_SW_CTX: |
| proc_pop_swtf(&ctx->tf.sw_tf); |
| break; |
| case ROS_VM_CTX: |
| proc_pop_vmtf(&ctx->tf.vm_tf); |
| break; |
| default: |
| /* We should have caught this when securing the ctx */ |
| panic("Unknown context type %d!", ctx->type); |
| } |
| } |
| |
| void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp, |
| uintptr_t stack_top, uintptr_t tls_desc) |
| { |
| struct sw_trapframe *sw_tf = &ctx->tf.sw_tf; |
| |
| /* Zero the entire structure, regardless of type, to prevent potential |
| * disclosure of whatever was in that memory before. */ |
| memset(ctx, 0, sizeof(struct user_context)); |
| ctx->type = ROS_SW_CTX; |
| /* Stack pointers in x86 C functions need to be such that adding or |
| * subtracting 8 will result in 16 byte alignment (AMD64 ABI), which we |
| * call an odd-8-byte alignment. The reason is so that input arguments |
| * (on the stack) are 16 byte aligned. The extra 8 bytes is the |
| * retaddr, pushed on the stack. Compilers know they can subtract 8 to |
| * get 16 byte alignment for instructions like movaps. |
| * |
| * However, the kernel will start contexts at 16 byte aligned stacks. |
| * This is because glibc's _start (in ASM) expects this. Parlib x86's |
| * vcore entry does the same. |
| * |
| * We init contexts for both an elf startup as well as vcore entry. It |
| * is up to the caller (including the user) to make sure the stack is |
| * aligned properly. elf.c doesn't know about these concerns, so if it |
| * messes up, there's nothing we can really do, since the args are just |
| * wrong.  ld will fail immediately though, so we'll find out |
| * quickly. */ |
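| /* Worked example: stack_top is 16-byte aligned, so _start runs with |
| * rsp % 16 == 0. Once _start calls into C, the pushed return address |
| * leaves the callee at rsp % 16 == 8, the odd-8-byte alignment above. */ |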
| sw_tf->tf_rsp = stack_top; |
| sw_tf->tf_rip = entryp; |
| sw_tf->tf_rbp = 0; /* for potential backtraces */ |
| sw_tf->tf_mxcsr = 0x00001f80; /* x86 default mxcsr */ |
| sw_tf->tf_fpucw = 0x037f; /* x86 default FP CW */ |
| /* Coupled closely with user's entry.S. id is the vcoreid, which |
| * entry.S uses to determine what to do. vcoreid == 0 is the main |
| * core/context. */ |
| sw_tf->tf_rbx = vcoreid; |
| sw_tf->tf_fsbase = tls_desc; |
| proc_secure_ctx(ctx); |
| } |
| |
| static void proc_secure_hwtf(struct hw_trapframe *tf) |
| { |
| enforce_user_canon(&tf->tf_gsbase); |
| enforce_user_canon(&tf->tf_fsbase); |
| enforce_user_canon(&tf->tf_rip); |
| enforce_user_canon(&tf->tf_rsp); |
| /* GD_UD is the user data segment selector in the GDT, and |
| * GD_UT is the user text segment selector (see inc/memlayout.h). |
| * The low 2 bits of each segment register contains the |
| * Requestor Privilege Level (RPL); 3 means user mode. */ |
| tf->tf_ss = GD_UD | 3; |
| tf->tf_cs = GD_UT | 3; |
| /* Always 1: interrupts */ |
| tf->tf_rflags |= FL_IF; |
| /* Always 0: IOPL must be set to 0. VM (virtual 8086) probably doesn't |
| * matter - SDM says it can't get modified via iret anyways. VIF and |
| * VIP are also virtual-8086 mode stuff. Supposedly NT is settable by |
| * userspace, but there's no good reason for it. Rather be paranoid. */ |
| tf->tf_rflags &= ~(FL_IOPL_MASK | FL_VM | FL_NT | FL_VIF | FL_VIP); |
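| /* Force the reserved rflags bits to their architectural values: set the |
| * must-be-one bits, clear the must-be-zero bits. */ |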
| tf->tf_rflags |= FL_RSVD_1; |
| tf->tf_rflags &= FL_RSVD_0; |
| x86_hwtf_clear_partial(tf); |
| } |
| |
| static void proc_secure_swtf(struct sw_trapframe *tf) |
| { |
| enforce_user_canon(&tf->tf_gsbase); |
| enforce_user_canon(&tf->tf_fsbase); |
| enforce_user_canon(&tf->tf_rip); |
| enforce_user_canon(&tf->tf_rsp); |
| /* The kernel doesn't actually load the mxcsr or the fpucw, but we can |
| * still sanitize it in case we ever do load it. */ |
| tf->tf_mxcsr &= MXCSR_RSVD_0; |
| x86_swtf_clear_partial(tf); |
| } |
| |
| static void proc_secure_vmtf(struct vm_trapframe *tf) |
| { |
| /* The user can say whatever it wants for the bulk of the TF. If they |
| * mess up something in the guest-area, it'll be treated like a vmexit. |
| * There are a few things in the TF that we use on the kernel side. |
| * |
| * If guest_pcoreid is bad (not a guest_pcore), we'll fail to load the |
| * GPC and reflect the fault to userspace. |
| * |
| * Regarding tf_flags, some are informational for the user, some are |
| * used for our own use in the kernel. |
| * - VMCTX_FL_PARTIAL: We clear this below |
| * - VMCTX_FL_VMRESUME: Used to temporarily carry a bool in pop_vmtf, |
| * but we never trust the value in the VM TF. |
| * These are write-only from the kernel and passed to the user: |
| * - VMCTX_FL_HAS_FAULT |
| * - VMCTX_FL_EPT_VMR_BACKED */ |
| x86_vmtf_clear_partial(tf); |
| } |
| |
| void proc_secure_ctx(struct user_context *ctx) |
| { |
| switch (ctx->type) { |
| case ROS_HW_CTX: |
| proc_secure_hwtf(&ctx->tf.hw_tf); |
| break; |
| case ROS_SW_CTX: |
| proc_secure_swtf(&ctx->tf.sw_tf); |
| break; |
| case ROS_VM_CTX: |
| proc_secure_vmtf(&ctx->tf.vm_tf); |
| break; |
| default: |
| /* If we aren't another ctx type, we're assuming (and forcing) a |
| * HW ctx. If this is somehow fucked up, userspace should die |
| * rather quickly. */ |
| ctx->type = ROS_HW_CTX; |
| proc_secure_hwtf(&ctx->tf.hw_tf); |
| } |
| } |
| |
| /* Called when we are currently running an address space on our core and want to |
| * abandon it. We need a known good pgdir before releasing the old one. We |
| * decref, since current no longer tracks the proc (and current no longer |
| * protects the cr3). */ |
| void __abandon_core(void) |
| { |
| struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; |
| struct proc *old_proc; |
| |
| lcr3(boot_cr3); |
| old_proc = pcpui->cur_proc; |
| pcpui->cur_proc = NULL; |
| proc_decref(old_proc); |
| } |
| |
| void __clear_owning_proc(uint32_t coreid) |
| { |
| vmx_clear_vmcs(); |
| } |