#include <arch/arch.h>
#include <trap.h>
#include <process.h>
#include <pmap.h>
#include <smp.h>
#include <arch/fsgsbase.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>
static void __attribute__((noreturn)) proc_pop_hwtf(struct hw_trapframe *tf)
{
/* For both the HW and SW pop paths, note we pass the asm an offset into
 * the TF, just beyond the fs and gs bases, which are handled here in C. */
if (x86_hwtf_is_partial(tf)) {
swap_gs();
} else {
write_gsbase(tf->tf_gsbase);
write_fsbase(tf->tf_fsbase);
}
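/* For a partial TF, the kernel's GS base is still live and the user's GS
 * base is parked in the IA32_KERNEL_GS_BASE MSR (from the swapgs on kernel
 * entry), so the single swap_gs() above restores it. Full TFs carry both
 * bases, so we wrote them back directly. */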
asm volatile (".globl __asm_pop_hwtf_start;"
"__asm_pop_hwtf_start: "
"movq %0, %%rsp; "
"popq %%rax; "
"popq %%rbx; "
"popq %%rcx; "
"popq %%rdx; "
"popq %%rbp; "
"popq %%rsi; "
"popq %%rdi; "
"popq %%r8; "
"popq %%r9; "
"popq %%r10; "
"popq %%r11; "
"popq %%r12; "
"popq %%r13; "
"popq %%r14; "
"popq %%r15; "
"addq $0x10, %%rsp; "
"iretq; "
".globl __asm_pop_hwtf_end;"
"__asm_pop_hwtf_end: "
: : "g" (&tf->tf_rax) : "memory");
panic("iretq failed");
}
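/* For reference, a sketch of the hw_trapframe tail implied by the asm above
 * (derived from the pop sequence; not a substitute for the real definition
 * in the arch headers):
 *
 *	rax rbx rcx rdx rbp rsi rdi r8-r15   15 popq's, 120 bytes
 *	trapno + err                         16 bytes, skipped via addq $0x10
 *	rip cs rflags rsp ss                 the frame iretq consumes
 */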
static void __attribute__((noreturn)) proc_pop_swtf(struct sw_trapframe *tf)
{
if (x86_swtf_is_partial(tf)) {
swap_gs();
} else {
write_gsbase(tf->tf_gsbase);
write_fsbase(tf->tf_fsbase);
}
/* We need to 0 out any registers that aren't part of the sw_tf and that
* we won't use/clobber on the out-path. While these aren't part of the
* sw_tf, we also don't want to leak any kernel register content. */
asm volatile (".globl __asm_pop_swtf_start;"
"__asm_pop_swtf_start: "
"movq %0, %%rsp; "
"movq $0, %%rax; "
"movq $0, %%rdx; "
"movq $0, %%rsi; "
"movq $0, %%rdi; "
"movq $0, %%r8; "
"movq $0, %%r9; "
"movq $0, %%r10; "
"popq %%rbx; "
"popq %%rbp; "
"popq %%r12; "
"popq %%r13; "
"popq %%r14; "
"popq %%r15; "
"movq %1, %%r11; "
"popq %%rcx; "
"popq %%rsp; "
"rex.w sysret; "
".globl __asm_pop_swtf_end;"
"__asm_pop_swtf_end: "
: : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory");
panic("sysret failed");
}
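/* Note the sysret contract: rip is loaded from rcx and rflags from r11,
 * which is why we popped tf_rip into rcx and stuffed r11 with FL_IF (the
 * user resumes with interrupts enabled). sysret doesn't touch the stack,
 * so popping the user's rsp can safely be the last step. */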
/* If popping a VM TF fails for some reason, we need to reflect it back to
 * the user. It is possible that the reflection fails. We still need to
 * run something, and it's a lousy time to try something else, so we'll
 * give them a TF that will probably fault right away and kill them. */
static void __attribute__((noreturn)) handle_bad_vm_tf(struct vm_trapframe *tf)
{
struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
tf->tf_exit_reason |= VMX_EXIT_REASONS_FAILED_VMENTRY;
tf->tf_flags |= VMCTX_FL_HAS_FAULT;
if (reflect_current_context()) {
printk("[kernel] Unable to reflect after a bad VM enter\n");
proc_init_ctx(pcpui->cur_ctx, 0, 0xcafebabe, 0, 0);
}
proc_pop_ctx(pcpui->cur_ctx);
}
static void __attribute__((noreturn)) proc_pop_vmtf(struct vm_trapframe *tf)
{
struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
struct proc *p = pcpui->cur_proc;
struct guest_pcore *gpc;
if (x86_vmtf_is_partial(tf)) {
gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid);
assert(gpc);
assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid);
assert(gpc->should_vmresume);
} else {
gpc = load_guest_pcore(p, tf->tf_guest_pcoreid);
if (!gpc) {
tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE;
handle_bad_vm_tf(tf);
}
}
vmcs_write(GUEST_RSP, tf->tf_rsp);
vmcs_write(GUEST_CR3, tf->tf_cr3);
vmcs_write(GUEST_RIP, tf->tf_rip);
vmcs_write(GUEST_RFLAGS, tf->tf_rflags);
/* The host stacktop could have changed, even if we are still a partial
* context. Consider a vmcall that blocks. We'll restart the partial
* context, but be on a new stack. set_stack_top() doesn't really know
* about the VMCS. */
vmcs_write(HOST_RSP, pcpui->stacktop);
/* cr2 is not part of the VMCS state; we need to save/restore it
* manually */
lcr2(tf->tf_cr2);
vmcs_write(VM_ENTRY_INTR_INFO_FIELD, tf->tf_trap_inject);
/* Someone may have tried poking the guest and posting an IRQ, but the
* IPI missed (concurrent vmexit). In these cases, the 'outstanding
* notification' bit should still be set, and we can resend the IPI.
 * This will arrive after we vmenter, since IRQs are currently disabled. */
if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpc->posted_irq_desc))
send_self_ipi(I_POKE_GUEST);
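/* (Architecturally, the outstanding-notification bit is bit 256 of the
 * posted-interrupt descriptor, just past its 256-bit request bitmap.) */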
/* The first time a VMCS is started after being loaded, it must be
* launched. Subsequent starts must be resumes. Once the VMCS is
* cleared, we start with a launch again. Note this is the VMCS, not
* the GPC unload. */
if (gpc->should_vmresume) {
tf->tf_flags |= VMCTX_FL_VMRESUME;
} else {
tf->tf_flags &= ~VMCTX_FL_VMRESUME;
gpc->should_vmresume = TRUE;
}
/* vmlaunch/resume can fail, so we need to be able to return from this.
* Thus we can't clobber rsp via the popq style of setting the
* registers. Likewise, we don't want to lose rbp via the clobber list.
*
* Partial contexts have already been launched, so we resume them. */
asm volatile (".globl __asm_pop_vmtf_start;"
"__asm_pop_vmtf_start: "
"testl $"STRINGIFY(VMCTX_FL_VMRESUME)", %c[flags](%0);"
"pushq %%rbp; " /* save in case we fail */
"movq %c[rbx](%0), %%rbx; "
"movq %c[rcx](%0), %%rcx; "
"movq %c[rdx](%0), %%rdx; "
"movq %c[rbp](%0), %%rbp; "
"movq %c[rsi](%0), %%rsi; "
"movq %c[rdi](%0), %%rdi; "
"movq %c[r8](%0), %%r8; "
"movq %c[r9](%0), %%r9; "
"movq %c[r10](%0), %%r10; "
"movq %c[r11](%0), %%r11; "
"movq %c[r12](%0), %%r12; "
"movq %c[r13](%0), %%r13; "
"movq %c[r14](%0), %%r14; "
"movq %c[r15](%0), %%r15; "
"movq %c[rax](%0), %%rax; " /* clobber our *tf last */
"jnz 1f; " /* jump if resume */
ASM_VMX_VMLAUNCH"; " /* non-resume gets launched */
"jmp 2f; "
"1: "ASM_VMX_VMRESUME"; "
"2: popq %%rbp; " /* vmlaunch/resume failed */
".globl __asm_pop_vmtf_end;"
"__asm_pop_vmtf_end: "
:
: "a" (tf),
[rax]"i"(offsetof(struct vm_trapframe, tf_rax)),
[rbx]"i"(offsetof(struct vm_trapframe, tf_rbx)),
[rcx]"i"(offsetof(struct vm_trapframe, tf_rcx)),
[rdx]"i"(offsetof(struct vm_trapframe, tf_rdx)),
[rbp]"i"(offsetof(struct vm_trapframe, tf_rbp)),
[rsi]"i"(offsetof(struct vm_trapframe, tf_rsi)),
[rdi]"i"(offsetof(struct vm_trapframe, tf_rdi)),
[r8]"i"(offsetof(struct vm_trapframe, tf_r8)),
[r9]"i"(offsetof(struct vm_trapframe, tf_r9)),
[r10]"i"(offsetof(struct vm_trapframe, tf_r10)),
[r11]"i"(offsetof(struct vm_trapframe, tf_r11)),
[r12]"i"(offsetof(struct vm_trapframe, tf_r12)),
[r13]"i"(offsetof(struct vm_trapframe, tf_r13)),
[r14]"i"(offsetof(struct vm_trapframe, tf_r14)),
[r15]"i"(offsetof(struct vm_trapframe, tf_r15)),
[flags]"i"(offsetof(struct vm_trapframe, tf_flags))
: "cc", "memory", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
/* vmlaunch/resume failed. It could be for a few reasons, including
* things like launching instead of resuming, not having a VMCS loaded,
* failing a host-state area check, etc. Those are kernel problems.
*
* The user should not be able to trigger these problems. The user
* could trigger a problem loading the guest-state area, such as a
* non-canonical address for RIP. Those sorts of errors should appear
* to be a normal vmexit with some flags set.
*
 * Any failed vmlaunch/resume is likely a kernel bug, but we'll still
 * reflect it to the user for debuggability.
*
* Also we should always have a non-shadow VMCS, so ZF should be 1 and
* we can read the error register. */
assert(read_flags() & FL_ZF);
tf->tf_exit_reason = EXIT_REASON_VMENTER_FAILED;
tf->tf_exit_qual = vmcs_read(VM_INSTRUCTION_ERROR);
tf->tf_flags |= VMCTX_FL_PARTIAL;
warn("vmlaunch/vmresume failed, check userspace's reflected fault");
handle_bad_vm_tf(tf);
}
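/* A hedged aside for decoding tf_exit_qual above: per the SDM's VM
 * instruction error numbers, 4 is "VMLAUNCH with non-clear VMCS", 5 is
 * "VMRESUME with non-launched VMCS", and 7/8 are invalid control /
 * host-state fields - the usual suspects when this path fires. */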
void proc_pop_ctx(struct user_context *ctx)
{
disable_irq();
switch (ctx->type) {
case ROS_HW_CTX:
proc_pop_hwtf(&ctx->tf.hw_tf);
break;
case ROS_SW_CTX:
proc_pop_swtf(&ctx->tf.sw_tf);
break;
case ROS_VM_CTX:
proc_pop_vmtf(&ctx->tf.vm_tf);
break;
default:
/* We should have caught this when securing the ctx */
panic("Unknown context type %d!", ctx->type);
}
}
void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp,
uintptr_t stack_top, uintptr_t tls_desc)
{
struct sw_trapframe *sw_tf = &ctx->tf.sw_tf;
/* Zero the entire structure for any type, to prevent potential disclosure. */
memset(ctx, 0, sizeof(struct user_context));
ctx->type = ROS_SW_CTX;
/* Stack pointers in x86 C functions need to be such that adding or
* subtracting 8 will result in 16 byte alignment (AMD64 ABI), which we
* call an odd-8-byte alignment. The reason is so that input arguments
* (on the stack) are 16 byte aligned. The extra 8 bytes is the
* retaddr, pushed on the stack. Compilers know they can subtract 8 to
* get 16 byte alignment for instructions like movaps.
*
* However, the kernel will start contexts at 16 byte aligned stacks.
* This is because glibc's _start (in ASM) expects this. Parlib x86's
* vcore entry does the same.
*
 * We init contexts for both an elf startup as well as vcore entry. It
 * is up to the caller (including the user) to make sure the stack is
 * aligned properly. elf.c doesn't know about these concerns, so if it
 * messes up, there's nothing we can really do, since the args are just
 * wrong. ld will fail immediately though, so we'll find out quickly. */
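/* Worked example with hypothetical addresses: a call from a 16-byte
 * aligned rsp of 0x...ff0 pushes the retaddr, so the callee starts at the
 * odd-8 value 0x...fe8 and can subtract 8 to get back to 16-byte
 * alignment for movaps. At elf/vcore entry there is no retaddr, so we
 * hand the context a 16-byte aligned rsp directly. */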
sw_tf->tf_rsp = stack_top;
sw_tf->tf_rip = entryp;
sw_tf->tf_rbp = 0; /* for potential backtraces */
sw_tf->tf_mxcsr = 0x00001f80; /* x86 default mxcsr */
sw_tf->tf_fpucw = 0x037f; /* x86 default FP CW */
/* Coupled closely with user's entry.S. id is the vcoreid, which
* entry.S uses to determine what to do. vcoreid == 0 is the main
* core/context. */
sw_tf->tf_rbx = vcoreid;
sw_tf->tf_fsbase = tls_desc;
proc_secure_ctx(ctx);
}
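/* A typical call site (a sketch; the names are illustrative, not the
 * actual callers in the arch-independent proc code):
 *
 *	proc_init_ctx(&vcpd->vcore_ctx, vcoreid, vcore_entry, vcore_stack, 0);
 *
 * which yields an SW ctx that pops via sysret into vcore_entry with rbx
 * holding the vcoreid, per the entry.S coupling noted above. */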
static void proc_secure_hwtf(struct hw_trapframe *tf)
{
enforce_user_canon(&tf->tf_gsbase);
enforce_user_canon(&tf->tf_fsbase);
enforce_user_canon(&tf->tf_rip);
enforce_user_canon(&tf->tf_rsp);
/* GD_UD is the user data segment selector in the GDT, and
* GD_UT is the user text segment selector (see inc/memlayout.h).
 * The low 2 bits of each segment register contain the
 * Requestor Privilege Level (RPL); 3 means user mode. */
tf->tf_ss = GD_UD | 3;
tf->tf_cs = GD_UT | 3;
/* Always 1: interrupts */
tf->tf_rflags |= FL_IF;
/* Always 0: IOPL must be set to 0. VM (virtual 8086) probably doesn't
* matter - SDM says it can't get modified via iret anyways. VIF and
* VIP are also virtual-8086 mode stuff. Supposedly NT is settable by
* userspace, but there's no good reason for it. Rather be paranoid. */
tf->tf_rflags &= ~(FL_IOPL_MASK | FL_VM | FL_NT | FL_VIF | FL_VIP);
tf->tf_rflags |= FL_RSVD_1;
tf->tf_rflags &= FL_RSVD_0;
x86_hwtf_clear_partial(tf);
}
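/* Presumably FL_RSVD_1 / FL_RSVD_0 track the architectural rflags
 * definition: bit 1 is reserved-as-1, while bits 3, 5, 15, and 22 and up
 * are reserved-as-0, so the pair of masks above pins every reserved bit
 * to its required value. */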
static void proc_secure_swtf(struct sw_trapframe *tf)
{
enforce_user_canon(&tf->tf_gsbase);
enforce_user_canon(&tf->tf_fsbase);
enforce_user_canon(&tf->tf_rip);
enforce_user_canon(&tf->tf_rsp);
/* The kernel doesn't actually load the mxcsr or the fpucw, but we can
* still sanitize it in case we ever do load it. */
tf->tf_mxcsr &= MXCSR_RSVD_0;
x86_swtf_clear_partial(tf);
}
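/* The mxcsr masking matters because ldmxcsr raises #GP if any reserved
 * bit (16 and up, plus DAZ where unsupported) is set; a user-chosen
 * mxcsr could otherwise fault in the kernel if we ever load it. */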
static void proc_secure_vmtf(struct vm_trapframe *tf)
{
/* The user can say whatever it wants for the bulk of the TF. If they
* mess up something in the guest-area, it'll be treated like a vmexit.
* There are a few things in the TF that we use on the kernel side.
*
* If guest_pcoreid is bad (not a guest_pcore), we'll fail to load the
* GPC and reflect the fault to userspace.
*
* Regarding tf_flags, some are informational for the user, some are
* used for our own use in the kernel.
* - VMCTX_FL_PARTIAL: We clear this below
* - VMCTX_FL_VMRESUME: Used to temporarily carry a bool in pop_vmtf,
* but we never trust the value in the VM TF.
* These are write-only from the kernel and passed to the user:
* - VMCTX_FL_HAS_FAULT
* - VMCTX_FL_EPT_VMR_BACKED */
x86_vmtf_clear_partial(tf);
}
void proc_secure_ctx(struct user_context *ctx)
{
switch (ctx->type) {
case ROS_HW_CTX:
proc_secure_hwtf(&ctx->tf.hw_tf);
break;
case ROS_SW_CTX:
proc_secure_swtf(&ctx->tf.sw_tf);
break;
case ROS_VM_CTX:
proc_secure_vmtf(&ctx->tf.vm_tf);
break;
default:
/* If we aren't another ctx type, we're assuming (and forcing) a
* HW ctx. If this is somehow fucked up, userspace should die
* rather quickly. */
ctx->type = ROS_HW_CTX;
proc_secure_hwtf(&ctx->tf.hw_tf);
}
}
/* Called when we are currently running an address space on our core and want to
* abandon it. We need a known good pgdir before releasing the old one. We
* decref, since current no longer tracks the proc (and current no longer
* protects the cr3). */
void __abandon_core(void)
{
struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
struct proc *old_proc;
lcr3(boot_cr3);
old_proc = pcpui->cur_proc;
pcpui->cur_proc = NULL;
proc_decref(old_proc);
}
void __clear_owning_proc(uint32_t coreid)
{
vmx_clear_vmcs();
}
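/* Presumably this pairs with the launch/resume logic in proc_pop_vmtf:
 * vmclear'ing the loaded VMCS when a core stops owning the proc is what
 * lets another core load it and (re)start it with a vmlaunch. */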