/* kern/arch/x86/process64.c - upstream - Git at Google */

 #include <arch/arch.h>
 #include <trap.h>
 #include <process.h>
 #include <pmap.h>
 #include <smp.h>
 #include <arch/fsgsbase.h>

 #include <string.h>
 #include <assert.h>
 #include <stdio.h>

 /* Restores a hardware trapframe onto the CPU and leaves via iretq.  Declared
  * noreturn; the trailing panic() is only reached if control falls out of the
  * asm block.
  *
  * tf: hardware trapframe to restore.  For a partial TF only swap_gs() is
  * called; otherwise both the gs and fs bases are written from the TF. */
 static void __attribute__((noreturn)) proc_pop_hwtf(struct hw_trapframe *tf)
 {
 	/* for both HW and SW, note we pass an offset into the TF, beyond the fs
 	 * and gs bases */
 	if (x86_hwtf_is_partial(tf)) {
 		swap_gs();
 	} else {
 		write_gsbase(tf->tf_gsbase);
 		write_fsbase(tf->tf_fsbase);
 	}
 	/* rsp is pointed at &tf->tf_rax and the fifteen general purpose
 	 * registers are popped in the order rax, rbx, rcx, rdx, rbp, rsi, rdi,
 	 * r8..r15.  The next 0x10 bytes are skipped (presumably the trap number
 	 * and error code slots - TODO confirm against struct hw_trapframe)
 	 * before iretq consumes the rest of the frame.  The start/end labels
 	 * are exported as global symbols. */
 	asm volatile (".globl __asm_pop_hwtf_start;"
 	              "__asm_pop_hwtf_start:    "
 	              "movq %0, %%rsp;          "
 	              "popq %%rax;              "
 	              "popq %%rbx;              "
 	              "popq %%rcx;              "
 	              "popq %%rdx;              "
 	              "popq %%rbp;              "
 	              "popq %%rsi;              "
 	              "popq %%rdi;              "
 	              "popq %%r8;               "
 	              "popq %%r9;               "
 	              "popq %%r10;              "
 	              "popq %%r11;              "
 	              "popq %%r12;              "
 	              "popq %%r13;              "
 	              "popq %%r14;              "
 	              "popq %%r15;              "
 	              "addq $0x10, %%rsp;       "
 	              "iretq;                   "
 	              ".globl __asm_pop_hwtf_end;"
 	              "__asm_pop_hwtf_end:      "
 	              : : "g" (&tf->tf_rax) : "memory");
 	panic("iretq failed");
 }

 /* Restores a software trapframe onto the CPU and leaves via sysret.  Declared
  * noreturn; the trailing panic() is only reached if control falls out of the
  * asm block.
  *
  * tf: software trapframe to restore.  For a partial TF only swap_gs() is
  * called; otherwise both the gs and fs bases are written from the TF. */
 static void __attribute__((noreturn)) proc_pop_swtf(struct sw_trapframe *tf)
 {
 	if (x86_swtf_is_partial(tf)) {
 		swap_gs();
 	} else {
 		write_gsbase(tf->tf_gsbase);
 		write_fsbase(tf->tf_fsbase);
 	}
 	/* We need to 0 out any registers that aren't part of the sw_tf and that
 	 * we won't use/clobber on the out-path.  While these aren't part of the
 	 * sw_tf, we also don't want to leak any kernel register content.
 	 *
 	 * rsp is pointed at &tf->tf_rbx; rax, rdx, rsi, rdi, r8, r9 and r10 are
 	 * zeroed, then rbx, rbp and r12..r15 are popped.  r11 is loaded with
 	 * the constant FL_IF and the last two slots are popped into rcx and
 	 * rsp.  Per the x86-64 SYSRET definition, sysret resumes at the address
 	 * in rcx with rflags taken from r11, so those two slots are presumably
 	 * tf_rip and tf_rsp - TODO confirm against struct sw_trapframe. */
 	asm volatile (".globl __asm_pop_swtf_start;"
 	              "__asm_pop_swtf_start:    "
 	              "movq %0, %%rsp;          "
 	              "movq $0, %%rax;          "
 	              "movq $0, %%rdx;          "
 	              "movq $0, %%rsi;          "
 	              "movq $0, %%rdi;          "
 	              "movq $0, %%r8;           "
 	              "movq $0, %%r9;           "
 	              "movq $0, %%r10;          "
 	              "popq %%rbx;              "
 	              "popq %%rbp;              "
 	              "popq %%r12;              "
 	              "popq %%r13;              "
 	              "popq %%r14;              "
 	              "popq %%r15;              "
 	              "movq %1, %%r11;          "
 	              "popq %%rcx;              "
 	              "popq %%rsp;              "
 	              "rex.w sysret;            "
 	              ".globl __asm_pop_swtf_end;"
 	              "__asm_pop_swtf_end:      "
 	              : : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory");
 	panic("sysret failed");
 }

 /* Fallback for a VM trapframe that could not be entered.
  *
  * The TF is tagged as a failed vmentry that carries a fault, and we try to
  * reflect the current context back to userspace.  Reflection itself can fail;
  * we still have to run something and this is a poor moment to get creative,
  * so in that case the core's current context is re-initialized with a bogus
  * entry point that will probably fault immediately and kill the process.
  * Either way we finish by popping the core's current context. */
 static void __attribute__((noreturn)) handle_bad_vm_tf(struct vm_trapframe *tf)
 {
 	struct per_cpu_info *this_cpu = &per_cpu_info[core_id()];

 	tf->tf_flags |= VMCTX_FL_HAS_FAULT;
 	tf->tf_exit_reason |= VMX_EXIT_REASONS_FAILED_VMENTRY;

 	if (reflect_current_context() != 0) {
 		printk("[kernel] Unable to reflect after a bad VM enter\n");
 		proc_init_ctx(this_cpu->cur_ctx, 0, 0xcafebabe, 0, 0);
 	}

 	proc_pop_ctx(this_cpu->cur_ctx);
 }

 /* Loads a VM trapframe into the VMCS and the guest registers, then enters the
  * guest with vmlaunch or vmresume.  Declared noreturn: if the vmentry
  * instruction falls through, the error is recorded in the TF and handed to
  * handle_bad_vm_tf().
  *
  * tf: VM trapframe to run.  A partial TF must belong to the guest pcore
  * already tracked on this core (asserted below).  For a full TF the guest
  * pcore is loaded here, and a failed load is reported through the TF as
  * EXIT_REASON_GUEST_IN_USE. */
 static void __attribute__((noreturn)) proc_pop_vmtf(struct vm_trapframe *tf)
 {
 	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 	struct proc *p = pcpui->cur_proc;
 	struct guest_pcore *gpc;
 	/* Copy of tf that lives in rax for the duration of the asm below.  The
 	 * asm overwrites rax with the guest's rax, and GCC requires input-only
 	 * operands to be left unmodified.  Passing the register as a read-write
 	 * operand through this scratch variable keeps the compiler from
 	 * assuming rax still holds tf on the failure path. */
 	struct vm_trapframe *tf_scratch = tf;

 	if (x86_vmtf_is_partial(tf)) {
 		gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid);
 		assert(gpc);
 		assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid);
 		assert(gpc->should_vmresume);
 	} else {
 		gpc = load_guest_pcore(p, tf->tf_guest_pcoreid);
 		if (!gpc) {
 			tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE;
 			/* noreturn: gpc is never dereferenced while NULL */
 			handle_bad_vm_tf(tf);
 		}
 	}
 	vmcs_write(GUEST_RSP, tf->tf_rsp);
 	vmcs_write(GUEST_CR3, tf->tf_cr3);
 	vmcs_write(GUEST_RIP, tf->tf_rip);
 	vmcs_write(GUEST_RFLAGS, tf->tf_rflags);
 	/* The host stacktop could have changed, even if we are still a partial
 	 * context.  Consider a vmcall that blocks.  We'll restart the partial
 	 * context, but be on a new stack.  set_stack_top() doesn't really know
 	 * about the VMCS. */
 	vmcs_write(HOST_RSP, pcpui->stacktop);
 	/* cr2 is not part of the VMCS state; we need to save/restore it
 	 * manually */
 	lcr2(tf->tf_cr2);
 	vmcs_write(VM_ENTRY_INTR_INFO_FIELD, tf->tf_trap_inject);
 	/* Someone may have tried poking the guest and posting an IRQ, but the
 	 * IPI missed (concurrent vmexit).  In these cases, the 'outstanding
 	 * notification' bit should still be set, and we can resend the IPI.
 	 * This will arrive after we vmenter, since IRQs are currently disabled.
 	 * */
 	if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpc->posted_irq_desc))
 		send_self_ipi(I_POKE_GUEST);
 	/* The first time a VMCS is started after being loaded, it must be
 	 * launched.  Subsequent starts must be resumes.  Once the VMCS is
 	 * cleared, we start with a launch again.  Note this is the VMCS, not
 	 * the GPC unload. */
 	if (gpc->should_vmresume) {
 		tf->tf_flags |= VMCTX_FL_VMRESUME;
 	} else {
 		tf->tf_flags &= ~VMCTX_FL_VMRESUME;
 		gpc->should_vmresume = TRUE;
 	}
 	/* vmlaunch/resume can fail, so we need to be able to return from this.
 	 * Thus we can't clobber rsp via the popq style of setting the
 	 * registers.  Likewise, we don't want to lose rbp via the clobber list.
 	 *
 	 * Partial contexts have already been launched, so we resume them.
 	 *
 	 * The testl sets ZF from the VMRESUME flag up front; the pushq and the
 	 * movq loads that follow do not modify rflags, so the jnz after the
 	 * last load still sees that result. */
 	asm volatile (".globl __asm_pop_vmtf_start;"
 	              "__asm_pop_vmtf_start:     "
 	              "testl $"STRINGIFY(VMCTX_FL_VMRESUME)", %c[flags](%0);"
 	              "pushq %%rbp;              " /* save in case we fail */
 	              "movq %c[rbx](%0), %%rbx;  "
 	              "movq %c[rcx](%0), %%rcx;  "
 	              "movq %c[rdx](%0), %%rdx;  "
 	              "movq %c[rbp](%0), %%rbp;  "
 	              "movq %c[rsi](%0), %%rsi;  "
 	              "movq %c[rdi](%0), %%rdi;  "
 	              "movq %c[r8](%0),  %%r8;   "
 	              "movq %c[r9](%0),  %%r9;   "
 	              "movq %c[r10](%0), %%r10;  "
 	              "movq %c[r11](%0), %%r11;  "
 	              "movq %c[r12](%0), %%r12;  "
 	              "movq %c[r13](%0), %%r13;  "
 	              "movq %c[r14](%0), %%r14;  "
 	              "movq %c[r15](%0), %%r15;  "
 	              "movq %c[rax](%0), %%rax;  " /* clobber our *tf last */
 	              "jnz 1f;                   " /* jump if resume */
 	              ASM_VMX_VMLAUNCH";         " /* non-resume gets launched*/
 	              "jmp 2f;                   "
 	              "1: "ASM_VMX_VMRESUME";    "
 	              "2: popq %%rbp;            " /* vmlaunch failed */
 	              ".globl __asm_pop_vmtf_end;"
 	              "__asm_pop_vmtf_end:       "
 	              : "+a" (tf_scratch)
 	              : [rax]"i"(offsetof(struct vm_trapframe, tf_rax)),
 	                [rbx]"i"(offsetof(struct vm_trapframe, tf_rbx)),
 	                [rcx]"i"(offsetof(struct vm_trapframe, tf_rcx)),
 	                [rdx]"i"(offsetof(struct vm_trapframe, tf_rdx)),
 	                [rbp]"i"(offsetof(struct vm_trapframe, tf_rbp)),
 	                [rsi]"i"(offsetof(struct vm_trapframe, tf_rsi)),
 	                [rdi]"i"(offsetof(struct vm_trapframe, tf_rdi)),
 	                 [r8]"i"(offsetof(struct vm_trapframe, tf_r8)),
 	                 [r9]"i"(offsetof(struct vm_trapframe, tf_r9)),
 	                [r10]"i"(offsetof(struct vm_trapframe, tf_r10)),
 	                [r11]"i"(offsetof(struct vm_trapframe, tf_r11)),
 	                [r12]"i"(offsetof(struct vm_trapframe, tf_r12)),
 	                [r13]"i"(offsetof(struct vm_trapframe, tf_r13)),
 	                [r14]"i"(offsetof(struct vm_trapframe, tf_r14)),
 	                [r15]"i"(offsetof(struct vm_trapframe, tf_r15)),
 	                [flags]"i"(offsetof(struct vm_trapframe, tf_flags))
 	              : "cc", "memory", "rbx", "rcx", "rdx", "rsi", "rdi",
 	                "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
 	/* vmlaunch/resume failed.  It could be for a few reasons, including
 	 * things like launching instead of resuming, not having a VMCS loaded,
 	 * failing a host-state area check, etc.  Those are kernel problems.
 	 *
 	 * The user should not be able to trigger these problems.  The user
 	 * could trigger a problem loading the guest-state area, such as a
 	 * non-canonical address for RIP.  Those sorts of errors should appear
 	 * to be a normal vmexit with some flags set.
 	 *
 	 * Any failed vmlaunch/resume is likely a kernel bug, but we'll still
 	 * reflect it to the user for debugability.
 	 *
 	 * Also we should always have a non-shadow VMCS, so ZF should be 1 and
 	 * we can read the error register. */
 	assert(read_flags() & FL_ZF);
 	tf->tf_exit_reason = EXIT_REASON_VMENTER_FAILED;
 	tf->tf_exit_qual = vmcs_read(VM_INSTRUCTION_ERROR);
 	tf->tf_flags |= VMCTX_FL_PARTIAL;
 	warn("vmlaunch / vmresume failed!, check userspace's reflected fault");
 	handle_bad_vm_tf(tf);
 }

 /* Restores a user context on this core, dispatching on ctx->type to the
  * matching proc_pop_*tf() helper.  IRQs are disabled before the dispatch.  The
  * helpers are declared noreturn; an unrecognized type panics. */
 void proc_pop_ctx(struct user_context *ctx)
 {
 	disable_irq();
 	if (ctx->type == ROS_HW_CTX) {
 		proc_pop_hwtf(&ctx->tf.hw_tf);
 	} else if (ctx->type == ROS_SW_CTX) {
 		proc_pop_swtf(&ctx->tf.sw_tf);
 	} else if (ctx->type == ROS_VM_CTX) {
 		proc_pop_vmtf(&ctx->tf.vm_tf);
 	} else {
 		/* Securing the ctx should have already weeded this out */
 		panic("Unknown context type %d!", ctx->type);
 	}
 }

 /* Builds a fresh software context in *ctx.
  *
  * ctx:       context to initialize; fully zeroed first so nothing stale leaks
  * vcoreid:   placed in rbx; the user's entry.S keys off it (0 is the main
  *            core/context)
  * entryp:    initial rip
  * stack_top: initial rsp
  * tls_desc:  initial fs base
  *
  * A note on stack alignment: the AMD64 ABI has C functions entered with rsp at
  * an "odd-8-byte" alignment (16 byte aligned once 8 is added or subtracted),
  * because the pushed retaddr sits on top of 16 byte aligned stack arguments;
  * compilers rely on that for instructions like movaps.  The kernel, however,
  * starts contexts on 16 byte aligned stacks, since that is what glibc's _start
  * (in ASM) and Parlib x86's vcore entry expect.  This function serves both elf
  * startup and vcore entry and does no alignment itself: the caller (including
  * the user) must pass a properly aligned stack_top.  elf.c is unaware of these
  * concerns; if it gets it wrong the args are simply wrong, and ld will fail
  * immediately so we will find out quickly. */
 void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp,
                    uintptr_t stack_top, uintptr_t tls_desc)
 {
 	struct sw_trapframe *sw_tf;

 	memset(ctx, 0, sizeof(*ctx));
 	ctx->type = ROS_SW_CTX;

 	sw_tf = &ctx->tf.sw_tf;
 	sw_tf->tf_rip = entryp;
 	sw_tf->tf_rsp = stack_top;
 	sw_tf->tf_rbp = 0;		/* for potential backtraces */
 	sw_tf->tf_rbx = vcoreid;	/* read by the user's entry.S */
 	sw_tf->tf_fsbase = tls_desc;
 	sw_tf->tf_fpucw = 0x037f;	/* x86 default FP CW */
 	sw_tf->tf_mxcsr = 0x00001f80;	/* x86 default mxcsr */

 	proc_secure_ctx(ctx);
 }

 /* Sanitizes a hardware trapframe before it is trusted: the fs/gs bases, rip
  * and rsp go through enforce_user_canon(), cs/ss are forced to the user
  * selectors at RPL 3, rflags is filtered, and the partial marker is cleared. */
 static void proc_secure_hwtf(struct hw_trapframe *tf)
 {
 	enforce_user_canon(&tf->tf_gsbase);
 	enforce_user_canon(&tf->tf_fsbase);
 	enforce_user_canon(&tf->tf_rip);
 	enforce_user_canon(&tf->tf_rsp);

 	/* GD_UT / GD_UD are the user text / data selectors in the GDT (see
 	 * inc/memlayout.h).  The low 2 bits of a selector are the Requestor
 	 * Privilege Level (RPL); 3 means user mode. */
 	tf->tf_cs = GD_UT | 3;
 	tf->tf_ss = GD_UD | 3;

 	/* rflags, applied left to right:
 	 * - FL_IF is always set (interrupts on).
 	 * - IOPL must be 0.  VM (virtual 8086) probably doesn't matter, as the
 	 *   SDM says iret can't modify it anyway; VIF and VIP are virtual-8086
 	 *   material too.  NT is supposedly user-settable, but there is no
 	 *   good reason for it, so be paranoid and clear them all.
 	 * - FL_RSVD_1 bits are forced on, then everything outside FL_RSVD_0 is
 	 *   masked off. */
 	tf->tf_rflags = (((tf->tf_rflags | FL_IF)
 	                  & ~(FL_IOPL_MASK | FL_VM | FL_NT | FL_VIF | FL_VIP))
 	                 | FL_RSVD_1)
 	                & FL_RSVD_0;

 	x86_hwtf_clear_partial(tf);
 }

 /* Sanitizes a software trapframe: the fs/gs bases, rip and rsp go through
  * enforce_user_canon(), mxcsr is masked, and the partial marker is cleared. */
 static void proc_secure_swtf(struct sw_trapframe *tf)
 {
 	enforce_user_canon(&tf->tf_gsbase);
 	enforce_user_canon(&tf->tf_fsbase);
 	enforce_user_canon(&tf->tf_rip);
 	enforce_user_canon(&tf->tf_rsp);

 	/* Neither the mxcsr nor the fpucw is actually loaded by the kernel,
 	 * but masking mxcsr now keeps it sane should we ever start loading it.
 	 */
 	tf->tf_mxcsr = tf->tf_mxcsr & MXCSR_RSVD_0;

 	x86_swtf_clear_partial(tf);
 }

 /* Sanitizes a VM trapframe.  The only action taken is clearing the partial
  * marker.
  *
  * Most of the TF is the user's business: a mistake in the guest-area is simply
  * treated like a vmexit.  Only a few fields are consumed on the kernel side:
  *
  * - guest_pcoreid: if it is bad (not a guest_pcore), loading the GPC fails and
  *   the fault is reflected to userspace.
  * - tf_flags: a mix of bits that inform the user and bits the kernel uses:
  *     VMCTX_FL_PARTIAL:        cleared here
  *     VMCTX_FL_VMRESUME:       temporarily carries a bool inside pop_vmtf;
  *                              the value found in the VM TF is never trusted
  *     VMCTX_FL_HAS_FAULT,
  *     VMCTX_FL_EPT_VMR_BACKED: write-only from the kernel, passed to the user
  */
 static void proc_secure_vmtf(struct vm_trapframe *tf)
 {
 	x86_vmtf_clear_partial(tf);
 }

 /* Sanitizes a user context according to its type.  SW and VM contexts go to
  * their own helpers; everything else is secured as a HW context. */
 void proc_secure_ctx(struct user_context *ctx)
 {
 	switch (ctx->type) {
 	case ROS_SW_CTX:
 		proc_secure_swtf(&ctx->tf.sw_tf);
 		return;
 	case ROS_VM_CTX:
 		proc_secure_vmtf(&ctx->tf.vm_tf);
 		return;
 	case ROS_HW_CTX:
 		break;
 	default:
 		/* Any unrecognized type is assumed to be, and forced to be, a
 		 * HW ctx.  If that guess is wrong, userspace should die rather
 		 * quickly. */
 		ctx->type = ROS_HW_CTX;
 		break;
 	}
 	proc_secure_hwtf(&ctx->tf.hw_tf);
 }

 /* Drops the address space this core is currently running.
  *
  * A known good pgdir (boot_cr3) is loaded before the old one is released.
  * The core's cur_proc is then cleared and the proc is decref'd, because
  * current no longer tracks it (and so no longer protects its cr3). */
 void __abandon_core(void)
 {
 	struct per_cpu_info *this_cpu = &per_cpu_info[core_id()];
 	struct proc *departing;

 	lcr3(boot_cr3);

 	departing = this_cpu->cur_proc;
 	this_cpu->cur_proc = NULL;
 	proc_decref(departing);
 }

 /* Arch hook for clearing a core's owning proc; on x86 it only calls
  * vmx_clear_vmcs().  coreid is not used by this implementation. */
 void __clear_owning_proc(uint32_t coreid)
 {
 	(void)coreid;
 	vmx_clear_vmcs();
 }
	#include <arch/arch.h>
	#include <trap.h>
	#include <process.h>
	#include <pmap.h>
	#include <smp.h>
	#include <arch/fsgsbase.h>

	#include <string.h>
	#include <assert.h>
	#include <stdio.h>

	/* Restores a hardware trapframe onto the CPU and leaves via iretq.  Declared
	 * noreturn; the trailing panic() is only reached if control falls out of the
	 * asm block.
	 *
	 * tf: hardware trapframe to restore.  For a partial TF only swap_gs() is
	 * called; otherwise both the gs and fs bases are written from the TF.
	 *
	 * The asm points rsp at &tf->tf_rax, pops rax, rbx, rcx, rdx, rbp, rsi, rdi
	 * and r8..r15 in that order, skips 0x10 bytes (presumably the trap number and
	 * error code slots - TODO confirm against struct hw_trapframe) and executes
	 * iretq. */
	static void __attribute__((noreturn)) proc_pop_hwtf(struct hw_trapframe *tf)
	{
	/* for both HW and SW, note we pass an offset into the TF, beyond the fs
	* and gs bases */
	if (x86_hwtf_is_partial(tf)) {
	swap_gs();
	} else {
	write_gsbase(tf->tf_gsbase);
	write_fsbase(tf->tf_fsbase);
	}
	asm volatile (".globl __asm_pop_hwtf_start;"
	"__asm_pop_hwtf_start: "
	"movq %0, %%rsp; "
	"popq %%rax; "
	"popq %%rbx; "
	"popq %%rcx; "
	"popq %%rdx; "
	"popq %%rbp; "
	"popq %%rsi; "
	"popq %%rdi; "
	"popq %%r8; "
	"popq %%r9; "
	"popq %%r10; "
	"popq %%r11; "
	"popq %%r12; "
	"popq %%r13; "
	"popq %%r14; "
	"popq %%r15; "
	"addq $0x10, %%rsp; "
	"iretq; "
	".globl __asm_pop_hwtf_end;"
	"__asm_pop_hwtf_end: "
	: : "g" (&tf->tf_rax) : "memory");
	panic("iretq failed");
	}

	/* Restores a software trapframe onto the CPU and leaves via sysret.  Declared
	 * noreturn; the trailing panic() is only reached if control falls out of the
	 * asm block.
	 *
	 * tf: software trapframe to restore.  For a partial TF only swap_gs() is
	 * called; otherwise both the gs and fs bases are written from the TF.
	 *
	 * The asm points rsp at &tf->tf_rbx, zeroes rax, rdx, rsi, rdi, r8, r9 and
	 * r10, pops rbx, rbp and r12..r15, loads r11 with the constant FL_IF, and
	 * pops the last two slots into rcx and rsp.  Per the x86-64 SYSRET
	 * definition, sysret resumes at the address in rcx with rflags taken from
	 * r11, so those slots are presumably tf_rip and tf_rsp - TODO confirm
	 * against struct sw_trapframe. */
	static void __attribute__((noreturn)) proc_pop_swtf(struct sw_trapframe *tf)
	{
	if (x86_swtf_is_partial(tf)) {
	swap_gs();
	} else {
	write_gsbase(tf->tf_gsbase);
	write_fsbase(tf->tf_fsbase);
	}
	/* We need to 0 out any registers that aren't part of the sw_tf and that
	* we won't use/clobber on the out-path. While these aren't part of the
	* sw_tf, we also don't want to leak any kernel register content. */
	asm volatile (".globl __asm_pop_swtf_start;"
	"__asm_pop_swtf_start: "
	"movq %0, %%rsp; "
	"movq $0, %%rax; "
	"movq $0, %%rdx; "
	"movq $0, %%rsi; "
	"movq $0, %%rdi; "
	"movq $0, %%r8; "
	"movq $0, %%r9; "
	"movq $0, %%r10; "
	"popq %%rbx; "
	"popq %%rbp; "
	"popq %%r12; "
	"popq %%r13; "
	"popq %%r14; "
	"popq %%r15; "
	"movq %1, %%r11; "
	"popq %%rcx; "
	"popq %%rsp; "
	"rex.w sysret; "
	".globl __asm_pop_swtf_end;"
	"__asm_pop_swtf_end: "
	: : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory");
	panic("sysret failed");
	}

	/* If popping a VM TF fails for some reason, we need to reflect it back to the
	 * user.  It is possible that the reflection fails.  We still need to run
	 * something, and it's a lousy time to try something else.  So we'll give them
	 * a TF that will probably fault right away and kill them.
	 *
	 * tf: the VM trapframe that could not be entered.  It is tagged with
	 * VMX_EXIT_REASONS_FAILED_VMENTRY and VMCTX_FL_HAS_FAULT before the
	 * reflection is attempted.  Declared noreturn: it always ends by popping the
	 * core's current context. */
	static void __attribute__((noreturn)) handle_bad_vm_tf(struct vm_trapframe *tf)
	{
		struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

		tf->tf_exit_reason |= VMX_EXIT_REASONS_FAILED_VMENTRY;
		tf->tf_flags |= VMCTX_FL_HAS_FAULT;
		if (reflect_current_context()) {
			printk("[kernel] Unable to reflect after a bad VM enter\n");
			/* bogus entry point on purpose */
			proc_init_ctx(pcpui->cur_ctx, 0, 0xcafebabe, 0, 0);
		}
		proc_pop_ctx(pcpui->cur_ctx);
	}

	/* Loads a VM trapframe into the VMCS and the guest registers, then enters the
	 * guest with vmlaunch or vmresume.  Declared noreturn: if the vmentry
	 * instruction falls through, the error is recorded in the TF and handed to
	 * handle_bad_vm_tf().
	 *
	 * tf: VM trapframe to run.  A partial TF must belong to the guest pcore
	 * already tracked on this core (asserted below).  For a full TF the guest
	 * pcore is loaded here, and a failed load is reported through the TF as
	 * EXIT_REASON_GUEST_IN_USE. */
	static void __attribute__((noreturn)) proc_pop_vmtf(struct vm_trapframe *tf)
	{
		struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
		struct proc *p = pcpui->cur_proc;
		struct guest_pcore *gpc;
		/* Copy of tf that lives in rax for the duration of the asm below.
		 * The asm overwrites rax with the guest's rax, and GCC requires
		 * input-only operands to be left unmodified.  Passing the register
		 * as a read-write operand through this scratch variable keeps the
		 * compiler from assuming rax still holds tf on the failure path. */
		struct vm_trapframe *tf_scratch = tf;

		if (x86_vmtf_is_partial(tf)) {
			gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid);
			assert(gpc);
			assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid);
			assert(gpc->should_vmresume);
		} else {
			gpc = load_guest_pcore(p, tf->tf_guest_pcoreid);
			if (!gpc) {
				tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE;
				/* noreturn: gpc is never dereferenced while NULL */
				handle_bad_vm_tf(tf);
			}
		}
		vmcs_write(GUEST_RSP, tf->tf_rsp);
		vmcs_write(GUEST_CR3, tf->tf_cr3);
		vmcs_write(GUEST_RIP, tf->tf_rip);
		vmcs_write(GUEST_RFLAGS, tf->tf_rflags);
		/* The host stacktop could have changed, even if we are still a
		 * partial context.  Consider a vmcall that blocks.  We'll restart the
		 * partial context, but be on a new stack.  set_stack_top() doesn't
		 * really know about the VMCS. */
		vmcs_write(HOST_RSP, pcpui->stacktop);
		/* cr2 is not part of the VMCS state; we need to save/restore it
		 * manually */
		lcr2(tf->tf_cr2);
		vmcs_write(VM_ENTRY_INTR_INFO_FIELD, tf->tf_trap_inject);
		/* Someone may have tried poking the guest and posting an IRQ, but the
		 * IPI missed (concurrent vmexit).  In these cases, the 'outstanding
		 * notification' bit should still be set, and we can resend the IPI.
		 * This will arrive after we vmenter, since IRQs are currently
		 * disabled. */
		if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpc->posted_irq_desc))
			send_self_ipi(I_POKE_GUEST);
		/* The first time a VMCS is started after being loaded, it must be
		 * launched.  Subsequent starts must be resumes.  Once the VMCS is
		 * cleared, we start with a launch again.  Note this is the VMCS, not
		 * the GPC unload. */
		if (gpc->should_vmresume) {
			tf->tf_flags |= VMCTX_FL_VMRESUME;
		} else {
			tf->tf_flags &= ~VMCTX_FL_VMRESUME;
			gpc->should_vmresume = TRUE;
		}
		/* vmlaunch/resume can fail, so we need to be able to return from
		 * this.  Thus we can't clobber rsp via the popq style of setting the
		 * registers.  Likewise, we don't want to lose rbp via the clobber
		 * list.
		 *
		 * Partial contexts have already been launched, so we resume them.
		 *
		 * The testl sets ZF from the VMRESUME flag up front; the pushq and
		 * the movq loads that follow do not modify rflags, so the jnz after
		 * the last load still sees that result. */
		asm volatile (".globl __asm_pop_vmtf_start;"
		              "__asm_pop_vmtf_start: "
		              "testl $"STRINGIFY(VMCTX_FL_VMRESUME)", %c[flags](%0);"
		              "pushq %%rbp; " /* save in case we fail */
		              "movq %c[rbx](%0), %%rbx; "
		              "movq %c[rcx](%0), %%rcx; "
		              "movq %c[rdx](%0), %%rdx; "
		              "movq %c[rbp](%0), %%rbp; "
		              "movq %c[rsi](%0), %%rsi; "
		              "movq %c[rdi](%0), %%rdi; "
		              "movq %c[r8](%0), %%r8; "
		              "movq %c[r9](%0), %%r9; "
		              "movq %c[r10](%0), %%r10; "
		              "movq %c[r11](%0), %%r11; "
		              "movq %c[r12](%0), %%r12; "
		              "movq %c[r13](%0), %%r13; "
		              "movq %c[r14](%0), %%r14; "
		              "movq %c[r15](%0), %%r15; "
		              "movq %c[rax](%0), %%rax; " /* clobber our *tf last */
		              "jnz 1f; " /* jump if resume */
		              ASM_VMX_VMLAUNCH"; " /* non-resume gets launched*/
		              "jmp 2f; "
		              "1: "ASM_VMX_VMRESUME"; "
		              "2: popq %%rbp; " /* vmlaunch failed */
		              ".globl __asm_pop_vmtf_end;"
		              "__asm_pop_vmtf_end: "
		              : "+a" (tf_scratch)
		              : [rax]"i"(offsetof(struct vm_trapframe, tf_rax)),
		                [rbx]"i"(offsetof(struct vm_trapframe, tf_rbx)),
		                [rcx]"i"(offsetof(struct vm_trapframe, tf_rcx)),
		                [rdx]"i"(offsetof(struct vm_trapframe, tf_rdx)),
		                [rbp]"i"(offsetof(struct vm_trapframe, tf_rbp)),
		                [rsi]"i"(offsetof(struct vm_trapframe, tf_rsi)),
		                [rdi]"i"(offsetof(struct vm_trapframe, tf_rdi)),
		                [r8]"i"(offsetof(struct vm_trapframe, tf_r8)),
		                [r9]"i"(offsetof(struct vm_trapframe, tf_r9)),
		                [r10]"i"(offsetof(struct vm_trapframe, tf_r10)),
		                [r11]"i"(offsetof(struct vm_trapframe, tf_r11)),
		                [r12]"i"(offsetof(struct vm_trapframe, tf_r12)),
		                [r13]"i"(offsetof(struct vm_trapframe, tf_r13)),
		                [r14]"i"(offsetof(struct vm_trapframe, tf_r14)),
		                [r15]"i"(offsetof(struct vm_trapframe, tf_r15)),
		                [flags]"i"(offsetof(struct vm_trapframe, tf_flags))
		              : "cc", "memory", "rbx", "rcx", "rdx", "rsi", "rdi",
		                "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
		/* vmlaunch/resume failed.  It could be for a few reasons, including
		 * things like launching instead of resuming, not having a VMCS
		 * loaded, failing a host-state area check, etc.  Those are kernel
		 * problems.
		 *
		 * The user should not be able to trigger these problems.  The user
		 * could trigger a problem loading the guest-state area, such as a
		 * non-canonical address for RIP.  Those sorts of errors should appear
		 * to be a normal vmexit with some flags set.
		 *
		 * Any failed vmlaunch/resume is likely a kernel bug, but we'll still
		 * reflect it to the user for debugability.
		 *
		 * Also we should always have a non-shadow VMCS, so ZF should be 1 and
		 * we can read the error register. */
		assert(read_flags() & FL_ZF);
		tf->tf_exit_reason = EXIT_REASON_VMENTER_FAILED;
		tf->tf_exit_qual = vmcs_read(VM_INSTRUCTION_ERROR);
		tf->tf_flags |= VMCTX_FL_PARTIAL;
		warn("vmlaunch / vmresume failed!, check userspace's reflected fault");
		handle_bad_vm_tf(tf);
	}

	/* Restores a user context on this core, dispatching on ctx->type to the
	 * matching proc_pop_*tf() helper.  IRQs are disabled before the dispatch.  The
	 * helpers are declared noreturn; an unrecognized type panics. */
	void proc_pop_ctx(struct user_context *ctx)
	{
		disable_irq();
		if (ctx->type == ROS_HW_CTX) {
			proc_pop_hwtf(&ctx->tf.hw_tf);
		} else if (ctx->type == ROS_SW_CTX) {
			proc_pop_swtf(&ctx->tf.sw_tf);
		} else if (ctx->type == ROS_VM_CTX) {
			proc_pop_vmtf(&ctx->tf.vm_tf);
		} else {
			/* Securing the ctx should have already weeded this out */
			panic("Unknown context type %d!", ctx->type);
		}
	}

	/* Builds a fresh software context in *ctx.
	 *
	 * ctx:       context to initialize; fully zeroed first so nothing stale leaks
	 * vcoreid:   placed in rbx; the user's entry.S keys off it (0 is the main
	 *            core/context)
	 * entryp:    initial rip
	 * stack_top: initial rsp
	 * tls_desc:  initial fs base
	 *
	 * A note on stack alignment: the AMD64 ABI has C functions entered with rsp at
	 * an "odd-8-byte" alignment (16 byte aligned once 8 is added or subtracted),
	 * because the pushed retaddr sits on top of 16 byte aligned stack arguments;
	 * compilers rely on that for instructions like movaps.  The kernel, however,
	 * starts contexts on 16 byte aligned stacks, since that is what glibc's _start
	 * (in ASM) and Parlib x86's vcore entry expect.  This function serves both elf
	 * startup and vcore entry and does no alignment itself: the caller (including
	 * the user) must pass a properly aligned stack_top.  elf.c is unaware of these
	 * concerns; if it gets it wrong the args are simply wrong, and ld will fail
	 * immediately so we will find out quickly. */
	void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp,
	                   uintptr_t stack_top, uintptr_t tls_desc)
	{
		struct sw_trapframe *sw_tf;

		memset(ctx, 0, sizeof(*ctx));
		ctx->type = ROS_SW_CTX;

		sw_tf = &ctx->tf.sw_tf;
		sw_tf->tf_rip = entryp;
		sw_tf->tf_rsp = stack_top;
		sw_tf->tf_rbp = 0;		/* for potential backtraces */
		sw_tf->tf_rbx = vcoreid;	/* read by the user's entry.S */
		sw_tf->tf_fsbase = tls_desc;
		sw_tf->tf_fpucw = 0x037f;	/* x86 default FP CW */
		sw_tf->tf_mxcsr = 0x00001f80;	/* x86 default mxcsr */

		proc_secure_ctx(ctx);
	}

	/* Sanitizes a hardware trapframe before it is trusted: the fs/gs bases, rip
	 * and rsp go through enforce_user_canon(), cs/ss are forced to the user
	 * selectors at RPL 3, rflags is filtered, and the partial marker is cleared. */
	static void proc_secure_hwtf(struct hw_trapframe *tf)
	{
		enforce_user_canon(&tf->tf_gsbase);
		enforce_user_canon(&tf->tf_fsbase);
		enforce_user_canon(&tf->tf_rip);
		enforce_user_canon(&tf->tf_rsp);
		/* GD_UD is the user data segment selector in the GDT, and
		 * GD_UT is the user text segment selector (see inc/memlayout.h).
		 * The low 2 bits of each segment register contains the
		 * Requestor Privilege Level (RPL); 3 means user mode. */
		tf->tf_ss = GD_UD | 3;
		tf->tf_cs = GD_UT | 3;
		/* Always 1: interrupts */
		tf->tf_rflags |= FL_IF;
		/* Always 0: IOPL must be set to 0.  VM (virtual 8086) probably
		 * doesn't matter - SDM says it can't get modified via iret anyways.
		 * VIF and VIP are also virtual-8086 mode stuff.  Supposedly NT is
		 * settable by userspace, but there's no good reason for it.  Rather
		 * be paranoid. */
		tf->tf_rflags &= ~(FL_IOPL_MASK | FL_VM | FL_NT | FL_VIF | FL_VIP);
		/* Force the FL_RSVD_1 bits on, then drop everything outside
		 * FL_RSVD_0. */
		tf->tf_rflags |= FL_RSVD_1;
		tf->tf_rflags &= FL_RSVD_0;
		x86_hwtf_clear_partial(tf);
	}

	/* Sanitizes a software trapframe: the fs/gs bases, rip and rsp go through
	 * enforce_user_canon(), mxcsr is masked, and the partial marker is cleared. */
	static void proc_secure_swtf(struct sw_trapframe *tf)
	{
		enforce_user_canon(&tf->tf_gsbase);
		enforce_user_canon(&tf->tf_fsbase);
		enforce_user_canon(&tf->tf_rip);
		enforce_user_canon(&tf->tf_rsp);

		/* Neither the mxcsr nor the fpucw is actually loaded by the kernel,
		 * but masking mxcsr now keeps it sane should we ever start loading
		 * it. */
		tf->tf_mxcsr = tf->tf_mxcsr & MXCSR_RSVD_0;

		x86_swtf_clear_partial(tf);
	}

	/* Sanitizes a VM trapframe.  The only action taken is clearing the partial
	 * marker.
	 *
	 * Most of the TF is the user's business: a mistake in the guest-area is simply
	 * treated like a vmexit.  Only a few fields are consumed on the kernel side:
	 *
	 * - guest_pcoreid: if it is bad (not a guest_pcore), loading the GPC fails and
	 *   the fault is reflected to userspace.
	 * - tf_flags: a mix of bits that inform the user and bits the kernel uses:
	 *     VMCTX_FL_PARTIAL:        cleared here
	 *     VMCTX_FL_VMRESUME:       temporarily carries a bool inside pop_vmtf;
	 *                              the value found in the VM TF is never trusted
	 *     VMCTX_FL_HAS_FAULT,
	 *     VMCTX_FL_EPT_VMR_BACKED: write-only from the kernel, passed to the user
	 */
	static void proc_secure_vmtf(struct vm_trapframe *tf)
	{
		x86_vmtf_clear_partial(tf);
	}

	/* Sanitizes a user context according to its type.  SW and VM contexts go to
	 * their own helpers; everything else is secured as a HW context. */
	void proc_secure_ctx(struct user_context *ctx)
	{
		switch (ctx->type) {
		case ROS_SW_CTX:
			proc_secure_swtf(&ctx->tf.sw_tf);
			return;
		case ROS_VM_CTX:
			proc_secure_vmtf(&ctx->tf.vm_tf);
			return;
		case ROS_HW_CTX:
			break;
		default:
			/* Any unrecognized type is assumed to be, and forced to be, a
			 * HW ctx.  If that guess is wrong, userspace should die rather
			 * quickly. */
			ctx->type = ROS_HW_CTX;
			break;
		}
		proc_secure_hwtf(&ctx->tf.hw_tf);
	}

	/* Drops the address space this core is currently running.
	 *
	 * A known good pgdir (boot_cr3) is loaded before the old one is released.
	 * The core's cur_proc is then cleared and the proc is decref'd, because
	 * current no longer tracks it (and so no longer protects its cr3). */
	void __abandon_core(void)
	{
		struct per_cpu_info *this_cpu = &per_cpu_info[core_id()];
		struct proc *departing;

		lcr3(boot_cr3);

		departing = this_cpu->cur_proc;
		this_cpu->cur_proc = NULL;
		proc_decref(departing);
	}

	/* Arch hook for clearing a core's owning proc; on x86 it only calls
	 * vmx_clear_vmcs().  coreid is not used by this implementation. */
	void __clear_owning_proc(uint32_t coreid)
	{
		(void)coreid;
		vmx_clear_vmcs();
	}