#include <ros/syscall.h>
#include <parlib/vcore.h>
#include <parlib/stdio.h>
#include <stdlib.h>
/* Here's how the HW popping works: It sets up the future stack pointer to
* have extra stuff after it, and then it pops the registers, then pops the new
* context's stack pointer. Then it uses the extra stuff (the new PC is on the
* stack, the location of notif_disabled, and a clobbered work register) to
* enable notifs, make sure notif IPIs weren't pending, restore the work reg,
* and then "ret".
*
* However, we can't just put the extra stuff directly below the rsp. We need
* to leave room for the red zone: an area below the stack pointer that may
* still be in use. (Even if you compile with -mno-red-zone, some asm code
* (glibc memcpy) will still use that area.)
*
* This is what the target uthread's stack will look like (growing down):
*
* Target RSP ->  |   u_thread's old stuff   | the future %rsp, tf->tf_rsp
*                |   .                      | beginning of Red Zone
*                |   .                      |
*                |   128 Bytes of Red Zone  |
*                |   .                      |
*                |   .                      | end of Red Zone
*                |   new rip                | 0x08 below Red (one slot is 0x08)
*                |   rflags space           | 0x10 below
*                |   rdi save space         | 0x18 below
*                |   *sysc ptr to syscall   | 0x20 below
*                |   notif_pending_loc      | 0x28 below
*                |   notif_disabled_loc     | 0x30 below
*
* The important thing is that it can handle a notification after it enables
* notifications, and when it gets resumed it can ultimately run the new
* context. Enough state is saved in the running context and stack to continue
* running.
*
* Related to that is whether or not our stack pointer is sufficiently far down
* so that restarting *this* code won't clobber shit we need later. The way we
* do this is that we do any "stack jumping" before we enable interrupts/notifs.
* These jumps are when we directly modify rsp, specifically in the down
* direction (subtracts). Adds would be okay. */
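/* Concretely (a worked example with a made-up address): if tf->tf_rsp is
* 0x7f0000001000, the red zone spans [0x7f0000000f80, 0x7f0000001000), the
* restart helper below gets written at 0x7f0000001000 - 128 - 0x30 =
* 0x7f0000000f50, and the "new rip" slot sits at 0x7f0000000f78, i.e. 0x08
* below the bottom of the red zone. */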
/* Helper for writing the info we need later to the u_tf's stack. Note the
* fields appear in reverse order from the stack diagram above, since struct
* members ascend in memory while the stack grows down. */
struct restart_helper {
void *notif_disab_loc;
void *notif_pend_loc;
struct syscall *sysc;
uint64_t rdi_save;
uint64_t rflags;
uint64_t rip;
};
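/* A compile-time sanity check (a sketch, assuming a C11 toolchain): the asm
* in pop_hw_tf() below hard-codes six 8-byte slots (0x30 bytes, matching the
* diagram above), so this struct must pack to exactly that size. */
_Static_assert(sizeof(struct restart_helper) == 0x30, "restart_helper must be six 8-byte slots");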
/* Static syscall, used for self-notifying. We never wait on it, and we
* actually might submit it multiple times in parallel on different cores!
* While this may seem dangerous, the kernel needs to be able to handle this
* scenario. It's also important that we never wait on this, since for all but
* the first call, the DONE flag will be set. (Set once, then never reset) */
struct syscall vc_entry = {
.num = SYS_vc_entry,
.err = 0,
.retval = 0,
.flags = 0,
.ev_q = 0,
.u_data = 0,
.arg0 = 0,
.arg1 = 0,
.arg2 = 0,
.arg3 = 0,
.arg4 = 0,
.arg5 = 0,
};
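/* For reference, the popping asm below fires vc_entry by loading rdi (trap
* arg0) with a pointer to the syscall struct and rsi (trap arg1) with the
* number of syscalls (1), then trapping with "int %1", where %1 is the
* immediate T_SYSCALL. The result is never checked. */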
static void pop_hw_tf(struct hw_trapframe *tf, uint32_t vcoreid)
{
#define X86_RED_ZONE_SZ 128
struct restart_helper *rst;
struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid];
/* The stuff we need to write will be below the current stack and red
* zone of the utf */
rst = (struct restart_helper*)((void*)tf->tf_rsp - X86_RED_ZONE_SZ -
sizeof(struct restart_helper));
/* Fill in the info we'll need later */
rst->notif_disab_loc = &vcpd->notif_disabled;
rst->notif_pend_loc = &vcpd->notif_pending;
rst->sysc = &vc_entry;
rst->rdi_save = 0; /* avoid bugs */
rst->rflags = tf->tf_rflags;
rst->rip = tf->tf_rip;
asm volatile ("movq %0, %%rsp; " /* jump rsp to the utf */
"popq %%rax; " /* restore registers */
"popq %%rbx; "
"popq %%rcx; "
"popq %%rdx; "
"popq %%rbp; "
"popq %%rsi; "
"popq %%rdi; "
"popq %%r8; "
"popq %%r9; "
"popq %%r10; "
"popq %%r11; "
"popq %%r12; "
"popq %%r13; "
"popq %%r14; "
"popq %%r15; "
"addq $0x28, %%rsp; " /* move to rsp slot in the tf */
"popq %%rsp; " /* change to the utf's %rsp */
"subq %[red], %%rsp; " /* jump over the redzone */
"subq $0x10, %%rsp; " /* move rsp to below rdi's slot*/
"pushq %%rdi; " /* save rdi, will clobber soon */
"subq $0x18, %%rsp; " /* move to notif_dis_loc slot */
"popq %%rdi; " /* load notif_disabled addr */
"movb $0x00, (%%rdi); " /* enable notifications */
/* Need a wrmb() here so the write of enable_notif can't
* pass the read of notif_pending (racing with a potential
* cross-core call with proc_notify()). */
"lock addb $0, (%%rdi);" /* LOCK is a CPU mb() */
/* From here down, we can get interrupted and restarted */
"popq %%rdi; " /* get notif_pending status loc*/
"testb $0x01, (%%rdi); " /* test if a notif is pending */
"jz 1f; " /* if not pending skip syscall */
/* Actual syscall. Note we don't wait on the async call */
"popq %%rdi; " /* &sysc, trap arg0 */
"pushq %%rsi; " /* save rax, will be trap arg1 */
"pushq %%rax; " /* save rax, will be trap ret */
"movq $0x1, %%rsi; " /* send one async syscall: arg1*/
"int %1; " /* fire the syscall */
"popq %%rax; " /* restore regs after syscall */
"popq %%rsi; "
"jmp 2f; " /* skip 1:, already popped */
"1: addq $0x08, %%rsp; " /* discard &sysc on non-sc path*/
"2: popq %%rdi; " /* restore tf's %rdi both paths*/
"popfq; " /* restore utf's rflags */
"ret %[red]; " /* ret to the new PC, skip red */
:
: "g"(&tf->tf_rax), "i"(T_SYSCALL),
[red]"i"(X86_RED_ZONE_SZ)
: "memory");
}
static void pop_sw_tf(struct sw_trapframe *sw_tf, uint32_t vcoreid)
{
struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid];
/* Restore callee-saved FPU state. We need to clear exceptions before
* reloading the FP CW, in case the new CW unmasks any. We also need to
* reset the tag word to clear out the stack.
*
* The main issue here is that while our context was saved in an
* ABI-compliant manner, we may be starting up on a somewhat random FPU
* state. Having gibberish in registers isn't a big deal, but some of
* the FP environment settings could cause trouble. If fnclex; emms
* isn't enough, we could also save/restore the entire FP env with
* fldenv, or do an fninit before fldcw. */
asm volatile ("ldmxcsr %0" : : "m"(sw_tf->tf_mxcsr));
asm volatile ("fnclex; emms; fldcw %0" : : "m"(sw_tf->tf_fpucw));
/* Basic plan: restore all regs, using rcx as the pointer to the sw_tf.
* Switch to the new stack and save the PC so we can jump to it later. Use
* clobberable registers (rdi, rsi, rdx) for the locations of sysc,
* notif_dis, and notif_pend. Once on the new stack, we enable notifs, check
* if we missed one, and if so, self notify. Note the syscall clobbers rax. */
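/* The 0x00-0x38 offsets below assume tf_rbx, tf_rbp, tf_r12 through tf_r15,
* tf_rip, and tf_rsp are laid out contiguously, in that order, in struct
* sw_trapframe. */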
asm volatile ("movq 0x00(%0), %%rbx; " /* restore regs */
"movq 0x08(%0), %%rbp; "
"movq 0x10(%0), %%r12; "
"movq 0x18(%0), %%r13; "
"movq 0x20(%0), %%r14; "
"movq 0x28(%0), %%r15; "
"movq 0x30(%0), %%r8; " /* save rip in r8 */
"movq 0x38(%0), %%rsp; " /* jump to future stack */
"movb $0x00, (%2); " /* enable notifications */
/* Need a wrmb() here so the write of enable_notif can't
* pass the read of notif_pending (racing with a potential
* cross-core call with proc_notify()). */
"lock addb $0, (%2); " /* LOCK is a CPU mb() */
/* From here down, we can get interrupted and restarted */
"testb $0x01, (%3); " /* test if a notif is pending */
"jz 1f; " /* if not pending, skip syscall*/
/* Actual syscall. Note we don't wait on the async call.
* &vc_entry is already in rdi (trap arg0). */
"movq $0x1, %%rsi; " /* send one async syscall: arg1*/
"int %4; " /* fire the syscall */
"1: jmp *%%r8; " /* ret saved earlier */
:
: "c"(&sw_tf->tf_rbx),
"D"(&vc_entry),
"S"(&vcpd->notif_disabled),
"d"(&vcpd->notif_pending),
"i"(T_SYSCALL)
: "memory");
}
/* Pops a user context, re-enabling notifications at the same time. A
* userspace scheduler can call this when transitioning off the transition
* stack.
*
* pop_user_ctx will fail if we have a notif_pending; you're not allowed to
* leave vcore context with notif_pending set. Some code in vcore_entry needs
* to clear notif_pending and check whatever might have caused a notif (e.g.
* call handle_events()).
*
* If notif_pending is not clear, this will self_notify this core, since it
* should be because we missed a notification message while notifs were
* disabled. */
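/* A minimal usage sketch (hypothetical 2LS code, not from this file): from
* vcore context, once a uthread has been chosen to run:
*
*	current_uthread = next;
*	pop_user_ctx(&next->u_ctx, vcore_id());
*
* pop_user_ctx() does not return; any later notification lands back in
* vcore_entry(). */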
void pop_user_ctx(struct user_context *ctx, uint32_t vcoreid)
{
struct preempt_data *vcpd = vcpd_of(vcoreid);
/* We check early for notif_pending, since if we deal with it during
* pop_hw_tf, we grow the stack slightly. If a thread consistently
* fails to restart due to notif pending, it will eventually run off the
* bottom of its stack. By performing the check here, we shrink that
* window. You'd have to have a notif come after this check, but also
* *not* before this check. If you PF in pop_user_ctx, this all failed. */
if (vcpd->notif_pending) {
/* If pop_user_ctx fails (and resets the vcore), the ctx
* contents must be in VCPD (due to !UTHREAD_SAVED). It might
* already be there. */
if (ctx != &vcpd->uthread_ctx)
vcpd->uthread_ctx = *ctx;
/* To restart the vcore, we must have the right TLS, stack
* pointer, and vc_ctx = TRUE. */
set_tls_desc((void*)vcpd->vcore_tls_desc);
begin_safe_access_tls_vars()
__vcore_context = TRUE;
end_safe_access_tls_vars()
set_stack_pointer((void*)vcpd->vcore_stack);
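/* vcore_entry() never returns; __kvc_entry_c() below treats a return
* from it as a bug. */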
vcore_entry();
}
switch (ctx->type) {
case ROS_HW_CTX:
pop_hw_tf(&ctx->tf.hw_tf, vcoreid);
break;
case ROS_SW_CTX:
pop_sw_tf(&ctx->tf.sw_tf, vcoreid);
break;
case ROS_VM_CTX:
ros_syscall(SYS_pop_ctx, ctx, 0, 0, 0, 0, 0);
break;
}
assert(0);
}
/* Like the regular pop_user_ctx, but this one doesn't check or clear
* notif_pending. The only case where we use this is when an IRQ/notif
* interrupts a uthread that is in the process of disabling notifs.
*
* If we need to support VM_CTXs here, we'll need to tell the kernel whether or
* not we want to enable_notifs (flag to SYS_pop_ctx). The only use case for
* this is when disabling notifs. Currently, a VM can't do this or do things
* like uthread_yield. It doesn't have access to the vcore's or uthread's TLS
* to bootstrap any of that stuff. */
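/* Note the differences from pop_hw_tf(): the helper is written directly below
* tf_rsp with no red-zone gap, and only notif_disab_loc, rdi_save, rflags, and
* rip are filled in, since this path never tests notif_pending or fires the
* vc_entry syscall. */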
void pop_user_ctx_raw(struct user_context *ctx, uint32_t vcoreid)
{
struct hw_trapframe *tf = &ctx->tf.hw_tf;
struct restart_helper *rst;
struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid];
assert(ctx->type == ROS_HW_CTX);
/* The stuff we need to write will be below the current stack of the utf
*/
rst = (struct restart_helper*)((void*)tf->tf_rsp -
sizeof(struct restart_helper));
/* Fill in the info we'll need later */
rst->notif_disab_loc = &vcpd->notif_disabled;
rst->rdi_save = 0; /* avoid bugs */
rst->rflags = tf->tf_rflags;
rst->rip = tf->tf_rip;
asm volatile ("movq %0, %%rsp; " /* jump esp to the utf */
"popq %%rax; " /* restore registers */
"popq %%rbx; "
"popq %%rcx; "
"popq %%rdx; "
"popq %%rbp; "
"popq %%rsi; "
"popq %%rdi; "
"popq %%r8; "
"popq %%r9; "
"popq %%r10; "
"popq %%r11; "
"popq %%r12; "
"popq %%r13; "
"popq %%r14; "
"popq %%r15; "
"addq $0x28, %%rsp; " /* move to rsp slot in the tf */
"popq %%rsp; " /* change to the utf's %rsp */
"subq $0x10, %%rsp; " /* move rsp to below rdi's slot*/
"pushq %%rdi; " /* save rdi, will clobber soon */
"subq $0x18, %%rsp; " /* move to notif_dis_loc slot */
"popq %%rdi; " /* load notif_disabled addr */
"movb $0x00, (%%rdi); " /* enable notifications */
/* Here's where we differ from the regular pop_user_ctx().
* We need to adjust rsp and whatnot, but don't do test,
* clear notif_pending, or call a syscall. */
/* From here down, we can get interrupted and restarted */
"addq $0x10, %%rsp; " /* move to rdi save slot */
"popq %%rdi; " /* restore tf's %rdi */
"popfq; " /* restore utf's rflags */
"ret; " /* return to the new PC */
:
: "g"(&tf->tf_rax)
: "memory");
}
void fprintf_hw_tf(FILE *f, struct hw_trapframe *hw_tf)
{
fprintf(f, "[user] HW TRAP frame 0x%016x\n", hw_tf);
fprintf(f, " rax 0x%016lx\n", hw_tf->tf_rax);
fprintf(f, " rbx 0x%016lx\n", hw_tf->tf_rbx);
fprintf(f, " rcx 0x%016lx\n", hw_tf->tf_rcx);
fprintf(f, " rdx 0x%016lx\n", hw_tf->tf_rdx);
fprintf(f, " rbp 0x%016lx\n", hw_tf->tf_rbp);
fprintf(f, " rsi 0x%016lx\n", hw_tf->tf_rsi);
fprintf(f, " rdi 0x%016lx\n", hw_tf->tf_rdi);
fprintf(f, " r8 0x%016lx\n", hw_tf->tf_r8);
fprintf(f, " r9 0x%016lx\n", hw_tf->tf_r9);
fprintf(f, " r10 0x%016lx\n", hw_tf->tf_r10);
fprintf(f, " r11 0x%016lx\n", hw_tf->tf_r11);
fprintf(f, " r12 0x%016lx\n", hw_tf->tf_r12);
fprintf(f, " r13 0x%016lx\n", hw_tf->tf_r13);
fprintf(f, " r14 0x%016lx\n", hw_tf->tf_r14);
fprintf(f, " r15 0x%016lx\n", hw_tf->tf_r15);
fprintf(f, " trap 0x%08x\n", hw_tf->tf_trapno);
fprintf(f, " gsbs 0x%016lx\n", hw_tf->tf_gsbase);
fprintf(f, " fsbs 0x%016lx\n", hw_tf->tf_fsbase);
fprintf(f, " err 0x--------%08x\n", hw_tf->tf_err);
fprintf(f, " rip 0x%016lx\n", hw_tf->tf_rip);
fprintf(f, " cs 0x------------%04x\n", hw_tf->tf_cs);
fprintf(f, " flag 0x%016lx\n", hw_tf->tf_rflags);
fprintf(f, " rsp 0x%016lx\n", hw_tf->tf_rsp);
fprintf(f, " ss 0x------------%04x\n", hw_tf->tf_ss);
}
void fprintf_sw_tf(FILE *f, struct sw_trapframe *sw_tf)
{
fprintf(f, "[user] SW TRAP frame 0x%016p\n", sw_tf);
fprintf(f, " rbx 0x%016lx\n", sw_tf->tf_rbx);
fprintf(f, " rbp 0x%016lx\n", sw_tf->tf_rbp);
fprintf(f, " r12 0x%016lx\n", sw_tf->tf_r12);
fprintf(f, " r13 0x%016lx\n", sw_tf->tf_r13);
fprintf(f, " r14 0x%016lx\n", sw_tf->tf_r14);
fprintf(f, " r15 0x%016lx\n", sw_tf->tf_r15);
fprintf(f, " gsbs 0x%016lx\n", sw_tf->tf_gsbase);
fprintf(f, " fsbs 0x%016lx\n", sw_tf->tf_fsbase);
fprintf(f, " rip 0x%016lx\n", sw_tf->tf_rip);
fprintf(f, " rsp 0x%016lx\n", sw_tf->tf_rsp);
fprintf(f, " mxcsr 0x%08x\n", sw_tf->tf_mxcsr);
fprintf(f, " fpucw 0x%04x\n", sw_tf->tf_fpucw);
}
void fprintf_vm_tf(FILE *f, struct vm_trapframe *vm_tf)
{
fprintf(f, "[user] VM Trapframe 0x%016x\n", vm_tf);
fprintf(f, " rax 0x%016lx\n", vm_tf->tf_rax);
fprintf(f, " rbx 0x%016lx\n", vm_tf->tf_rbx);
fprintf(f, " rcx 0x%016lx\n", vm_tf->tf_rcx);
fprintf(f, " rdx 0x%016lx\n", vm_tf->tf_rdx);
fprintf(f, " rbp 0x%016lx\n", vm_tf->tf_rbp);
fprintf(f, " rsi 0x%016lx\n", vm_tf->tf_rsi);
fprintf(f, " rdi 0x%016lx\n", vm_tf->tf_rdi);
fprintf(f, " r8 0x%016lx\n", vm_tf->tf_r8);
fprintf(f, " r9 0x%016lx\n", vm_tf->tf_r9);
fprintf(f, " r10 0x%016lx\n", vm_tf->tf_r10);
fprintf(f, " r11 0x%016lx\n", vm_tf->tf_r11);
fprintf(f, " r12 0x%016lx\n", vm_tf->tf_r12);
fprintf(f, " r13 0x%016lx\n", vm_tf->tf_r13);
fprintf(f, " r14 0x%016lx\n", vm_tf->tf_r14);
fprintf(f, " r15 0x%016lx\n", vm_tf->tf_r15);
fprintf(f, " rip 0x%016lx\n", vm_tf->tf_rip);
fprintf(f, " rflg 0x%016lx\n", vm_tf->tf_rflags);
fprintf(f, " rsp 0x%016lx\n", vm_tf->tf_rsp);
fprintf(f, " cr2 0x%016lx\n", vm_tf->tf_cr2);
fprintf(f, " cr3 0x%016lx\n", vm_tf->tf_cr3);
fprintf(f, "Gpcore 0x%08x\n", vm_tf->tf_guest_pcoreid);
fprintf(f, "Flags 0x%08x\n", vm_tf->tf_flags);
fprintf(f, "Inject 0x%08x\n", vm_tf->tf_trap_inject);
fprintf(f, "ExitRs 0x%08x\n", vm_tf->tf_exit_reason);
fprintf(f, "ExitQl 0x%08x\n", vm_tf->tf_exit_qual);
fprintf(f, "Intr1 0x%016lx\n", vm_tf->tf_intrinfo1);
fprintf(f, "Intr2 0x%016lx\n", vm_tf->tf_intrinfo2);
fprintf(f, "GIntr 0x----%04x\n", vm_tf->tf_guest_intr_status);
fprintf(f, "GVA 0x%016lx\n", vm_tf->tf_guest_va);
fprintf(f, "GPA 0x%016lx\n", vm_tf->tf_guest_pa);
}
void print_user_context(struct user_context *ctx)
{
switch (ctx->type) {
case ROS_HW_CTX:
fprintf_hw_tf(stdout, &ctx->tf.hw_tf);
break;
case ROS_SW_CTX:
fprintf_sw_tf(stdout, &ctx->tf.sw_tf);
break;
case ROS_VM_CTX:
fprintf_vm_tf(stdout, &ctx->tf.vm_tf);
break;
default:
fprintf(stderr, "Unknown context type %d\n", ctx->type);
}
}
/* The second-lowest level function jumped to by the kernel on every vcore
* entry. We get called from __kernel_vcore_entry.
*
* We should consider making it mandatory to set the tls_desc in the kernel. We
* wouldn't even need to pass the vcore id to user space at all if we did this.
* It would already be set in the preinstalled TLS as __vcore_id. */
void __attribute__((noreturn)) __kvc_entry_c(void)
{
/* On 64-bit, the kernel sets the TLS desc for us, based on whatever is in
* VCPD.
*
* x86 32-bit TLS is pretty jacked up, so there the kernel doesn't set the
* TLS desc for us. It's a little more expensive to do it here, esp for
* amd64. Can remove this when/if we overhaul 32 bit TLS. */
int id = __vcore_id_on_entry;
#ifndef __x86_64__
set_tls_desc(vcpd_of(id)->vcore_tls_desc);
#endif
/* Every time the vcore comes up, it must set that it is in vcore
* context. uthreads may share the same TLS as their vcore (when
* uthreads do not have their own TLS), and if a uthread was preempted,
* __vcore_context will be FALSE and would still be FALSE the next time
* the vcore pops up, unless we set it here. */
__vcore_context = TRUE;
vcore_entry();
fprintf(stderr, "vcore_entry() should never return!\n");
abort();
__builtin_unreachable();
}