#ifndef PARLIB_ARCH_VCORE64_H
#define PARLIB_ARCH_VCORE64_H
#ifndef PARLIB_ARCH_VCORE_H
#error "Do not include include vcore32.h directly"
#endif
#include <ros/common.h>
#include <ros/trapframe.h>
#include <ros/procdata.h>
#include <ros/syscall.h>
#include <ros/arch/mmu.h>
#include <sys/vcore-tls.h>
/* Here's how the HW popping works: It sets up the future stack pointer to
* have extra stuff after it, and then it pops the registers, then pops the new
* context's stack pointer. Then it uses the extra stuff (the new PC is on the
* stack, the location of notif_disabled, and a clobbered work register) to
* enable notifs, make sure notif IPIs weren't pending, restore the work reg,
* and then "ret".
*
* This is what the target uthread's stack will look like (growing down):
*
* Target RSP -> | u_thread's old stuff | the future %rsp, tf->tf_rsp
* | new rip | 0x08 below %rsp (one slot is 0x08)
* | rflags space | 0x10 below
* | rdi save space | 0x18 below
* | *sysc ptr to syscall | 0x20 below
* | notif_pending_loc | 0x28 below
* | notif_disabled_loc | 0x30 below
*
* The important thing is that it can handle a notification after it enables
* notifications, and when it gets resumed it can ultimately run the new
* context. Enough state is saved in the running context and stack to continue
* running.
*
* Related to that is whether or not our stack pointer is sufficiently far down
* so that restarting *this* code won't clobber shit we need later. The way we
* do this is that we do any "stack jumping" before we enable interrupts/notifs.
* These jumps are when we directly modify rsp, specifically in the down
* direction (subtracts). Adds would be okay.
*
* Another 64-bit concern is the red-zone. The AMD64 ABI allows the use of
* space below the stack pointer by regular programs. If we allowed this, we
* would clobber that space when we do our TF restarts, much like with OSs and
* IRQ handlers. Thus we have the cross compiler automatically disabling the
* redzone (-mno-red-zone is a built-in option).
*
* When compared to the 32 bit code, notice we use rdi, instead of eax, for our
* work. This is because rdi is the arg0 of a syscall. Using it saves us some
* extra moves, since we need to pop the *sysc before saving any other
* registers. */
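/* E.g., if porting this code to a toolchain that does not disable the
 * red-zone by default, the same guarantee would come from passing the flag
 * explicitly (a hypothetical sketch, not part of the original build):
 *
 *	cc -mno-red-zone -c vcore.c
 */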
/* Helper for writing the info we need later to the u_tf's stack. Note the
 * fields appear in reverse order from the stack diagram above, since struct
 * members go up in memory while the stack grows down. */
struct restart_helper {
void *notif_disab_loc;
void *notif_pend_loc;
struct syscall *sysc;
uint64_t rdi_save;
uint64_t rflags;
uint64_t rip;
};
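/* Each field above is one 0x08 slot, matching the 0x30 bytes of slots in the
 * stack diagram. A compile-time sanity check could look like this (a sketch,
 * assuming a C11-capable compiler; not part of the original build):
 *
 *	_Static_assert(sizeof(struct restart_helper) == 0x30,
 *	               "restart_helper must match the six slots popped in asm");
 */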
/* Static syscall, used for self-notifying. We never wait on it, and we
* actually might submit it multiple times in parallel on different cores!
* While this may seem dangerous, the kernel needs to be able to handle this
* scenario. It's also important that we never wait on this, since for all but
* the first call, the DONE flag will be set. (Set once, then never reset) */
extern struct syscall vc_entry; /* in x86/vcore.c */
static inline void pop_hw_tf(struct hw_trapframe *tf, uint32_t vcoreid)
{
struct restart_helper *rst;
struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid];
/* The stuff we need to write will be below the current stack of the utf */
rst = (struct restart_helper*)((void*)tf->tf_rsp -
sizeof(struct restart_helper));
/* Fill in the info we'll need later */
rst->notif_disab_loc = &vcpd->notif_disabled;
rst->notif_pend_loc = &vcpd->notif_pending;
rst->sysc = &vc_entry;
rst->rdi_save = 0; /* avoid bugs */
rst->rflags = tf->tf_rflags;
rst->rip = tf->tf_rip;
asm volatile ("movq %0, %%rsp; " /* jump rsp to the utf */
"popq %%rax; " /* restore registers */
"popq %%rbx; "
"popq %%rcx; "
"popq %%rdx; "
"popq %%rbp; "
"popq %%rsi; "
"popq %%rdi; "
"popq %%r8; "
"popq %%r9; "
"popq %%r10; "
"popq %%r11; "
"popq %%r12; "
"popq %%r13; "
"popq %%r14; "
"popq %%r15; "
"addq $0x28, %%rsp; " /* move to the rsp slot in the tf */
"popq %%rsp; " /* change to the utf's %rsp */
"subq $0x10, %%rsp; " /* move rsp to below rdi's slot */
"pushq %%rdi; " /* save rdi, will clobber soon */
"subq $0x18, %%rsp; " /* move to notif_dis_loc slot */
"popq %%rdi; " /* load notif_disabled addr */
"movb $0x00, (%%rdi); " /* enable notifications */
/* Need a wrmb() here so the write of enable_notif can't pass
 * the read of notif_pending (racing with a potential
 * cross-core call to proc_notify()). */
"lock addq $0, (%%rdi);" /* LOCK is a CPU mb() */
/* From here down, we can get interrupted and restarted */
"popq %%rdi; " /* get notif_pending status loc */
"testb $0x01, (%%rdi); " /* test if a notif is pending */
"jz 1f; " /* if not pending, skip syscall */
/* Actual syscall. Note we don't wait on the async call */
"popq %%rdi; " /* &sysc, trap arg0 */
"pushq %%rsi; " /* save rax, will be trap arg1 */
"pushq %%rax; " /* save rax, will be trap ret */
"movq $0x1, %%rsi; " /* sending one async syscall: arg1 */
"int %1; " /* fire the syscall */
"popq %%rax; " /* restore regs after syscall */
"popq %%rsi; "
"jmp 2f; " /* skip 1:, already popped */
"1: addq $0x08, %%rsp; " /* discard &sysc (on non-sc path) */
"2: popq %%rdi; " /* restore tf's %rdi (both paths) */
"popfq; " /* restore utf's rflags */
"ret; " /* return to the new PC */
:
: "g"(&tf->tf_rax), "i"(T_SYSCALL)
: "memory");
}
static inline void pop_sw_tf(struct sw_trapframe *sw_tf, uint32_t vcoreid)
{
struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid];
/* Restore callee-saved FPU state. We need to clear exceptions before
* reloading the FP CW, in case the new CW unmasks any. We also need to
* reset the tag word to clear out the stack.
*
* The main issue here is that while our context was saved in an
* ABI-compliant manner, we may be starting up on a somewhat random FPU
* state. Having gibberish in registers isn't a big deal, but some of the
* FP environment settings could cause trouble. If fnclex; emms isn't
* enough, we could also save/restore the entire FP env with fldenv, or do
* an fninit before fldcw. */
asm volatile ("ldmxcsr %0" : : "m"(sw_tf->tf_mxcsr));
asm volatile ("fnclex; emms; fldcw %0" : : "m"(sw_tf->tf_fpucw));
/* Basic plan: restore all the regs, using rcx as a base pointer to the
 * sw_tf. Switch to the new stack and save the PC so we can jump to it
 * later. Use clobberable registers for the locations of sysc, notif_dis,
 * and notif_pend. Once on the new stack, we enable notifs, check if we
 * missed one, and if so, self-notify. Note the syscall clobbers rax. */
asm volatile ("movq 0x00(%0), %%rbx; " /* restore regs */
"movq 0x08(%0), %%rbp; "
"movq 0x10(%0), %%r12; "
"movq 0x18(%0), %%r13; "
"movq 0x20(%0), %%r14; "
"movq 0x28(%0), %%r15; "
"movq 0x30(%0), %%r8; " /* save rip in r8 */
"movq 0x38(%0), %%rsp; " /* jump to future stack */
"movb $0x00, (%2); " /* enable notifications */
/* Need a wrmb() here so the write of enable_notif can't pass
 * the read of notif_pending (racing with a potential
 * cross-core call to proc_notify()). */
"lock addq $0, (%2); " /* LOCK is a CPU mb() */
/* From here down, we can get interrupted and restarted */
"testb $0x01, (%3); " /* test if a notif is pending */
"jz 1f; " /* if not pending, skip syscall */
/* Actual syscall. Note we don't wait on the async call.
* &vc_entry is already in rdi (trap arg0). */
"movq $0x1, %%rsi; " /* sending one async syscall: arg1 */
"int %4; " /* fire the syscall */
"1: jmp *%%r8; " /* ret saved earlier */
:
: "c"(&sw_tf->tf_rbx),
"D"(&vc_entry),
"S"(&vcpd->notif_disabled),
"d"(&vcpd->notif_pending),
"i"(T_SYSCALL)
: "memory");
}
/* Pops a user context, re-enabling notifications at the same time. A
 * userspace scheduler can call this when transitioning off the transition
 * stack.
 *
 * At some point in vcore context before calling this, you need to clear
 * notif_pending (do this by calling handle_events()). As a potential
 * optimization, consider clearing notif_pending / handling events again right
 * before calling this. If notif_pending is still set, this will self-notify
 * this core, since a set flag means we missed a notification message while
 * notifs were disabled. */
static inline void pop_user_ctx(struct user_context *ctx, uint32_t vcoreid)
{
if (ctx->type == ROS_HW_CTX)
pop_hw_tf(&ctx->tf.hw_tf, vcoreid);
else
pop_sw_tf(&ctx->tf.sw_tf, vcoreid);
}
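/* A minimal usage sketch from vcore context (hypothetical scheduler code;
 * current_uthread is a stand-in for the caller's own bookkeeping):
 *
 *	handle_events(vcoreid);		// clear notif_pending, drain events
 *	pop_user_ctx(&current_uthread->u_ctx, vcoreid);	// never returns
 */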
/* Like the regular pop_user_ctx, but this one doesn't check or clear
* notif_pending. The only case where we use this is when an IRQ/notif
* interrupts a uthread that is in the process of disabling notifs. */
static inline void pop_user_ctx_raw(struct user_context *ctx, uint32_t vcoreid)
{
struct hw_trapframe *tf = &ctx->tf.hw_tf;
assert(ctx->type == ROS_HW_CTX);
struct restart_helper *rst;
struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid];
/* The stuff we need to write will be below the current stack of the utf */
rst = (struct restart_helper*)((void*)tf->tf_rsp -
sizeof(struct restart_helper));
/* Fill in the info we'll need later */
rst->notif_disab_loc = &vcpd->notif_disabled;
rst->rdi_save = 0; /* avoid bugs */
rst->rflags = tf->tf_rflags;
rst->rip = tf->tf_rip;
asm volatile ("movq %0, %%rsp; " /* jump esp to the utf */
"popq %%rax; " /* restore registers */
"popq %%rbx; "
"popq %%rcx; "
"popq %%rdx; "
"popq %%rbp; "
"popq %%rsi; "
"popq %%rdi; "
"popq %%r8; "
"popq %%r9; "
"popq %%r10; "
"popq %%r11; "
"popq %%r12; "
"popq %%r13; "
"popq %%r14; "
"popq %%r15; "
"addq $0x28, %%rsp; " /* move to the rsp slot in the tf */
"popq %%rsp; " /* change to the utf's %rsp */
"subq $0x10, %%rsp; " /* move rsp to below rdi's slot */
"pushq %%rdi; " /* save rdi, will clobber soon */
"subq $0x18, %%rsp; " /* move to notif_dis_loc slot */
"popq %%rdi; " /* load notif_disabled addr */
"movb $0x00, (%%rdi); " /* enable notifications */
/* Here's where we differ from the regular pop_user_ctx().
 * We still need to adjust rsp, but we don't test or clear
 * notif_pending, and we don't fire the syscall. */
/* From here down, we can get interrupted and restarted */
"addq $0x10, %%rsp; " /* move to rdi save slot */
"popq %%rdi; " /* restore tf's %rdi */
"popfq; " /* restore utf's rflags */
"ret; " /* return to the new PC */
:
: "g"(&tf->tf_rax)
: "memory");
}
/* Saves a SW context, setting the PC to the end of this function. We only
* save callee-saved registers (of the sysv abi). The compiler knows to save
* the others via the input/clobber lists.
*
* Callers of this function need to have at least one
* 'calling-convention-compliant' function call between this and any floating
* point, so that the compiler saves any caller-saved FP before getting to
* here.
*
* To some extent, TLS is 'callee-saved', in that no one ever expects it to
* change. We handle uthread TLS changes separately, since we often change to
* them early to set some variables. Arguably we should do this differently. */
static inline void __attribute__((always_inline, returns_twice))
save_user_ctx(struct user_context *ctx)
{
struct sw_trapframe *sw_tf = &ctx->tf.sw_tf;
long dummy;
ctx->type = ROS_SW_CTX;
asm volatile ("stmxcsr %0" : "=m"(sw_tf->tf_mxcsr));
asm volatile ("fnstcw %0" : "=m"(sw_tf->tf_fpucw));
/* Pretty simple: save all the regs, IAW the sys-v ABI */
asm volatile("mov %%rsp, 0x48(%0); " /* save rsp in its slot*/
"leaq 1f, %%rax; " /* get future rip */
"mov %%rax, 0x40(%0); " /* save rip in its slot*/
"mov %%r15, 0x38(%0); "
"mov %%r14, 0x30(%0); "
"mov %%r13, 0x28(%0); "
"mov %%r12, 0x20(%0); "
"mov %%rbp, 0x18(%0); "
"mov %%rbx, 0x10(%0); "
"1: " /* where this tf will restart */
: "=D"(dummy) /* force clobber for rdi */
: "D"(sw_tf)
: "rax", "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11",
"memory", "cc");
}
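/* A minimal usage sketch (hypothetical caller): like setjmp(), control can
 * arrive at the saved PC a second time when the context is later popped,
 * hence the returns_twice attribute above.
 *
 *	save_user_ctx(&uth->u_ctx);	// returns now, and again on restart
 *	if (uth_was_restarted(uth))	// hypothetical caller-side check
 *		return;
 *	// ...still in the original flow; pop_user_ctx() later restarts it
 */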
/* The old version, kept around for testing */
/* Hasn't been used yet for 64 bit. If you use this, it's worth checking to
 * make sure rax isn't selected for operands 0, 1, or 2 (and we probably don't
 * need to save rax in the beginning). */
static inline void __attribute__((always_inline, returns_twice))
save_user_ctx_hw(struct user_context *ctx)
{
struct hw_trapframe *tf = &ctx->tf.hw_tf;
ctx->type = ROS_HW_CTX;
memset(tf, 0, sizeof(struct hw_trapframe)); /* sanity */
/* set CS and make sure eflags is okay */
tf->tf_cs = GD_UT | 3;
tf->tf_rflags = 0x200; /* interrupts enabled. bare minimum rflags. */
/* Save the regs and the future rsp. */
asm volatile("movq %%rsp, (%0); " /* save rsp in it's slot*/
"pushq %%rax; " /* temp save rax */
"leaq 1f, %%rax; " /* get future rip */
"movq %%rax, (%1); " /* store future rip */
"popq %%rax; " /* restore rax */
"movq %2, %%rsp; " /* move to the rax slot of the tf */
"addl $0x78,%%esp; " /* move to just past r15 */
"pushq %%r15; " /* save regs */
"pushq %%r14; "
"pushq %%r13; "
"pushq %%r12; "
"pushq %%r11; "
"pushq %%r10; "
"pushq %%r9; "
"pushq %%r8; "
"pushq %%rdi; "
"pushq %%rsi; "
"pushq %%rbp; "
"pushq %%rdx; "
"pushq %%rcx; "
"pushq %%rbx; "
"pushq %%rax; "
"addq $0xa0, %%rsp; " /* move to rsp slot */
"popq %%rsp; " /* restore saved/original rsp */
"1: " /* where this tf will restart */
:
: "g"(&tf->tf_rsp), "g"(&tf->tf_rip), "g"(tf->tf_rax)
: "rax", "memory", "cc");
}
static inline void init_user_ctx(struct user_context *ctx, uintptr_t entry_pt,
uintptr_t stack_top)
{
struct sw_trapframe *sw_tf = &ctx->tf.sw_tf;
ctx->type = ROS_SW_CTX;
/* Stack pointers in a fresh stackframe need to be such that adding or
* subtracting 8 will result in 16 byte alignment (AMD64 ABI). The reason
* is so that input arguments (on the stack) are 16 byte aligned. The
* extra 8 bytes is the retaddr, pushed on the stack. Compilers know they
* can subtract 8 to get 16 byte alignment for instructions like movaps. */
sw_tf->tf_rsp = ROUNDDOWN(stack_top, 16) - 8;
sw_tf->tf_rip = entry_pt;
sw_tf->tf_rbp = 0; /* for potential backtraces */
/* No need to bother with setting the other GP registers; the called
* function won't care about their contents. */
sw_tf->tf_mxcsr = 0x00001f80; /* x86 default mxcsr */
sw_tf->tf_fpucw = 0x037f; /* x86 default FP CW */
}
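/* A minimal usage sketch (hypothetical names; whatever pops the ctx later
 * stands in for a real scheduler):
 *
 *	struct user_context ctx;
 *	init_user_ctx(&ctx, (uintptr_t)my_thread_func,
 *	              (uintptr_t)stack_bottom + stack_size);
 *	pop_user_ctx(&ctx, vcoreid);	// start running my_thread_func
 *
 * E.g., a stack_top of 0x9010 yields tf_rsp = 0x9008, exactly as if a call
 * instruction had just pushed a return address onto an aligned stack. */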
/* This is how we get our vcore id on entry. */
#define __vcore_id_on_entry \
({ \
register int temp asm ("rax"); \
temp; \
})
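/* A minimal usage sketch (hypothetical entry point): this must be read before
 * any other code can clobber rax, i.e. first thing on entry.
 *
 *	void vcore_entry(void)
 *	{
 *		uint32_t vcoreid = __vcore_id_on_entry;
 *		...
 *	}
 */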
/* For debugging. */
#include <stdio.h>
static void print_hw_tf(struct hw_trapframe *hw_tf)
{
printf("[user] HW TRAP frame 0x%016x\n", hw_tf);
printf(" rax 0x%016lx\n", hw_tf->tf_rax);
printf(" rbx 0x%016lx\n", hw_tf->tf_rbx);
printf(" rcx 0x%016lx\n", hw_tf->tf_rcx);
printf(" rdx 0x%016lx\n", hw_tf->tf_rdx);
printf(" rbp 0x%016lx\n", hw_tf->tf_rbp);
printf(" rsi 0x%016lx\n", hw_tf->tf_rsi);
printf(" rdi 0x%016lx\n", hw_tf->tf_rdi);
printf(" r8 0x%016lx\n", hw_tf->tf_r8);
printf(" r9 0x%016lx\n", hw_tf->tf_r9);
printf(" r10 0x%016lx\n", hw_tf->tf_r10);
printf(" r11 0x%016lx\n", hw_tf->tf_r11);
printf(" r12 0x%016lx\n", hw_tf->tf_r12);
printf(" r13 0x%016lx\n", hw_tf->tf_r13);
printf(" r14 0x%016lx\n", hw_tf->tf_r14);
printf(" r15 0x%016lx\n", hw_tf->tf_r15);
printf(" trap 0x%08x\n", hw_tf->tf_trapno);
printf(" gsbs 0x%016lx\n", hw_tf->tf_gsbase);
printf(" fsbs 0x%016lx\n", hw_tf->tf_fsbase);
printf(" err 0x--------%08x\n", hw_tf->tf_err);
printf(" rip 0x%016lx\n", hw_tf->tf_rip);
printf(" cs 0x------------%04x\n", hw_tf->tf_cs);
printf(" flag 0x%016lx\n", hw_tf->tf_rflags);
printf(" rsp 0x%016lx\n", hw_tf->tf_rsp);
printf(" ss 0x------------%04x\n", hw_tf->tf_ss);
}
static void print_sw_tf(struct sw_trapframe *sw_tf)
{
printf("[user] SW TRAP frame 0x%016p\n", sw_tf);
printf(" rbx 0x%016lx\n", sw_tf->tf_rbx);
printf(" rbp 0x%016lx\n", sw_tf->tf_rbp);
printf(" r12 0x%016lx\n", sw_tf->tf_r12);
printf(" r13 0x%016lx\n", sw_tf->tf_r13);
printf(" r14 0x%016lx\n", sw_tf->tf_r14);
printf(" r15 0x%016lx\n", sw_tf->tf_r15);
printf(" gsbs 0x%016lx\n", sw_tf->tf_gsbase);
printf(" fsbs 0x%016lx\n", sw_tf->tf_fsbase);
printf(" rip 0x%016lx\n", sw_tf->tf_rip);
printf(" rsp 0x%016lx\n", sw_tf->tf_rsp);
printf(" mxcsr 0x%08x\n", sw_tf->tf_mxcsr);
printf(" fpucw 0x%04x\n", sw_tf->tf_fpucw);
}
static void print_user_context(struct user_context *ctx)
{
if (ctx->type == ROS_HW_CTX)
print_hw_tf(&ctx->tf.hw_tf);
else if (ctx->type == ROS_SW_CTX)
print_sw_tf(&ctx->tf.sw_tf);
else
printf("Unknown context type %d\n", ctx->type);
}
#endif /* PARLIB_ARCH_VCORE64_H */