| #ifndef PARLIB_ARCH_VCORE64_H |
| #define PARLIB_ARCH_VCORE64_H |
| |
| #ifndef PARLIB_ARCH_VCORE_H |
| #error "Do not include include vcore32.h directly" |
| #endif |
| |
| #include <ros/common.h> |
| #include <ros/trapframe.h> |
| #include <ros/procdata.h> |
| #include <ros/syscall.h> |
| #include <ros/arch/mmu.h> |
| #include <sys/vcore-tls.h> |
| |
| /* Here's how the HW popping works: It sets up the future stack pointer to |
| * have extra stuff after it, and then it pops the registers, then pops the new |
| * context's stack pointer. Then it uses the extra stuff (the new PC is on the |
| * stack, the location of notif_disabled, and a clobbered work register) to |
| * enable notifs, make sure notif IPIs weren't pending, restore the work reg, |
| * and then "ret". |
| * |
| * This is what the target uthread's stack will look like (growing down): |
| * |
| * Target RSP -> | u_thread's old stuff | the future %rsp, tf->tf_rsp |
| * | new rip | 0x08 below %rsp (one slot is 0x08) |
| * | rflags space | 0x10 below |
| * | rdi save space | 0x18 below |
| * | *sysc ptr to syscall | 0x20 below |
| * | notif_pending_loc | 0x28 below |
| * | notif_disabled_loc | 0x30 below |
| * |
| * The important thing is that it can handle a notification after it enables |
| * notifications, and when it gets resumed it can ultimately run the new |
| * context. Enough state is saved in the running context and stack to continue |
| * running. |
| * |
| * Related to that is whether or not our stack pointer is sufficiently far down |
| * so that restarting *this* code won't clobber shit we need later. The way we |
| * do this is that we do any "stack jumping" before we enable interrupts/notifs. |
| * These jumps are when we directly modify rsp, specifically in the down |
| * direction (subtracts). Adds would be okay. |
| * |
| * Another 64-bit concern is the red-zone. The AMD64 ABI allows the use of |
| * space below the stack pointer by regular programs. If we allowed this, we |
| * would clobber that space when we do our TF restarts, much like with OSs and |
| * IRQ handlers. Thus we have the cross compiler automatically disabling the |
| * redzone (-mno-red-zone is a built-in option). |
| * |
| * When compared to the 32 bit code, notice we use rdi, instead of eax, for our |
| * work. This is because rdi is the arg0 of a syscall. Using it saves us some |
| * extra moves, since we need to pop the *sysc before saving any other |
| * registers. */ |
| |
| /* Helper for writing the info we need later to the u_tf's stack.  Note the
|  * fields are in the reverse order of the stack diagram above, since the stack
|  * grows down while struct members grow up in memory. */
| struct restart_helper { |
| void *notif_disab_loc; |
| void *notif_pend_loc; |
| struct syscall *sysc; |
| uint64_t rdi_save; |
| uint64_t rflags; |
| uint64_t rip; |
| }; |
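|
| /* A compile-time cross-check of the layout above.  This is only a sketch: it
|  * assumes a GCC-compatible compiler (for __builtin_offsetof) and C11
|  * _Static_assert; older setups simply skip it via the guard.  The helper
|  * spans 0x30 bytes, so rip lands 0x08 below the target rsp and
|  * notif_disab_loc lands 0x30 below, matching the stack diagram. */
| #if __STDC_VERSION__ >= 201112L
| _Static_assert(sizeof(struct restart_helper) == 0x30,
|                "restart_helper must span the 0x30 bytes below the target rsp");
| _Static_assert(__builtin_offsetof(struct restart_helper, rip) == 0x28,
|                "rip is the highest slot: 0x08 below the target rsp");
| _Static_assert(__builtin_offsetof(struct restart_helper, rdi_save) == 0x18,
|                "rdi_save sits 0x18 below the target rsp");
| #endif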
| |
| /* Static syscall, used for self-notifying. We never wait on it, and we |
| * actually might submit it multiple times in parallel on different cores! |
| * While this may seem dangerous, the kernel needs to be able to handle this |
| * scenario. It's also important that we never wait on this, since for all but |
| * the first call, the DONE flag will be set. (Set once, then never reset) */ |
| extern struct syscall vc_entry; /* in x86/vcore.c */ |
| |
| static inline void pop_hw_tf(struct hw_trapframe *tf, uint32_t vcoreid) |
| { |
| struct restart_helper *rst; |
| struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid]; |
| /* The stuff we need to write will be below the current stack of the utf */ |
| rst = (struct restart_helper*)((void*)tf->tf_rsp - |
| sizeof(struct restart_helper)); |
| /* Fill in the info we'll need later */ |
| rst->notif_disab_loc = &vcpd->notif_disabled; |
| rst->notif_pend_loc = &vcpd->notif_pending; |
| rst->sysc = &vc_entry; |
| rst->rdi_save = 0; /* avoid bugs */ |
| rst->rflags = tf->tf_rflags; |
| rst->rip = tf->tf_rip; |
| |
| asm volatile ("movq %0, %%rsp; " /* jump rsp to the utf */ |
| "popq %%rax; " /* restore registers */ |
| "popq %%rbx; " |
| "popq %%rcx; " |
| "popq %%rdx; " |
| "popq %%rbp; " |
| "popq %%rsi; " |
| "popq %%rdi; " |
| "popq %%r8; " |
| "popq %%r9; " |
| "popq %%r10; " |
| "popq %%r11; " |
| "popq %%r12; " |
| "popq %%r13; " |
| "popq %%r14; " |
| "popq %%r15; " |
| "addq $0x28, %%rsp; " /* move to the rsp slot in the tf */ |
| "popq %%rsp; " /* change to the utf's %rsp */ |
| "subq $0x10, %%rsp; " /* move rsp to below rdi's slot */ |
| "pushq %%rdi; " /* save rdi, will clobber soon */ |
| "subq $0x18, %%rsp; " /* move to notif_dis_loc slot */ |
| "popq %%rdi; " /* load notif_disabled addr */ |
| "movb $0x00, (%%rdi); " /* enable notifications */ |
| /* Need a wrmb() here so the write of enable_notif can't pass |
| * the read of notif_pending (racing with a potential |
| * cross-core call to proc_notify()). */
| "lock addq $0, (%%rdi);" /* LOCK is a CPU mb() */ |
| /* From here down, we can get interrupted and restarted */ |
| "popq %%rdi; " /* get notif_pending status loc */ |
| "testb $0x01, (%%rdi); " /* test if a notif is pending */ |
| "jz 1f; " /* if not pending, skip syscall */ |
| /* Actual syscall. Note we don't wait on the async call */ |
| "popq %%rdi; " /* &sysc, trap arg0 */ |
| "pushq %%rsi; " /* save rax, will be trap arg1 */ |
| "pushq %%rax; " /* save rax, will be trap ret */ |
| "movq $0x1, %%rsi; " /* sending one async syscall: arg1 */ |
| "int %1; " /* fire the syscall */ |
| "popq %%rax; " /* restore regs after syscall */ |
| "popq %%rsi; " |
| "jmp 2f; " /* skip 1:, already popped */ |
| "1: addq $0x08, %%rsp; " /* discard &sysc (on non-sc path) */ |
| "2: popq %%rdi; " /* restore tf's %rdi (both paths) */ |
| "popfq; " /* restore utf's rflags */ |
| "ret; " /* return to the new PC */ |
| : |
| : "g"(&tf->tf_rax), "i"(T_SYSCALL) |
| : "memory"); |
| } |
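|
| /* Another sketch with the same assumptions, pinning the constants hard-coded
|  * in the asm above to struct hw_trapframe: the 15 popq's walk rax through r15
|  * as consecutive quads, and the "addq $0x28" from just past r15 must land
|  * exactly on the tf_rsp slot. */
| #if __STDC_VERSION__ >= 201112L
| _Static_assert(__builtin_offsetof(struct hw_trapframe, tf_r15) -
|                __builtin_offsetof(struct hw_trapframe, tf_rax) == 14 * 8,
|                "rax..r15 must be 15 consecutive quads for the popq sequence");
| _Static_assert(__builtin_offsetof(struct hw_trapframe, tf_rsp) -
|                __builtin_offsetof(struct hw_trapframe, tf_rax) == 0x78 + 0x28,
|                "addq $0x28 from just past r15 must reach the tf_rsp slot");
| #endif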
| |
| static inline void pop_sw_tf(struct sw_trapframe *sw_tf, uint32_t vcoreid) |
| { |
| struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid]; |
| /* Restore callee-saved FPU state. We need to clear exceptions before |
| * reloading the FP CW, in case the new CW unmasks any. We also need to |
| * reset the tag word to clear out the stack. |
| * |
| * The main issue here is that while our context was saved in an |
| * ABI-compliant manner, we may be starting up on a somewhat random FPU
| * state. Having gibberish in registers isn't a big deal, but some of the |
| * FP environment settings could cause trouble. If fnclex; emms isn't |
| * enough, we could also save/restore the entire FP env with fldenv, or do |
| * an fninit before fldcw. */ |
| asm volatile ("ldmxcsr %0" : : "m"(sw_tf->tf_mxcsr)); |
| asm volatile ("fnclex; emms; fldcw %0" : : "m"(sw_tf->tf_fpucw)); |
| /* Basic plan: restore all regs, reading them off rcx, which points into the
|  * sw_tf.  Switch to the new stack, and save the PC so we can jump to it
|  * later.  Use clobberable registers for the locations of sysc, notif_dis,
|  * and notif_pend.  Once on the new stack, we enable notifs, check if we
|  * missed one, and if so, self notify.  Note the syscall clobbers rax. */
| asm volatile ("movq 0x00(%0), %%rbx; " /* restore regs */ |
| "movq 0x08(%0), %%rbp; " |
| "movq 0x10(%0), %%r12; " |
| "movq 0x18(%0), %%r13; " |
| "movq 0x20(%0), %%r14; " |
| "movq 0x28(%0), %%r15; " |
| "movq 0x30(%0), %%r8; " /* save rip in r8 */ |
| "movq 0x38(%0), %%rsp; " /* jump to future stack */ |
| "movb $0x00, (%2); " /* enable notifications */ |
| /* Need a wrmb() here so the write of enable_notif can't pass |
| * the read of notif_pending (racing with a potential |
| * cross-core call to proc_notify()). */
| "lock addq $0, (%2); " /* LOCK is a CPU mb() */ |
| /* From here down, we can get interrupted and restarted */ |
| "testb $0x01, (%3); " /* test if a notif is pending */ |
| "jz 1f; " /* if not pending, skip syscall */ |
| /* Actual syscall. Note we don't wait on the async call. |
| * &vc_entry is already in rdi (trap arg0). */ |
| "movq $0x1, %%rsi; " /* sending one async syscall: arg1 */ |
| "int %4; " /* fire the syscall */ |
| "1: jmp *%%r8; " /* ret saved earlier */ |
| : |
| : "c"(&sw_tf->tf_rbx), |
| "D"(&vc_entry), |
| "S"(&vcpd->notif_disabled), |
| "d"(&vcpd->notif_pending), |
| "i"(T_SYSCALL) |
| : "memory"); |
| } |
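|
| /* Same idea for the SW frame (sketch; same assumptions as above): pop_sw_tf()
|  * reads at fixed offsets from &sw_tf->tf_rbx (0x00 rbx ... 0x30 rip, 0x38
|  * rsp), and save_user_ctx() below writes the matching slots. */
| #if __STDC_VERSION__ >= 201112L
| _Static_assert(__builtin_offsetof(struct sw_trapframe, tf_rip) -
|                __builtin_offsetof(struct sw_trapframe, tf_rbx) == 0x30,
|                "tf_rip must be 0x30 past tf_rbx to match the hard-coded offsets");
| _Static_assert(__builtin_offsetof(struct sw_trapframe, tf_rsp) -
|                __builtin_offsetof(struct sw_trapframe, tf_rbx) == 0x38,
|                "tf_rsp must be 0x38 past tf_rbx to match the hard-coded offsets");
| #endif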
| |
| /* Pops a user context, reenabling notifications at the same time.  A userspace
|  * scheduler can call this when transitioning off the transition stack.
|  *
|  * At some point in vcore context before calling this, you need to clear
|  * notif_pending (do this by calling handle_events()).  As a potential
|  * optimization, consider clearing notif_pending / calling handle_events()
|  * again right before popping.  If notif_pending is still set, this will
|  * self-notify this core, since that means we missed a notification message
|  * while notifs were disabled. */
| static inline void pop_user_ctx(struct user_context *ctx, uint32_t vcoreid) |
| { |
| if (ctx->type == ROS_HW_CTX) |
| pop_hw_tf(&ctx->tf.hw_tf, vcoreid); |
| else |
| pop_sw_tf(&ctx->tf.sw_tf, vcoreid); |
| } |
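|
| /* Hedged usage sketch for the comment above.  run_next_uthread(),
|  * current_uthread, and handle_events() are illustrative stand-ins for
|  * whatever the userspace scheduler provides; only the ordering matters:
|  * drain events, then pop, and let pop_user_ctx() self-notify if
|  * notif_pending gets set again in the window after handle_events(). */
| #if 0	/* illustration only */
| static void run_next_uthread(uint32_t vcoreid)
| {
|         handle_events(vcoreid);         /* handle events, clearing notif_pending */
|         pop_user_ctx(&current_uthread->u_ctx, vcoreid); /* does not return */
| }
| #endif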
| |
| /* Like the regular pop_user_ctx, but this one doesn't check or clear |
| * notif_pending. The only case where we use this is when an IRQ/notif |
| * interrupts a uthread that is in the process of disabling notifs. */ |
| static inline void pop_user_ctx_raw(struct user_context *ctx, uint32_t vcoreid) |
| { |
| struct hw_trapframe *tf = &ctx->tf.hw_tf; |
| assert(ctx->type == ROS_HW_CTX); |
| struct restart_helper *rst; |
| struct preempt_data *vcpd = &__procdata.vcore_preempt_data[vcoreid]; |
| /* The stuff we need to write will be below the current stack of the utf */ |
| rst = (struct restart_helper*)((void*)tf->tf_rsp - |
| sizeof(struct restart_helper)); |
| /* Fill in the info we'll need later */ |
| rst->notif_disab_loc = &vcpd->notif_disabled; |
| rst->rdi_save = 0; /* avoid bugs */ |
| rst->rflags = tf->tf_rflags; |
| rst->rip = tf->tf_rip; |
| |
| asm volatile ("movq %0, %%rsp; " /* jump esp to the utf */ |
| "popq %%rax; " /* restore registers */ |
| "popq %%rbx; " |
| "popq %%rcx; " |
| "popq %%rdx; " |
| "popq %%rbp; " |
| "popq %%rsi; " |
| "popq %%rdi; " |
| "popq %%r8; " |
| "popq %%r9; " |
| "popq %%r10; " |
| "popq %%r11; " |
| "popq %%r12; " |
| "popq %%r13; " |
| "popq %%r14; " |
| "popq %%r15; " |
| "addq $0x28, %%rsp; " /* move to the rsp slot in the tf */ |
| "popq %%rsp; " /* change to the utf's %rsp */ |
| "subq $0x10, %%rsp; " /* move rsp to below rdi's slot */ |
| "pushq %%rdi; " /* save rdi, will clobber soon */ |
| "subq $0x18, %%rsp; " /* move to notif_dis_loc slot */ |
| "popq %%rdi; " /* load notif_disabled addr */ |
| "movb $0x00, (%%rdi); " /* enable notifications */ |
| /* Here's where we differ from the regular pop_user_ctx():
|  * we still adjust rsp, but we don't test or clear
|  * notif_pending, and we don't make a syscall. */
| /* From here down, we can get interrupted and restarted */ |
| "addq $0x10, %%rsp; " /* move to rdi save slot */ |
| "popq %%rdi; " /* restore tf's %rdi */ |
| "popfq; " /* restore utf's rflags */ |
| "ret; " /* return to the new PC */ |
| : |
| : "g"(&tf->tf_rax) |
| : "memory"); |
| } |
| |
| /* Saves a SW context, setting the PC to the end of this function. We only
| * save callee-saved registers (of the sysv abi). The compiler knows to save |
| * the others via the input/clobber lists. |
| * |
| * Callers of this function need to have at least one |
| * 'calling-convention-compliant' function call between this and any floating |
| * point, so that the compiler saves any caller-saved FP before getting to |
| * here. |
| * |
| * To some extent, TLS is 'callee-saved', in that no one ever expects it to |
| * change. We handle uthread TLS changes separately, since we often change to |
| * them early to set some variables. Arguably we should do this differently. */
| static inline void __attribute__((always_inline, returns_twice))
| save_user_ctx(struct user_context *ctx)
| { |
| struct sw_trapframe *sw_tf = &ctx->tf.sw_tf; |
| long dummy; |
| ctx->type = ROS_SW_CTX; |
| asm volatile ("stmxcsr %0" : "=m"(sw_tf->tf_mxcsr)); |
| asm volatile ("fnstcw %0" : "=m"(sw_tf->tf_fpucw)); |
| /* Pretty simple: save all the regs, IAW the sys-v ABI */ |
| asm volatile("mov %%rsp, 0x48(%0); " /* save rsp in its slot*/ |
| "leaq 1f, %%rax; " /* get future rip */ |
| "mov %%rax, 0x40(%0); " /* save rip in its slot*/ |
| "mov %%r15, 0x38(%0); " |
| "mov %%r14, 0x30(%0); " |
| "mov %%r13, 0x28(%0); " |
| "mov %%r12, 0x20(%0); " |
| "mov %%rbp, 0x18(%0); " |
| "mov %%rbx, 0x10(%0); " |
| "1: " /* where this tf will restart */ |
| : "=D"(dummy) /* force clobber for rdi */ |
| : "D"(sw_tf) |
| : "rax", "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11", |
| "memory", "cc"); |
| }
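|
| /* Hedged sketch of the setjmp-like pattern this enables: the first "return"
|  * falls through and yields; when the saved context is later popped (e.g. via
|  * pop_user_ctx()), execution resumes right after the save and takes the early
|  * return.  struct uthread, enqueue_for_later(), and enter_vcore_context() are
|  * illustrative names, not definitions from this header. */
| #if 0	/* illustration only */
| static void yield_example(struct uthread *uth, uint32_t vcoreid)
| {
|         volatile bool saved = FALSE;    /* lives on this uthread's stack */
|
|         save_user_ctx(&uth->u_ctx);
|         if (saved)
|                 return;                 /* restarted via a later pop */
|         saved = TRUE;                   /* the restart path will see this */
|         enqueue_for_later(uth);
|         enter_vcore_context(vcoreid);   /* leaves this stack; never returns here */
| }
| #endif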
| |
| /* The old version, kept around for testing */ |
| /* Hasn't been used yet for 64 bit. If you use this, it's worth checking to |
| * make sure rax isn't selected for 0, 1, or 2. (and we probably don't need to |
| * save rax in the beginning) */ |
| static inline void __attribute__((always_inline, returns_twice))
| save_user_ctx_hw(struct user_context *ctx)
| { |
| struct hw_trapframe *tf = &ctx->tf.hw_tf; |
| ctx->type = ROS_HW_CTX; |
| memset(tf, 0, sizeof(struct hw_trapframe)); /* sanity */ |
| /* set CS and make sure rflags is okay */
| tf->tf_cs = GD_UT | 3; |
| tf->tf_rflags = 0x200; /* interrupts enabled. bare minimum rflags. */ |
| /* Save the regs and the future rsp. */ |
| asm volatile("movq %%rsp, (%0); " /* save rsp in it's slot*/ |
| "pushq %%rax; " /* temp save rax */ |
| "leaq 1f, %%rax; " /* get future rip */ |
| "movq %%rax, (%1); " /* store future rip */ |
| "popq %%rax; " /* restore rax */ |
| "movq %2, %%rsp; " /* move to the rax slot of the tf */ |
| "addl $0x78,%%esp; " /* move to just past r15 */ |
| "pushq %%r15; " /* save regs */ |
| "pushq %%r14; " |
| "pushq %%r13; " |
| "pushq %%r12; " |
| "pushq %%r11; " |
| "pushq %%r10; " |
| "pushq %%r9; " |
| "pushq %%r8; " |
| "pushq %%rdi; " |
| "pushq %%rsi; " |
| "pushq %%rbp; " |
| "pushq %%rdx; " |
| "pushq %%rcx; " |
| "pushq %%rbx; " |
| "pushq %%rax; " |
| "addq $0xa0, %%rsp; " /* move to rsp slot */ |
| "popq %%rsp; " /* restore saved/original rsp */ |
| "1: " /* where this tf will restart */ |
| : |
| : "g"(&tf->tf_rsp), "g"(&tf->tf_rip), "g"(tf->tf_rax) |
| : "rax", "memory", "cc"); |
| }
| |
| static inline void init_user_ctx(struct user_context *ctx, uintptr_t entry_pt, |
| uintptr_t stack_top) |
| { |
| struct sw_trapframe *sw_tf = &ctx->tf.sw_tf; |
| ctx->type = ROS_SW_CTX; |
| /* Stack pointers in a fresh stackframe need to be such that adding or |
| * subtracting 8 will result in 16 byte alignment (AMD64 ABI). The reason |
| * is so that input arguments (on the stack) are 16 byte aligned. The |
| * extra 8 bytes is the retaddr, pushed on the stack. Compilers know they |
| * can subtract 8 to get 16 byte alignment for instructions like movaps. */ |
| sw_tf->tf_rsp = ROUNDDOWN(stack_top, 16) - 8; |
| sw_tf->tf_rip = entry_pt; |
| sw_tf->tf_rbp = 0; /* for potential backtraces */ |
| /* No need to bother with setting the other GP registers; the called |
| * function won't care about their contents. */ |
| sw_tf->tf_mxcsr = 0x00001f80; /* x86 default mxcsr */ |
| sw_tf->tf_fpucw = 0x037f; /* x86 default FP CW */ |
| } |
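|
| /* Hedged usage sketch: setting up a fresh context for a new uthread.  The
|  * names (uth, thread_entry, stack_bottom, stack_sz) are illustrative.  Pass
|  * the high end of the stack; the helper aligns rsp so thread_entry starts as
|  * if it had just been called (rsp % 16 == 8). */
| #if 0	/* illustration only */
|         init_user_ctx(&uth->u_ctx, (uintptr_t)thread_entry,
|                       (uintptr_t)stack_bottom + stack_sz);
| #endif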
| |
| /* This is how we get our vcore id on entry: the kernel passes it in rax. */
| #define __vcore_id_on_entry \ |
| ({ \ |
| register int temp asm ("rax"); \ |
| temp; \ |
| }) |
| |
| /* For debugging. */ |
| #include <stdio.h> |
| static void print_hw_tf(struct hw_trapframe *hw_tf) |
| { |
| printf("[user] HW TRAP frame 0x%016x\n", hw_tf); |
| printf(" rax 0x%016lx\n", hw_tf->tf_rax); |
| printf(" rbx 0x%016lx\n", hw_tf->tf_rbx); |
| printf(" rcx 0x%016lx\n", hw_tf->tf_rcx); |
| printf(" rdx 0x%016lx\n", hw_tf->tf_rdx); |
| printf(" rbp 0x%016lx\n", hw_tf->tf_rbp); |
| printf(" rsi 0x%016lx\n", hw_tf->tf_rsi); |
| printf(" rdi 0x%016lx\n", hw_tf->tf_rdi); |
| printf(" r8 0x%016lx\n", hw_tf->tf_r8); |
| printf(" r9 0x%016lx\n", hw_tf->tf_r9); |
| printf(" r10 0x%016lx\n", hw_tf->tf_r10); |
| printf(" r11 0x%016lx\n", hw_tf->tf_r11); |
| printf(" r12 0x%016lx\n", hw_tf->tf_r12); |
| printf(" r13 0x%016lx\n", hw_tf->tf_r13); |
| printf(" r14 0x%016lx\n", hw_tf->tf_r14); |
| printf(" r15 0x%016lx\n", hw_tf->tf_r15); |
| printf(" trap 0x%08x\n", hw_tf->tf_trapno); |
| printf(" gsbs 0x%016lx\n", hw_tf->tf_gsbase); |
| printf(" fsbs 0x%016lx\n", hw_tf->tf_fsbase); |
| printf(" err 0x--------%08x\n", hw_tf->tf_err); |
| printf(" rip 0x%016lx\n", hw_tf->tf_rip); |
| printf(" cs 0x------------%04x\n", hw_tf->tf_cs); |
| printf(" flag 0x%016lx\n", hw_tf->tf_rflags); |
| printf(" rsp 0x%016lx\n", hw_tf->tf_rsp); |
| printf(" ss 0x------------%04x\n", hw_tf->tf_ss); |
| } |
| |
| static void print_sw_tf(struct sw_trapframe *sw_tf) |
| { |
| printf("[user] SW TRAP frame 0x%016p\n", sw_tf); |
| printf(" rbx 0x%016lx\n", sw_tf->tf_rbx); |
| printf(" rbp 0x%016lx\n", sw_tf->tf_rbp); |
| printf(" r12 0x%016lx\n", sw_tf->tf_r12); |
| printf(" r13 0x%016lx\n", sw_tf->tf_r13); |
| printf(" r14 0x%016lx\n", sw_tf->tf_r14); |
| printf(" r15 0x%016lx\n", sw_tf->tf_r15); |
| printf(" gsbs 0x%016lx\n", sw_tf->tf_gsbase); |
| printf(" fsbs 0x%016lx\n", sw_tf->tf_fsbase); |
| printf(" rip 0x%016lx\n", sw_tf->tf_rip); |
| printf(" rsp 0x%016lx\n", sw_tf->tf_rsp); |
| printf(" mxcsr 0x%08x\n", sw_tf->tf_mxcsr); |
| printf(" fpucw 0x%04x\n", sw_tf->tf_fpucw); |
| } |
| |
| static void print_user_context(struct user_context *ctx) |
| { |
| if (ctx->type == ROS_HW_CTX) |
| print_hw_tf(&ctx->tf.hw_tf); |
| else if (ctx->type == ROS_SW_CTX) |
| print_sw_tf(&ctx->tf.sw_tf); |
| else |
| printf("Unknown context type %d\n", ctx->type); |
| } |
| |
| #endif /* PARLIB_ARCH_VCORE64_H */ |