x86: Use P-states and C-states (XCC)

To use turbo mode, we need to both set the fastest P-state, which is
the turbo mode ratio, and have the halted cores enter a deep enough
C-state to allow the hardware to boost the 'active' cores.  To halt in
any C-state deeper than C1, you need to use mwait.

For those curious, you can see the max ratio available, given the
number of active cores, in MSR_TURBO_RATIO_LIMIT.  For instance, on my
Haswell, I get something like:

/ $ rdmsr 0x1ad
Core 0, MSR 0x000001ad: 0x1a1a1b1c1d1e2020

That means that if you have 0-1 cores active, they can each get to
0x20.  Two cores, 0x1e.  Etc.  If all cores are active, it's 0x1a.
(There are other MSRs for the additional cores, but they are all 0x1a).

Those ratios are multiplied by the bus freq, 100 MHz in this case.  So
this means the top-end for Turbo mode is 3.2 GHz.  If all the cores are
running, each core maxes out at 2.6 GHz.  The default at boot is 0x18,
which is 2.4 GHz, and also happens to be the (invariant) TSC frequency.

This commit adds a very basic infrastructure for managing P-states and
C-states, and it uses mwait to halt when available.  By default, every
core will be set to the fastest P-state and the shallowest sleep that
still allows Turbo mode.  I think.  I'll provide an interface via
devarch for users to tweak this however they'd like.

As future work, we can add something like Linux's idle driver and/or
acpi-cpufreq driver.  Or we can just leave it to userspace.

Reinstall your kernel headers.

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
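---

A quick sketch, not part of the commit, of how the TURBO_RATIO_LIMIT
bytes decode, using the example value above.  It assumes the
byte-per-active-core-count layout described in the commit message and a
100 MHz bus clock; both vary by processor, so treat it as illustrative
only.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Raw MSR value from the rdmsr example above. */
	uint64_t turbo_ratio_limit = 0x1a1a1b1c1d1e2020ULL;
	/* Assumed bus clock; 100 MHz on this Haswell. */
	unsigned int bus_mhz = 100;

	for (int i = 0; i < 8; i++) {
		/* Assumption: byte i is the max ratio with i + 1 cores active. */
		uint8_t ratio = (turbo_ratio_limit >> (i * 8)) & 0xff;

		printf("%d core(s) active: ratio 0x%02x -> %u MHz\n",
		       i + 1, ratio, ratio * bus_mhz);
	}
	return 0;
}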
diff --git a/kern/arch/x86/Kbuild b/kern/arch/x86/Kbuild
index 8bb34b2..142531c 100644
--- a/kern/arch/x86/Kbuild
+++ b/kern/arch/x86/Kbuild
@@ -6,6 +6,7 @@
 obj-y += devarch.o
 obj-y += entry64.o
 obj-y += frontend.o
+obj-y += idle.o
 obj-y += init.o
 obj-y += intel.o
 obj-y += ioapic.o
diff --git a/kern/arch/x86/arch.h b/kern/arch/x86/arch.h
index 20efdf9..2326d5d 100644
--- a/kern/arch/x86/arch.h
+++ b/kern/arch/x86/arch.h
@@ -24,7 +24,6 @@
 static inline void disable_irqsave(int8_t *state)
 	__attribute__((always_inline));
 static inline void cpu_relax(void) __attribute__((always_inline));
-static inline void cpu_halt(void) __attribute__((always_inline));
 static inline void clflush(uintptr_t* addr) __attribute__((always_inline));
 static inline int irq_is_enabled(void) __attribute__((always_inline));
 static inline void cache_flush(void) __attribute__((always_inline));
@@ -44,6 +43,8 @@
 void invlpg(void *addr);
 void tlbflush(void);
 void tlb_flush_global(void);
+/* idle.c */
+void cpu_halt(void);
 
 static inline void breakpoint(void)
 {
@@ -130,13 +131,6 @@
 	__cpu_relax();
 }
 
-/* This atomically enables interrupts and halts.  sti does not take effect until
- * after the *next* instruction */
-static inline void cpu_halt(void)
-{
-	asm volatile("sti; hlt" : : : "memory");
-}
-
 static inline void clflush(uintptr_t* addr)
 {
 	asm volatile("clflush %0" : : "m"(*addr));
diff --git a/kern/arch/x86/cpuinfo.c b/kern/arch/x86/cpuinfo.c
index cbd8e21..d72cb17 100644
--- a/kern/arch/x86/cpuinfo.c
+++ b/kern/arch/x86/cpuinfo.c
@@ -175,6 +175,8 @@
 	#define CPUID_FXSR_SUPPORT		(1 << 24)
 	#define CPUID_XSAVE_SUPPORT		(1 << 26)
 	#define CPUID_XSAVEOPT_SUPPORT	(1 << 0)
+	#define CPUID_MONITOR_MWAIT		(1 << 3)
+	#define CPUID_MWAIT_PWR_MGMT	(1 << 0)
 
 	cpuid(0x01, 0x00, 0, 0, &ecx, &edx);
 	if (CPUID_FXSR_SUPPORT & edx)
@@ -186,6 +188,12 @@
 	if (CPUID_XSAVEOPT_SUPPORT & eax)
 		cpu_set_feat(CPU_FEAT_X86_XSAVEOPT);
 
+	cpuid(0x01, 0x00, 0, 0, &ecx, 0);
+	if (CPUID_MONITOR_MWAIT & ecx) {
+		cpuid(0x05, 0x00, 0, 0, &ecx, 0);
+		if (CPUID_MWAIT_PWR_MGMT & ecx)
+			cpu_set_feat(CPU_FEAT_X86_MWAIT);
+	}
 }
 
 #define BIT_SPACING " "
diff --git a/kern/arch/x86/idle.c b/kern/arch/x86/idle.c
new file mode 100644
index 0000000..523fdb1
--- /dev/null
+++ b/kern/arch/x86/idle.c
@@ -0,0 +1,77 @@
+#include <arch/arch.h>
+#include <arch/x86.h>
+#include <arch/mmu.h>
+#include <cpu_feat.h>
+#include <arch/uaccess.h>
+
+static unsigned int x86_cstate;
+
+/* This atomically enables interrupts and halts.
+ *
+ * Note that sti does not take effect until after the *next* instruction */
+void cpu_halt(void)
+{
+	if (cpu_has_feat(CPU_FEAT_X86_MWAIT)) {
+		/* TODO: since we're monitoring anyway, x86 could use monitor/mwait for
+		 * KMSGs, instead of relying on IPIs.  (Maybe only for ROUTINE). */
+		asm volatile("monitor" : : "a"(KERNBASE), "c"(0), "d"(0));
+		asm volatile("sti; mwait" : : "c"(0x0), "a"(x86_cstate) : "memory");
+	} else {
+		asm volatile("sti; hlt" : : : "memory");
+	}
+}
+
+void set_pstate(unsigned int pstate)
+{
+	uint64_t perf_ctl;
+
+	/* This MSR was introduced in 0f_03 (family/model), so checking cpuid should
+	 * suffice.  Though my Qemu says it is a later generation and still fails to
+	 * support it (patches pending, I hear). */
+	if (read_msr_safe(MSR_IA32_PERF_CTL, &perf_ctl))
+		return;
+	/* The p-state ratio is actually at 15:8, AFAIK, for both PERF_CTL and
+	 * PERF_STATUS.  Not sure what the lower byte represents.  It's probably
+	 * processor specific. */
+	perf_ctl &= ~0xff00ULL;
+	perf_ctl |= pstate << 8;
+	write_msr_safe(MSR_IA32_PERF_CTL, perf_ctl);
+}
+
+void set_fastest_pstate(void)
+{
+	uint64_t turbo_ratio_limit;
+
+	/* Support for TURBO_RATIO_LIMIT varies from processor to processor.  In
+	 * lieu of a full per-model driver, we can just take a peek. */
+	if (read_msr_safe(MSR_TURBO_RATIO_LIMIT, &turbo_ratio_limit))
+		return;
+	/* The lowest byte is the max turbo ratio achievable by one active core. */
+	set_pstate(turbo_ratio_limit & 0xff);
+}
+
+/* This returns the desired pstate; the actual ratio might be lower than
+ * desired if other cores are active. */
+unsigned int get_pstate(void)
+{
+	uint64_t perf_ctl;
+
+	if (read_msr_safe(MSR_IA32_PERF_CTL, &perf_ctl))
+		return 0;
+	return (perf_ctl & 0xff00) >> 8;
+}
+
+void set_cstate(unsigned int cstate)
+{
+	/* No real need to lock for an assignment.  Any core can set this, and other
+	 * cores will notice the next time they halt. */
+	x86_cstate = cstate;
+}
+
+unsigned int get_cstate(void)
+{
+	/* We won't be able to use anything deeper than C1 without MWAIT */
+	if (!cpu_has_feat(CPU_FEAT_X86_MWAIT))
+		return X86_MWAIT_C1;
+	return x86_cstate;
+}
diff --git a/kern/arch/x86/ros/cpu_feat.h b/kern/arch/x86/ros/cpu_feat.h
index c6faf31..2ecf9b1 100644
--- a/kern/arch/x86/ros/cpu_feat.h
+++ b/kern/arch/x86/ros/cpu_feat.h
@@ -16,4 +16,5 @@
 #define CPU_FEAT_X86_XSAVE		(__CPU_FEAT_ARCH_START + 3)
 #define CPU_FEAT_X86_XSAVEOPT		(__CPU_FEAT_ARCH_START + 4)
 #define CPU_FEAT_X86_FSGSBASE		(__CPU_FEAT_ARCH_START + 5)
+#define CPU_FEAT_X86_MWAIT		(__CPU_FEAT_ARCH_START + 6)
 #define __NR_CPU_FEAT			(__CPU_FEAT_ARCH_START + 64)
diff --git a/kern/arch/x86/ros/msr-index.h b/kern/arch/x86/ros/msr-index.h
index 5ae4835..69456d4 100644
--- a/kern/arch/x86/ros/msr-index.h
+++ b/kern/arch/x86/ros/msr-index.h
@@ -73,6 +73,7 @@
 #define MSR_OFFCORE_RSP_0		0x000001a6
 #define MSR_OFFCORE_RSP_1		0x000001a7
 #define MSR_NHM_TURBO_RATIO_LIMIT	0x000001ad
+#define MSR_TURBO_RATIO_LIMIT		0x000001ad
 #define MSR_IVT_TURBO_RATIO_LIMIT	0x000001ae
 
 #define MSR_LBR_SELECT			0x000001c8
diff --git a/kern/arch/x86/smp_boot.c b/kern/arch/x86/smp_boot.c
index efb11e2..e1f1f15 100644
--- a/kern/arch/x86/smp_boot.c
+++ b/kern/arch/x86/smp_boot.c
@@ -324,4 +324,12 @@
 	perfmon_pcpu_init();
 	vmm_pcpu_init();
 	lcr4(rcr4() & ~CR4_TSD);
+
+	/* This should allow turbo mode.  I haven't found a doc that says how deep
+	 * we need to sleep.  At a minimum on some machines, it's C2.  Given that
+	 * "C2 or deeper" pops up in a few other areas as a deeper sleep (e.g. an
+	 * mwait won't wake on memory accesses from outside the processor at C2 or
+	 * deeper), this might be deep enough for turbo mode to kick in. */
+	set_fastest_pstate();
+	set_cstate(X86_MWAIT_C2);
 }
diff --git a/kern/arch/x86/x86.h b/kern/arch/x86/x86.h
index 9c49ef0..2331e42 100644
--- a/kern/arch/x86/x86.h
+++ b/kern/arch/x86/x86.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <ros/common.h>
+#include <ros/arch/msr-index.h>
 #include <arch/mmu.h>
 #include <ros/errno.h>
 #include <arch/fixup.h>
@@ -132,6 +133,16 @@
 #define X86_CR4_SMEP	0x00100000	/* enable SMEP support */
 #define X86_CR4_SMAP	0x00200000	/* enable SMAP support */
 
+/* MWAIT C-state hints.  The names might not be right for different processors.
+ * For instance, the Linux idle driver for a Haswell calls the mwait for 0x10
+ * "C3-HSW". */
+#define X86_MWAIT_C1	0x00
+#define X86_MWAIT_C2	0x10
+#define X86_MWAIT_C3	0x20
+#define X86_MWAIT_C4	0x30
+#define X86_MWAIT_C5	0x40
+#define X86_MWAIT_C6	0x50
+
 /*
  * x86-64 Task Priority Register, CR8
  */
@@ -193,6 +204,12 @@
 static inline void wbinvd(void) __attribute__((always_inline));
 static inline void __cpu_relax(void) __attribute__((always_inline));
 
+void set_pstate(unsigned int pstate);
+void set_fastest_pstate(void);
+unsigned int get_pstate(void);
+void set_cstate(unsigned int cstate);
+unsigned int get_cstate(void);
+
 static inline uint8_t inb(int port)
 {
 	uint8_t data;