blob: 7934257121fbbe1067f5734673d41de2b2e648f5 [file] [log] [blame]
#include <stdio.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <parlib/arch/arch.h>
#include <parlib/ros_debug.h>
#include <unistd.h>
#include <gelf.h>
#include <errno.h>
#include <libelf.h>
#include <dirent.h>
#include <stdlib.h>
#include <string.h>
#include <ros/syscall.h>
#include <sys/mman.h>
#include <vmm/vmm.h>
#include <vmm/acpi/acpi.h>
#include <ros/arch/mmu.h>
#include <ros/arch/membar.h>
#include <ros/vmm.h>
#include <parlib/uthread.h>
#include <vmm/linux_bootparam.h>
#include <getopt.h>
#include <parlib/alarm.h>
#include <vmm/virtio.h>
#include <vmm/virtio_blk.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/virtio_console.h>
#include <vmm/virtio_net.h>
#include <vmm/virtio_lguest_console.h>
#include <vmm/sched.h>
#include <vmm/net.h>
#include <sys/eventfd.h>
#include <sys/uio.h>
#include <parlib/opts.h>
/* The one virtual machine this process drives.  'vm' is the handle used
 * throughout this file; it points at the statically allocated local_vm,
 * whose mutex is set up by the static initializer. */
struct virtual_machine local_vm = {.mtx = UTH_MUTEX_INIT},
*vm = &local_vm;
/* Per-guest-core init parameters.  main() allocates the array,
 * alloc_intr_pages() fills it in, and main() frees it after vmm_init(). */
struct vmm_gpcore_init *gpcis;
/* Declared here but not referenced in the portion of the file visible in
 * this review; the definition lives elsewhere. */
void vapic_status_dump(FILE *f, void *vapic);
/* Compiler guard around the BITOP_ADDR inline-asm operand helper.
 * NOTE(review): the condition rejects GCC older than 4.1, but the error
 * text names 4.4.0 -- confirm which minimum version is intended. */
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
#error "Get a gcc newer than 4.4.0"
#else
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#endif
/* Interrupt hook shared by all virtio MMIO devices in this file.
 *
 * vec:  interrupt vector to deliver to the guest.
 * dest: a guest core id below vm->nr_gpcs for a single target, or
 *       0xffffffff to deliver to every guest core.  Any other value is
 *       reported through panic(). */
static void virtio_poke_guest(uint8_t vec, uint32_t dest)
{
	int gpc;

	if (dest >= vm->nr_gpcs) {
		/* Not a core id, so the only acceptable value is the
		 * broadcast destination. */
		if (dest != 0xffffffff)
			panic("INVALID DESTINATION: 0x%02x\n", dest);
		gpc = 0;
		while (gpc < vm->nr_gpcs) {
			vmm_interrupt_guest(vm, gpc, vec);
			gpc++;
		}
		return;
	}
	/* Unicast to one guest core. */
	vmm_interrupt_guest(vm, dest, vec);
}
/* MMIO transport for the virtio console.  Guest interrupts are raised
 * through virtio_poke_guest; main() fills in .addr and .vqdev and
 * registers the device in vm->virtio_mmio_devices. */
static struct virtio_mmio_dev cons_mmio_dev = {
.poke_guest = virtio_poke_guest,
};
/* Console config-space storage, zero-initialized.  cons_cfg is wired to
 * .cfg and cons_cfg_d to .cfg_d below (the exact role of each copy is
 * defined by the virtio_vq_dev code -- see vmm/virtio.h). */
static struct virtio_console_config cons_cfg;
static struct virtio_console_config cons_cfg_d;
/* The virtio console device: a receive queue and a transmit queue, each
 * limited to 64 entries, serviced by cons_receiveq_fn and
 * cons_transmitq_fn respectively. */
static struct virtio_vq_dev cons_vqdev = {
.name = "console",
.dev_id = VIRTIO_ID_CONSOLE,
/* Offered features: VERSION_1 and indirect descriptors.
 * NOTE(review): the second term shifts a plain int while the first uses
 * 1ULL; fine as long as the bit number stays below 31. */
.dev_feat =
(1ULL << VIRTIO_F_VERSION_1) | (1 << VIRTIO_RING_F_INDIRECT_DESC),
.num_vqs = 2,
.cfg = &cons_cfg,
.cfg_d = &cons_cfg_d,
.cfg_sz = sizeof(struct virtio_console_config),
.transport_dev = &cons_mmio_dev,
.vqs = {
{
.name = "cons_receiveq",
.qnum_max = 64,
.srv_fn = cons_receiveq_fn,
/* Back-pointer to the owning device. */
.vqdev = &cons_vqdev
},
{
.name = "cons_transmitq",
.qnum_max = 64,
.srv_fn = cons_transmitq_fn,
.vqdev = &cons_vqdev
},
}
};
/* MMIO transport for the virtio network device.  Guest interrupts are
 * raised through virtio_poke_guest; main() fills in .addr and .vqdev and
 * registers the device in vm->virtio_mmio_devices. */
static struct virtio_mmio_dev net_mmio_dev = {
.poke_guest = virtio_poke_guest,
};
/* Network config-space storage; both copies advertise a single
 * virtqueue pair.  net_cfg is wired to .cfg and net_cfg_d to .cfg_d
 * below. */
static struct virtio_net_config net_cfg = {
.max_virtqueue_pairs = 1
};
static struct virtio_net_config net_cfg_d = {
.max_virtqueue_pairs = 1
};
/* The virtio network device: one receive and one transmit queue, each
 * limited to 64 entries, serviced by net_receiveq_fn and
 * net_transmitq_fn.  main() hands this struct to vnet_init(). */
static struct virtio_vq_dev net_vqdev = {
.name = "network",
.dev_id = VIRTIO_ID_NET,
/* Offered features: VERSION_1 and NET_F_MAC. */
.dev_feat = (1ULL << VIRTIO_F_VERSION_1 | 1 << VIRTIO_NET_F_MAC),
.num_vqs = 2,
.cfg = &net_cfg,
.cfg_d = &net_cfg_d,
.cfg_sz = sizeof(struct virtio_net_config),
.transport_dev = &net_mmio_dev,
.vqs = {
{
.name = "net_receiveq",
.qnum_max = 64,
.srv_fn = net_receiveq_fn,
/* Back-pointer to the owning device. */
.vqdev = &net_vqdev
},
{
.name = "net_transmitq",
.qnum_max = 64,
.srv_fn = net_transmitq_fn,
.vqdev = &net_vqdev
},
}
};
/* MMIO transport for the virtio block device.  Guest interrupts are
 * raised through virtio_poke_guest.  main() only fills in .addr/.vqdev
 * and registers the device when a disk image file was given. */
static struct virtio_mmio_dev blk_mmio_dev = {
.poke_guest = virtio_poke_guest,
};
/* Block config-space storage, left zero-initialized here.  blk_cfg is
 * wired to .cfg and blk_cfg_d to .cfg_d below. */
static struct virtio_blk_config blk_cfg = {
};
static struct virtio_blk_config blk_cfg_d = {
};
/* The virtio block device: a single request queue of at most 64
 * entries, serviced by blk_request.  main() passes this struct to
 * blk_init_fn() together with the disk image path. */
static struct virtio_vq_dev blk_vqdev = {
.name = "block",
.dev_id = VIRTIO_ID_BLOCK,
/* Only VERSION_1 is offered. */
.dev_feat = (1ULL << VIRTIO_F_VERSION_1),
.num_vqs = 1,
.cfg = &blk_cfg,
.cfg_d = &blk_cfg_d,
.cfg_sz = sizeof(struct virtio_blk_config),
.transport_dev = &blk_mmio_dev,
.vqs = {
{
.name = "blk_request",
.qnum_max = 64,
.srv_fn = blk_request,
/* Back-pointer to the owning device. */
.vqdev = &blk_vqdev
},
}
};
/* Line callback for parse_opts_file(): applies one vnet option.
 *
 * Bare keywords ("snoop", "map_diagnostics", "real_address") switch the
 * matching vnet flag on.  Anything else must look like "key = value";
 * the only numeric key handled here is "nat_timeout".  Unrecognized
 * lines are ignored.  The line buffer is modified in place. */
static void __parse_vnet_opts(char *_line)
{
	char *value, *blank;

	/* Boolean switches are matched against the whole line. */
	if (strcmp(_line, "snoop") == 0) {
		vnet_snoop = TRUE;
	} else if (strcmp(_line, "map_diagnostics") == 0) {
		vnet_map_diagnostics = TRUE;
	} else if (strcmp(_line, "real_address") == 0) {
		vnet_real_ip_addrs = TRUE;
	} else if ((value = strchr(_line, '=')) != NULL) {
		/* Split "key=value" at the '='. */
		*value = 0;
		value++;
		/* Cut the key at every space, last one first, until none is
		 * left.  Spaces after the '=' are left for atoi to skip. */
		for (blank = strrchr(_line, ' '); blank;
		     blank = strrchr(_line, ' '))
			*blank = 0;
		if (strcmp(_line, "nat_timeout") == 0)
			vnet_nat_timeout = atoi(value);
	}
}
/* Runs every line of the options file 'net_opts' through
 * __parse_vnet_opts.  A non-zero result from parse_opts_file() is
 * reported with perror(); nothing is returned to the caller. */
static void set_vnet_opts(char *net_opts)
{
	int rc = parse_opts_file(net_opts, __parse_vnet_opts);

	if (rc != 0)
		perror("parse opts file");
}
/* Line callback for parse_opts_file(): sets up one port forwarding.
 *
 * Expected form is "port:<proto>:<host_port>:<guest_port>".  Lines whose
 * first ':'-separated field is not "port" are silently skipped; a "port"
 * line with a missing field is reported on stderr and dropped.  The line
 * buffer is tokenized in place. */
static void __parse_vnet_port_fwds(char *_line)
{
	char *cursor = 0;
	char *keyword, *proto, *host_port, *guest_port;

	keyword = strtok_r(_line, ":", &cursor);
	if (!keyword || strcmp(keyword, "port") != 0)
		return;

	proto = strtok_r(NULL, ":", &cursor);
	if (!proto) {
		fprintf(stderr, "%s, port with no proto!", __func__);
		return;
	}

	host_port = strtok_r(NULL, ":", &cursor);
	if (!host_port) {
		fprintf(stderr, "%s, port with no host port!", __func__);
		return;
	}

	guest_port = strtok_r(NULL, ":", &cursor);
	if (!guest_port) {
		fprintf(stderr, "%s, port with no guest port!", __func__);
		return;
	}

	vnet_port_forward(proto, host_port, guest_port);
}
/* Runs every line of the options file 'net_opts' through
 * __parse_vnet_port_fwds.  A non-zero result from parse_opts_file() is
 * reported with perror(); nothing is returned to the caller. */
static void set_vnet_port_fwds(char *net_opts)
{
	int rc = parse_opts_file(net_opts, __parse_vnet_port_fwds);

	if (rc != 0)
		perror("parse opts file");
}
/* We map the APIC-access page, the per core Virtual APIC page and the
* per core Posted Interrupt Descriptors.
* Note: check if the PID/PIR needs to be a 4k page. */
void alloc_intr_pages(void)
{
void *a_page;
void *pages, *pir;
a_page = mmap((void *)APIC_GPA, PGSIZE, PROT_READ | PROT_WRITE,
MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
fprintf(stderr, "a_page mmap pointer %p\n", a_page);
if (a_page != (void *)APIC_GPA) {
perror("Could not mmap APIC");
exit(1);
}
/* The VM should never actually read from this page. */
for (int i = 0; i < PGSIZE/4; i++)
((uint32_t *)a_page)[i] = 0xDEADBEEF;
/* Allocate VAPIC and PIR pages. */
pages = mmap((void*)0, vm->nr_gpcs * 2 * PGSIZE, PROT_READ | PROT_WRITE,
MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (pages == MAP_FAILED) {
perror("Unable to map VAPIC and PIR pages.");
exit(1);
}
/* We use the first vm->nr_gpcs pages for the VAPIC, and the second set
* for the PIRs. Each VAPIC and PIR gets its own 4k page. */
pir = pages + (vm->nr_gpcs * PGSIZE);
/* Set the addresses in the gpcis. These gpcis get copied into the
* guest_threads during their construction. */
for (int i = 0; i < vm->nr_gpcs; i++) {
gpcis[i].posted_irq_desc = pir + (PGSIZE * i);
gpcis[i].vapic_addr = pages + (PGSIZE * i);
gpcis[i].apic_addr = a_page;
gpcis[i].fsbase = 0;
gpcis[i].gsbase = 0;
/* Set APIC ID. */
((uint32_t *)gpcis[i].vapic_addr)[0x20/4] = i;
/* Set APIC VERSION. */
((uint32_t *)gpcis[i].vapic_addr)[0x30/4] = 0x01060015;
/* Set LOGICAL APIC ID. */
((uint32_t *)gpcis[i].vapic_addr)[0xD0/4] = 1 << i;
}
}
/* Debug aid / temporary workaround: walks every guest core and, for each
 * one whose virtual APIC page holds a non-zero timer vector (word 0x32,
 * low byte) and a non-zero initial count (word 0x38), injects that
 * vector.  'args' is unused; always returns 0. */
void *inject_timer_spurious(void *args)
{
	for (int gpc = 0; gpc < vm->nr_gpcs; gpc++) {
		struct vmm_gpcore_init *gpci =
			gth_to_gpci(gpcid_to_gth(vm, gpc));
		uint32_t *vapic = (uint32_t *)gpci->vapic_addr;
		uint8_t timer_vec = vapic[0x32] & 0xff;
		uint32_t count = vapic[0x38];

		if (!count || !timer_vec)
			continue;
		vmm_interrupt_guest(vm, gpc, timer_vec);
	}
	return 0;
}
/* This injects the timer interrupt to the guest. */
void *inject_timer(void *args)
{
struct guest_thread *gth = (struct guest_thread*)args;
struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
uint8_t vector = ((uint32_t *)gpci->vapic_addr)[0x32] & 0xff;
vmm_interrupt_guest(vm, gth->gpc_id, vector);
return 0;
}
/* Alarm callback for a guest core's timer.  waiter->data is the core's
 * struct guest_thread.
 *
 * This handler must never call set_alarm after interrupting the guest,
 * otherwise the guest could try to write to the timer msrs and cause a
 * race condition.  Hence the alarm is re-armed first and the injection
 * is handed off last. */
void timer_alarm_handler(struct alarm_waiter *waiter)
{
	struct guest_thread *gth = (struct guest_thread*)waiter->data;
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
	uint32_t *vapic = (uint32_t *)gpci->vapic_addr;
	/* Word 0x32: vector in the low byte, timer mode in bits 17-18. */
	uint8_t vector = vapic[0x32] & 0xff;
	uint32_t timer_mode = (vapic[0x32] >> 17) & 0x03;
	uint32_t initial_count = vapic[0x38];
	uint32_t divide_config_reg = vapic[0x3E];
	uint32_t multiplier;

	/* Don't blame me for this. Look at the intel manual
	 * Vol 3 10.5.4 APIC Timer.  Bits 3, 1 and 0 of the divide
	 * configuration form a 3-bit value; add one and wrap to 3 bits to
	 * get the shift amount. */
	multiplier = (divide_config_reg & 0x08) >> 1;
	multiplier |= divide_config_reg & 0x03;
	multiplier = (multiplier + 1) & 0x07;

	if (vector && initial_count && timer_mode == 0x01) {
		/* This is periodic, we reset the alarm */
		set_awaiter_rel(waiter, initial_count << multiplier);
		set_alarm(waiter);
	}

	/* We spin up a task to inject the timer because vmm_interrupt_guest
	 * may block and we can't do that from vcore context. */
	vmm_run_task(vm, inject_timer, gth);
}
/* This sets up the structs for each of the guest pcore's timers, but
 * doesn't actually start the alarms until the core writes all the
 * reasonable values to the x2apic msrs.
 *
 * For every guest core an alarm_waiter is allocated, cross-linked with
 * the core's guest_thread (gth->user_data <-> waiter->data) and
 * initialized with timer_alarm_handler.  The waiters are never freed in
 * this file.  Exits the process if an allocation fails. */
void init_timer_alarms(void)
{
	for (uint64_t i = 0; i < vm->nr_gpcs; i++) {
		struct guest_thread *gth = gpcid_to_gth(vm, i);
		struct alarm_waiter *timer_alarm = malloc(sizeof(*timer_alarm));

		/* Without this check a failed allocation would be
		 * dereferenced just below. */
		if (!timer_alarm) {
			fprintf(stderr,
				"Unable to allocate timer alarm for guest core %llu\n",
				(unsigned long long)i);
			exit(1);
		}
		/* TODO: consider a struct to bundle a bunch of things, not just
		 * timer_alarm. */
		gth->user_data = (void *)timer_alarm;
		timer_alarm->data = gth;
		init_awaiter(timer_alarm, timer_alarm_handler);
	}
}
/* Entry point of the VMM.
 *
 * Parses the command line, maps guest memory, loads the kernel ELF named
 * by the first non-option argument (plus an optional initrd), builds the
 * BIOS tables / e820 map / boot_params, registers the virtio console,
 * network and (optionally) block devices, assembles the guest kernel
 * command line, initializes the VM and starts guest core 0.  The calling
 * thread then sleeps forever.
 *
 * Options (long name / short letter): debug d, vmm_vmcall v, memsize m,
 * memstart M, cmdline_extra c, greedy g, initrd i, scp s, image_file f,
 * cmdline k, net n, num_cores N, smbiostable t, help h.
 * NOTE(review): "maxresume"/R is accepted by getopt but has no case
 * below, so it ends up in the help/default branch.
 *
 * Any setup failure prints a message and exits with status 1. */
int main(int argc, char **argv)
{
	int debug = 0;
	unsigned long long memsize = GiB;
	uintptr_t memstart = MinMemory;
	uintptr_t memend;
	struct boot_params *bp;
	char cmdline_default[512] = {0};
	char *cmdline_extra = "\0";
	char *cmdline;
	void *a = (void *)0xe0000;
	int vmmflags = 0;
	uint64_t entry = 0;
	int ret;
	struct vm_trapframe *vm_tf;
	char *cmdlinep;
	int cmdlinesz, len, cmdline_fd;
	char *disk_image_file = NULL;
	int c;
	struct stat stat_result;
	int num_read;
	int option_index;
	char *smbiostable = NULL;
	char *net_opts = NULL;
	uint64_t num_pcs = 1;
	bool is_greedy = FALSE;
	bool is_scp = FALSE;
	char *initrd = NULL;
	uint64_t initrd_start = 0, initrd_size = 0;
	uint64_t kernel_max_address;
	/* Remember the program name now: argv is advanced past the options
	 * further down, after which argv[0] is the kernel image (or NULL
	 * when none was given). */
	const char *progname = (argc > 0 && argv[0]) ? argv[0]
						     : "vmrunkernel";
	static struct option long_options[] = {
		{"debug",         no_argument,       0, 'd'},
		{"vmm_vmcall",    no_argument,       0, 'v'},
		{"maxresume",     required_argument, 0, 'R'},
		{"memsize",       required_argument, 0, 'm'},
		{"memstart",      required_argument, 0, 'M'},
		{"cmdline_extra", required_argument, 0, 'c'},
		{"greedy",        no_argument,       0, 'g'},
		{"initrd",        required_argument, 0, 'i'},
		{"scp",           no_argument,       0, 's'},
		{"image_file",    required_argument, 0, 'f'},
		{"cmdline",       required_argument, 0, 'k'},
		{"net",           required_argument, 0, 'n'},
		{"num_cores",     required_argument, 0, 'N'},
		{"smbiostable",   required_argument, 0, 't'},
		{"help",          no_argument,       0, 'h'},
		{0, 0, 0, 0}
	};

	fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
		PML1_PTE_REACH);

	if ((uintptr_t)__procinfo.program_end >= MinMemory) {
		fprintf(stderr,
			"Panic: vmrunkernel binary extends into guest memory\n");
		exit(1);
	}

	vm->low4k = malloc(PGSIZE);
	if (!vm->low4k) {
		fprintf(stderr, "Unable to allocate the low 4k page\n");
		exit(1);
	}
	memset(vm->low4k, 0xff, PGSIZE);
	vm->low4k[0x40e] = 0;
	vm->low4k[0x40f] = 0;

	// Why is this here? Because the static initializer is getting
	// set to 1. Yes, 1. This might be part of the weirdness
	// Barrett is reporting with linker sets. So let's leave it
	// here until we trust our toolchain.
	if (memsize != GiB)
		fprintf(stderr, "static initializers are broken\n");
	memsize = GiB;

	while ((c = getopt_long(argc, argv, "dvi:m:M:c:gsf:k:N:n:t:hR:",
				long_options, &option_index)) != -1) {
		switch (c) {
		case 'd':
			debug++;
			break;
		case 'v':
			vmmflags |= VMM_CTL_FL_KERN_PRINTC;
			break;
		case 'm':
			memsize = strtoull(optarg, 0, 0);
			break;
		case 'M':
			memstart = strtoull(optarg, 0, 0);
			break;
		case 'c':
			cmdline_extra = optarg;
			/* This break was missing, so passing extra command
			 * line text also switched on greedy mode. */
			break;
		case 'g':	/* greedy */
			parlib_never_yield = TRUE;
			if (is_scp) {
				fprintf(stderr,
					"Can't be both greedy and an SCP\n");
				exit(1);
			}
			is_greedy = TRUE;
			break;
		case 's':	/* scp */
			parlib_wants_to_be_mcp = FALSE;
			if (is_greedy) {
				fprintf(stderr,
					"Can't be both greedy and an SCP\n");
				exit(1);
			}
			is_scp = TRUE;
			break;
		case 'f':	/* file to pass to blk_init */
			disk_image_file = optarg;
			break;
		case 'i':
			initrd = optarg;
			break;
		case 'k':	/* specify file to get cmdline args from */
			cmdline_fd = open(optarg, O_RDONLY);
			if (cmdline_fd < 0) {
				fprintf(stderr, "failed to open file: %s\n",
					optarg);
				exit(1);
			}
			if (stat(optarg, &stat_result) == -1) {
				fprintf(stderr, "stat of %s failed\n", optarg);
				exit(1);
			}
			/* Leave room for the terminating NUL: the buffer is
			 * later used as a C string. */
			if (stat_result.st_size < 0 ||
			    (size_t)stat_result.st_size >=
			    sizeof(cmdline_default)) {
				fprintf(stderr,
					"command line options exceed %d bytes!\n",
					(int)sizeof(cmdline_default) - 1);
				exit(1);
			}
			len = stat_result.st_size;
			/* Start from a clean buffer in case -k is repeated. */
			memset(cmdline_default, 0, sizeof(cmdline_default));
			num_read = read(cmdline_fd, cmdline_default, len);
			if (num_read != len) {
				fprintf(stderr, "read failed len was : %d, num_read was: %d\n",
					len, num_read);
				exit(1);
			}
			close(cmdline_fd);
			break;
		case 't':
			smbiostable = optarg;
			break;
		case 'n':
			net_opts = optarg;
			break;
		case 'N':
			num_pcs = strtoull(optarg, 0, 0);
			break;
		case 'h':
		default:
			// Sadly, the getopt_long struct does
			// not have a pointer to help text.
			for (int i = 0;
			     i <
			     sizeof(long_options) / sizeof(long_options[0]) - 1;
			     i++) {
				struct option *l = &long_options[i];

				fprintf(stderr, "%s or %c%s\n", l->name, l->val,
					l->has_arg ? " <arg>" : "");
			}
			exit(0);
		}
	}

	if (strlen(cmdline_default) == 0) {
		fprintf(stderr,
			"WARNING: No command line parameter file specified.\n");
	}

	argc -= optind;
	argv += optind;
	if (argc < 1) {
		/* argv[0] is NULL at this point, so print the saved program
		 * name instead. */
		fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)]\n",
			progname);
		exit(1);
	}

	// Set vm->nr_gpcs before it's referenced in the struct setups below.
	vm->nr_gpcs = num_pcs;
	fprintf(stderr, "NUM PCS: %llu\n", (unsigned long long)num_pcs);

	/* These are only used to be passed to vmm_init, which makes copies
	 * internally */
	gpcis = (struct vmm_gpcore_init *)
		malloc(num_pcs * sizeof(struct vmm_gpcore_init));
	if (!gpcis) {
		fprintf(stderr, "Unable to allocate gpcore init structs\n");
		exit(1);
	}

	alloc_intr_pages();

	memend = memstart + memsize - 1;
	if (memend >= BRK_START) {
		fprintf(stderr,
			"memstart 0x%llx memsize 0x%llx -> 0x%llx is too large; overlaps BRK_START at %p\n",
			memstart, memsize, memstart + memsize, BRK_START);
		exit(1);
	}

	mmap_memory(vm, memstart, memsize);

	/* After the argv shift above, argv[0] is the kernel image. */
	entry = load_elf(argv[0], 0, &kernel_max_address, NULL);
	if (entry == 0) {
		fprintf(stderr, "Unable to load kernel %s\n", argv[0]);
		exit(1);
	}

	a = setup_biostables(vm, a, smbiostable);

	bp = a;
	a = init_e820map(vm, bp);

	if (initrd) {
		initrd_start = ROUNDUP(kernel_max_address, PGSIZE);
		fprintf(stderr,
			"kernel_max_address is %#p; Load initrd @ %#p\n",
			kernel_max_address, initrd_start);
		initrd_size = setup_initrd(initrd, (void *)initrd_start,
					   memend - initrd_start + 1);
		/* initrd_size is unsigned, so a plain "<= 0" only catches 0.
		 * Test the signed view so a negative error return (if
		 * setup_initrd reports failure that way -- TODO confirm) is
		 * caught as well. */
		if ((int64_t)initrd_size <= 0) {
			fprintf(stderr, "Unable to load initrd %s\n", initrd);
			exit(1);
		}

		bp->hdr.ramdisk_image = initrd_start;
		bp->hdr.ramdisk_size = initrd_size;
		bp->hdr.root_dev = 0x100;
		bp->hdr.type_of_loader = 0xff;
		fprintf(stderr, "Set bp initrd to %p / %p\n",
			initrd_start, initrd_size);
	}

	/* The MMIO address of the console device is really the address of an
	 * unbacked EPT page: accesses to this page will cause a page fault that
	 * traps to the host, which will examine the fault, see it was for the
	 * known MMIO address, and fulfill the MMIO read or write on the guest's
	 * behalf accordingly. We place the virtio space at 512 GB higher than
	 * the guest physical memory to avoid a full page table walk. */
	uintptr_t virtio_mmio_base_addr_hint;
	uintptr_t virtio_mmio_base_addr;

	virtio_mmio_base_addr_hint =
		ROUNDUP((bp->e820_map[bp->e820_entries - 1].addr +
			 bp->e820_map[bp->e820_entries - 1].size),
			PML4_PTE_REACH);

	/* mmap with prot_none so we don't accidentally mmap something else
	 * here.
	 * We give space for 512 devices right now.
	 * TODO(ganshun): Make it dynamic based on number of virtio devices. */
	virtio_mmio_base_addr =
		(uintptr_t) mmap((void *) virtio_mmio_base_addr_hint, 512 * PGSIZE,
				 PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	if (!virtio_mmio_base_addr || virtio_mmio_base_addr >= BRK_START) {
		/* Either we were unable to mmap at all or we mapped it too
		 * high. */
		panic("Unable to mmap protect space for virtio devices, got 0x%016x",
		      virtio_mmio_base_addr);
	}

	cons_mmio_dev.addr =
		virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_CONSOLE_DEV;
	cons_mmio_dev.vqdev = &cons_vqdev;
	vm->virtio_mmio_devices[VIRTIO_MMIO_CONSOLE_DEV] = &cons_mmio_dev;

	net_mmio_dev.addr =
		virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_NETWORK_DEV;
	net_mmio_dev.vqdev = &net_vqdev;
	vm->virtio_mmio_devices[VIRTIO_MMIO_NETWORK_DEV] = &net_mmio_dev;

	/* The block device only exists when a disk image was supplied. */
	if (disk_image_file != NULL) {
		blk_mmio_dev.addr =
			virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_BLOCK_DEV;
		blk_mmio_dev.vqdev = &blk_vqdev;
		vm->virtio_mmio_devices[VIRTIO_MMIO_BLOCK_DEV] = &blk_mmio_dev;
		blk_init_fn(&blk_vqdev, disk_image_file);
	}

	set_vnet_opts(net_opts);
	vnet_init(vm, &net_vqdev);
	set_vnet_port_fwds(net_opts);

	/* Set the kernel command line parameters */
	a += 4096;
	cmdline = a;
	a += 4096;

	bp->hdr.cmd_line_ptr = (uintptr_t) cmdline;

	len = snprintf(cmdline, 4096, "%s %s", cmdline_default, cmdline_extra);
	/* On truncation snprintf returns the untruncated length, which
	 * would make cmdlinesz negative and cmdlinep point past the page. */
	if (len < 0 || len >= 4096) {
		fprintf(stderr, "Too many arguments to the linux command line.");
		exit(1);
	}

	cmdlinesz = 4096 - len;
	cmdlinep = cmdline + len;

	for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
		if (vm->virtio_mmio_devices[i] == NULL)
			continue;

		/* Append all the virtio mmio base addresses. */

		/* Since the lower number irqs are no longer being used, the
		 * irqs can now be assigned starting from 0. */
		vm->virtio_mmio_devices[i]->irq = i;
		/* Casts make the arguments match %llx / %lld whatever the
		 * field widths are. */
		len = snprintf(cmdlinep, cmdlinesz,
			       "\n virtio_mmio.device=1K@0x%llx:%lld",
			       (unsigned long long)
			       vm->virtio_mmio_devices[i]->addr,
			       (long long)vm->virtio_mmio_devices[i]->irq);
		if (len >= cmdlinesz) {
			fprintf(stderr, "Too many arguments to the linux command line.");
			exit(1);
		}
		cmdlinesz -= len;
		cmdlinep += len;
	}

	/* Set maxcpus to the number of cores we're giving the guest. */
	len = snprintf(cmdlinep, cmdlinesz,
		       "\n maxcpus=%lld\n possible_cpus=%lld",
		       (long long)vm->nr_gpcs, (long long)vm->nr_gpcs);
	if (len >= cmdlinesz) {
		fprintf(stderr,
			"Too many arguments to the linux command line.");
		exit(1);
	}
	cmdlinesz -= len;
	cmdlinep += len;

	ret = vmm_init(vm, gpcis, vmmflags);
	assert(!ret);
	free(gpcis);

	init_timer_alarms();

	setup_paging(vm);

	/* Initial register state for guest core 0: page table root, kernel
	 * entry point, stack pointer, and boot_params pointer in rsi. */
	vm_tf = gpcid_to_vmtf(vm, 0);
	vm_tf->tf_cr3 = (uint64_t) vm->root;
	vm_tf->tf_rip = entry;
	vm_tf->tf_rsp = 0xe0000;
	vm_tf->tf_rsi = (uint64_t) bp;
	vm_tf->tf_rflags = FL_RSVD_1;
	vm->up_gpcs = 1;
	fprintf(stderr, "Start guest: cr3 %p rip %p\n", vm_tf->tf_cr3, entry);
	start_guest_thread(gpcid_to_gth(vm, 0));

	uthread_sleep_forever();
	return 0;
}