/* blob: 3150087c8a8e80c1e8e6cf818524f01aaca5bd11 [file] [log] [blame] [edit] */
#include <stdio.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <parlib/arch/arch.h>
#include <parlib/ros_debug.h>
#include <unistd.h>
#include <gelf.h>
#include <errno.h>
#include <libelf.h>
#include <dirent.h>
#include <stdlib.h>
#include <string.h>
#include <ros/syscall.h>
#include <sys/mman.h>
#include <vmm/vmm.h>
#include <vmm/acpi/acpi.h>
#include <vmm/acpi/vmm_simple_dsdt.h>
#include <ros/arch/mmu.h>
#include <ros/arch/membar.h>
#include <ros/vmm.h>
#include <parlib/uthread.h>
#include <vmm/linux_bootparam.h>
#include <getopt.h>
#include <vmm/virtio.h>
#include <vmm/virtio_blk.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/virtio_console.h>
#include <vmm/virtio_net.h>
#include <vmm/virtio_lguest_console.h>
#include <vmm/sched.h>
#include <vmm/net.h>
#include <sys/eventfd.h>
#include <sys/uio.h>
#include <parlib/opts.h>
/* The one virtual machine this program runs; `vm` is the handle the rest of
 * the file uses to reach it. */
struct virtual_machine local_vm;
struct virtual_machine *vm = &local_vm;
/* Init data for the single guest core; main() fills in its APIC page
 * addresses before handing it to the VM. */
struct vmm_gpcore_init gpci;
/* By 1999, you could just scan the hardware
* and work it out. But by 2005, that was no longer possible. How sad.
* So we have to fake ACPI to make it all work.
* This will be copied to memory at 0xe0000, so the kernel can find it.
*/
/* assume they're all 256 bytes long just to make it easy.
* Just have pointers that point to aligned things.
*/
/* Template for the ACPI root pointer.  main() copies it into guest memory and
 * then fills in the XSDT address and both checksums on the copy. */
struct acpi_table_rsdp rsdp = {
	.signature	= ACPI_SIG_RSDP,
	.oem_id		= "AKAROS",
	.revision	= 2,
	.length		= 36,
};
struct acpi_table_xsdt xsdt = {
.header = {
.signature = ACPI_SIG_DSDT,
.revision = 2,
.oem_id = "AKAROS",
.oem_table_id = "ALPHABET",
.oem_revision = 0,
.asl_compiler_id = "RON ",
.asl_compiler_revision = 0,
},
};
/* Template for the FADT.  main() copies it into guest memory and sets the
 * length, the Xdsdt pointer and the checksum on the copy. */
struct acpi_table_fadt fadt = {
	.header = {
		.signature		= ACPI_SIG_FADT,
		.revision		= 2,
		.oem_id			= "AKAROS",
		.oem_table_id		= "ALPHABET",
		.oem_revision		= 0,
		.asl_compiler_id	= "RON ",
		.asl_compiler_revision	= 0,
	},
};
/* This has to be dropped into memory, then the other crap just follows it.
*/
/* Template for the MADT header.  main() copies it into guest memory, appends
 * the APIC sub-tables directly behind it, then sets length and checksum. */
struct acpi_table_madt madt = {
	.header = {
		.signature		= ACPI_SIG_MADT,
		.revision		= 2,
		.oem_id			= "AKAROS",
		.oem_table_id		= "ALPHABET",
		.oem_revision		= 0,
		.asl_compiler_id	= "RON ",
		.asl_compiler_revision	= 0,
	},
	/* Same address main() maps and programs as the APIC page. */
	.address	= 0xfee00000ULL,
	.flags		= 0,
};
/* MADT sub-table: local APIC entry for processor 0 (lapic_flags = 1). */
struct acpi_madt_local_apic Apic0 = {
	.header = {
		.type	= ACPI_MADT_TYPE_LOCAL_APIC,
		.length	= sizeof(struct acpi_madt_local_apic)
	},
	.processor_id	= 0,
	.id		= 0,
	.lapic_flags	= 1
};
/* MADT sub-table: the I/O APIC at 0xfec00000, global IRQ base 0. */
struct acpi_madt_io_apic Apic1 = {
	.header = {
		.type	= ACPI_MADT_TYPE_IO_APIC,
		.length	= sizeof(struct acpi_madt_io_apic)
	},
	.id			= 0,
	.address		= 0xfec00000,
	.global_irq_base	= 0
};
/* MADT sub-table: local x2APIC entry with APIC id 0 and uid 0. */
struct acpi_madt_local_x2apic X2Apic0 = {
	.header = {
		.type	= ACPI_MADT_TYPE_LOCAL_X2APIC,
		.length	= sizeof(struct acpi_madt_local_x2apic)
	},
	.local_apic_id	= 0,
	.uid		= 0
};
/* Interrupt source overrides appended to the MADT by main().  The list is
 * currently empty, so nothing is appended. */
struct acpi_madt_interrupt_override isor[] = {
	/* From the ACPI Specification Version 6.1:
	 * For example, if your machine has the ISA Programmable Interrupt
	 * Timer (PIT) connected to ISA IRQ 0, but in APIC mode, it is
	 * connected to I/O APIC interrupt input 2, then you would need an
	 * Interrupt Source Override where the source entry is '0' and the
	 * Global System Interrupt is '2.'
	 */
};
/* this test will run the "kernel" in the negative address space. We hope. */
/* Host mapping of the guest's low 1 MiB; set by main(). */
void *low1m;
volatile int shared = 0;
volatile int quit = 0;

/* total hack. If the vm runs away we want to get control again. */
unsigned int maxresume = (unsigned int) -1;

/* Size units and the lowest address at which guest RAM may start. */
#define MiB 0x100000ull
#define GiB (1ull << 30)
#define MinMemory (16*MiB)

/* Guest RAM: its mapping, its size (-m), its start (-M), and the initial
 * guest stack pointer (-S). */
void *kernel;
unsigned long long memsize = GiB;
uintptr_t memstart = MinMemory;
uintptr_t stack;

/* One page-table page: 512 eight-byte entries. */
typedef struct {
	uint64_t pte[512];
} ptp;

/* Cursors into the guest's boot page tables; main() points them at the
 * top-level, third-level and 2 MiB-mapping tables respectively. */
ptp *p512;
ptp *p1;
ptp *p2m;

void **my_retvals;
int nr_threads = 4;
int debug = 0;
int resumeprompt = 0;
/* unlike Linux, this shared struct is for both host and guest. */
// struct virtqueue *constoguest =
// vring_new_virtqueue(0, 512, 8192, 0, inpages, NULL, NULL, "test");
/* Declared here; no definition appears in the visible part of this file. */
void vapic_status_dump(FILE *f, void *vapic);
/* Refuse to build with GCC older than 4.1.
 * NOTE(review): the message names 4.4.0 while the check rejects < 4.1 --
 * confirm which version is actually required. */
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
#error "Get a gcc newer than 4.4.0"
#else
/* Inline-asm operand: the long at `x` as a read-write ("+m") memory operand. */
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#endif
/* Instruction prefix string prepended to the bit operation below. */
#define LOCK_PREFIX "lock "
/* Shorthand operand for functions whose pointer parameter is named `addr`. */
#define ADDR BITOP_ADDR(addr)
/* Forward declaration; the definition follows later in this file. */
static inline int test_and_set_bit(int nr, volatile unsigned long *addr);
/* Not referenced elsewhere in the visible part of this file. */
pthread_t timerthread_struct;
/* Periodic timer for guest core 0.
 *
 * Every 100000 microseconds this reads two 32-bit words from the virtual APIC
 * page (word indices 0x32 and 0x38) and, when both the low byte of the first
 * and the whole of the second are non-zero, injects that low byte as an
 * interrupt vector into guest core 0.
 * NOTE(review): the two words are presumably the timer LVT entry and the
 * timer initial count as stored by the APIC emulation -- confirm the layout.
 *
 * @arg: unused.
 *
 * Never returns.  The diagnostic print that used to follow the loop could
 * never execute and has been removed.
 */
void timer_thread(void *arg)
{
	uint8_t vector;
	uint32_t initial_count;

	while (1) {
		vector = ((uint32_t *)gpci.vapic_addr)[0x32] & 0xff;
		initial_count = ((uint32_t *)gpci.vapic_addr)[0x38];
		if (vector && initial_count)
			vmm_interrupt_guest(vm, 0, vector);
		uthread_usleep(100000);
	}
}
// FIXME.
// Not read or written anywhere in the visible part of this file.
volatile int consdata = 0;
/* Interrupt-delivery hook installed in every virtio MMIO device below.
 *
 * @vec:  interrupt vector to inject.
 * @dest: a guest core number below vm->nr_gpcs selects that one core;
 *        0xffffffff selects every guest core; any other value is reported
 *        through panic().
 */
static void virtio_poke_guest(uint8_t vec, uint32_t dest)
{
	int core;

	if (dest >= vm->nr_gpcs) {
		if (dest != 0xffffffff)
			panic("INVALID DESTINATION: 0x%02x\n", dest);
		/* Broadcast to all guest cores. */
		for (core = 0; core < vm->nr_gpcs; core++)
			vmm_interrupt_guest(vm, core, vec);
		return;
	}
	/* Unicast to the named guest core. */
	vmm_interrupt_guest(vm, dest, vec);
}
/* Virtio console: MMIO transport, its two config copies, and the virtqueue
 * device with one receive and one transmit queue.  main() assigns the MMIO
 * address and links the transport back to cons_vqdev. */
static struct virtio_mmio_dev cons_mmio_dev = {
	.poke_guest = virtio_poke_guest,
};

static struct virtio_console_config cons_cfg;
static struct virtio_console_config cons_cfg_d;

static struct virtio_vq_dev cons_vqdev = {
	.name		= "console",
	.dev_id		= VIRTIO_ID_CONSOLE,
	.dev_feat	=
		(1ULL << VIRTIO_F_VERSION_1) | (1 << VIRTIO_RING_F_INDIRECT_DESC),
	.num_vqs	= 2,
	.cfg		= &cons_cfg,
	.cfg_d		= &cons_cfg_d,
	.cfg_sz		= sizeof(struct virtio_console_config),
	.transport_dev	= &cons_mmio_dev,
	.vqs = {
		{
			.name		= "cons_receiveq",
			.qnum_max	= 64,
			.srv_fn		= cons_receiveq_fn,
			.vqdev		= &cons_vqdev
		},
		{
			.name		= "cons_transmitq",
			.qnum_max	= 64,
			.srv_fn		= cons_transmitq_fn,
			.vqdev		= &cons_vqdev
		},
	}
};
/* Virtio network device: MMIO transport, its two config copies (one
 * virtqueue pair each), and the virtqueue device with one receive and one
 * transmit queue.  main() assigns the MMIO address and links the transport
 * back to net_vqdev. */
static struct virtio_mmio_dev net_mmio_dev = {
	.poke_guest = virtio_poke_guest,
};

static struct virtio_net_config net_cfg = {
	.max_virtqueue_pairs = 1
};

static struct virtio_net_config net_cfg_d = {
	.max_virtqueue_pairs = 1
};

static struct virtio_vq_dev net_vqdev = {
	.name		= "network",
	.dev_id		= VIRTIO_ID_NET,
	.dev_feat	= (1ULL << VIRTIO_F_VERSION_1 | 1 << VIRTIO_NET_F_MAC),
	.num_vqs	= 2,
	.cfg		= &net_cfg,
	.cfg_d		= &net_cfg_d,
	.cfg_sz		= sizeof(struct virtio_net_config),
	.transport_dev	= &net_mmio_dev,
	.vqs = {
		{
			.name		= "net_receiveq",
			.qnum_max	= 64,
			.srv_fn		= net_receiveq_fn,
			.vqdev		= &net_vqdev
		},
		{
			.name		= "net_transmitq",
			.qnum_max	= 64,
			.srv_fn		= net_transmitq_fn,
			.vqdev		= &net_vqdev
		},
	}
};
/* Virtio block device: MMIO transport, its two (empty-initialized) config
 * copies, and the virtqueue device with a single request queue.  main() only
 * wires this device up when a disk image file was given. */
static struct virtio_mmio_dev blk_mmio_dev = {
	.poke_guest = virtio_poke_guest,
};

static struct virtio_blk_config blk_cfg = {
};

static struct virtio_blk_config blk_cfg_d = {
};

static struct virtio_vq_dev blk_vqdev = {
	.name		= "block",
	.dev_id		= VIRTIO_ID_BLOCK,
	.dev_feat	= (1ULL << VIRTIO_F_VERSION_1),
	.num_vqs	= 1,
	.cfg		= &blk_cfg,
	.cfg_d		= &blk_cfg_d,
	.cfg_sz		= sizeof(struct virtio_blk_config),
	.transport_dev	= &blk_mmio_dev,
	.vqs = {
		{
			.name		= "blk_request",
			.qnum_max	= 64,
			.srv_fn		= blk_request,
			.vqdev		= &blk_vqdev
		},
	}
};
/* Carrier for assembler directives only; it is not called anywhere in the
 * visible code.  The directives switch to a writable, allocatable section
 * named .lowmem, define the label `low` there, set the location counter to
 * 0x1000, align to 0x100000, and then switch back to the previous section. */
void lowmem() {
__asm__ __volatile__ (".section .lowmem, \"aw\"\n\tlow: \n\t.=0x1000\n\t.align 0x100000\n\t.previous\n");
}
/* Sum `length` bytes starting at `buffer`, modulo 256.
 *
 * A table whose checksum byte is correct sums to 0.  Progress is traced on
 * stderr: the buffer address and length up front, the running sum just
 * before the final byte is added, and the final sum at the end.
 *
 * Returns the 8-bit sum.
 */
static uint8_t acpi_tb_checksum(uint8_t *buffer, uint32_t length)
{
	uint8_t total = 0;
	uint32_t i;

	fprintf(stderr, "tbchecksum %p for %d", buffer, length);
	for (i = 0; i < length; i++) {
		/* Exactly one byte left: show the sum without it. */
		if (length - i < 2)
			fprintf(stderr, "%02x\n", total);
		total = (uint8_t)(total + buffer[i]);
	}
	fprintf(stderr, " is %02x\n", total);
	return total;
}
/* Compute and store an ACPI checksum byte.
 *
 * @target: where the checksum byte is stored; it may lie inside the region
 *          being summed (and does, for the tables built in main()).
 * @data:   start of the region to checksum.
 * @len:    number of bytes in the region.
 *
 * The stored byte is the two's complement of the region's 8-bit sum, so that
 * summing the region afterwards yields 0.  Traces its work on stderr.
 */
static void gencsum(uint8_t *target, void *data, int len)
{
	uint8_t csum;

	// blast target to zero so it does not get counted
	// (it might be in the struct we checksum) And, yes, it is, goodness.
	fprintf(stderr, "gencsum %p target %p source %d bytes\n", target, data, len);
	*target = 0;
	csum = acpi_tb_checksum((uint8_t *)data, len);
	*target = ~csum + 1;
	fprintf(stderr, "Computed is %02x\n", *target);
}
/* Set bit `nr` in the word(s) at `addr` using a lock-prefixed x86 `bts`.
 * The following `sbb %0,%0` turns the carry flag left by `bts` into the
 * result, so the return value is 0 when the carry was clear and -1 when it
 * was set (i.e. non-zero when the bit was already set).
 * The "memory" clobber keeps the compiler from caching memory across it.
 * Not called anywhere in the visible part of this file. */
static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
{
int oldbit;
asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
"sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
return oldbit;
}
/* load_kernel loads an ELF file as a kernel. */
uintptr_t
load_kernel(char *filename)
{
Elf64_Ehdr *ehdr;
Elf *elf;
size_t phnum = 0;
Elf64_Phdr *hdrs;
int fd;
elf_version(EV_CURRENT);
fd = open(filename, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Can't open %s: %r\n", filename);
return 0;
}
elf = elf_begin(fd, ELF_C_READ, NULL);
if (elf == NULL) {
fprintf(stderr, "%s: cannot read %s ELF file.\n", __func__, filename);
close(fd);
return 0;
}
ehdr = elf64_getehdr(elf);
if (ehdr == NULL) {
fprintf(stderr, "%s: cannot get exec header of %s.\n",
__func__, filename);
goto fail;
}
fprintf(stderr, "%s ELF entry point is %p\n", filename,
(void *)ehdr->e_entry);
if (elf_getphdrnum(elf, &phnum) < 0) {
fprintf(stderr, "%s: cannot get program header num of %s.\n",
__func__, filename);
goto fail;
}
fprintf(stderr, "%s has %p program headers\n", filename, phnum);
hdrs = elf64_getphdr(elf);
if (hdrs == NULL) {
fprintf(stderr, "%s: cannot get program headers of %s.\n",
__func__, filename);
goto fail;
}
for (int i = 0; i < phnum; i++) {
size_t tot;
Elf64_Phdr *h = &hdrs[i];
uintptr_t pa;
fprintf(stderr,
"%d: type 0x%lx flags 0x%lx offset 0x%lx vaddr 0x%lx paddr 0x%lx size 0x%lx memsz 0x%lx align 0x%lx\n",
i,
h->p_type, /* Segment type */
h->p_flags, /* Segment flags */
h->p_offset, /* Segment file offset */
h->p_vaddr, /* Segment virtual address */
h->p_paddr, /* Segment physical address */
h->p_filesz, /* Segment size in file */
h->p_memsz, /* Segment size in memory */
h->p_align /* Segment alignment */);
if (h->p_type != PT_LOAD)
continue;
if ((h->p_flags & (PF_R | PF_W | PF_X)) == 0)
continue;
pa = h->p_paddr;
fprintf(stderr,
"Read header %d @offset %p to %p (elf PA is %p) %d bytes:",
i, h->p_offset, pa, h->p_paddr, h->p_filesz);
tot = 0;
while (tot < h->p_filesz) {
int amt = pread(fd, (void *)(pa + tot), h->p_filesz - tot,
h->p_offset + tot);
if (amt < 1)
break;
tot += amt;
}
fprintf(stderr, "read a total of %d bytes\n", tot);
if (tot < h->p_filesz) {
fprintf(stderr, "%s: got %d bytes, wanted %d bytes\n",
filename, tot, h->p_filesz);
goto fail;
}
}
close(fd);
elf_end(elf);
return ehdr->e_entry;
fail:
close(fd);
elf_end(elf);
return 0;
}
/* TODO: put this in a library somewhere */
/* Read the whole of `file` into memory starting at `where`.
 *
 * The data is pulled in with reads of up to 4096 bytes until end of file.
 * No size limit is applied: the caller must make sure the destination is
 * large enough for the file.
 *
 * Returns the number of bytes read, or -1 if the file cannot be opened or a
 * read fails.
 */
int cat(char *file, void *where)
{
	char *dst = where;
	int total = 0;
	int fd = open(file, O_RDONLY);

	if (fd < 0)
		return -1;
	for (;;) {
		int got = read(fd, dst, 4096);

		if (got == 0)
			break;
		if (got < 0) {
			close(fd);
			return -1;
		}
		total += got;
		dst += got;
	}
	close(fd);
	return total;
}
/* Copy the SMBIOS table file `smbiostable` to `esegment`.
 *
 * Returns the number of bytes copied.  If the file cannot be read, prints the
 * file name with the error to stderr and exits the program with status 1.
 */
int smbios(char *smbiostable, void *esegment)
{
	int copied = cat(smbiostable, esegment);

	if (copied >= 0)
		return copied;
	fprintf(stderr, "%s: %r\n", smbiostable);
	exit(1);
}
/* Parse func: given a line of text, it sets any vnet options.
 *
 * Recognized lines:
 *   snoop, map_diagnostics, real_address   -- set the matching flag to TRUE
 *   nat_timeout = <n>                      -- numeric, parsed with atoi
 * Anything else is ignored.  The line is modified in place.
 */
static void __parse_vnet_opts(char *_line)
{
	char *value, *p;

	/* Check all bools first */
	if (strcmp(_line, "snoop") == 0) {
		vnet_snoop = TRUE;
		return;
	}
	if (strcmp(_line, "map_diagnostics") == 0) {
		vnet_map_diagnostics = TRUE;
		return;
	}
	if (strcmp(_line, "real_address") == 0) {
		vnet_real_ip_addrs = TRUE;
		return;
	}

	/* Numeric fields, must have an = */
	value = strchr(_line, '=');
	if (value == NULL)
		return;
	/* Split into "key" and "value" at the '='. */
	*value = 0;
	value++;
	/* Blank out every space in the key, which cuts it at its first space.
	 * atoi trims any spaces after =. */
	for (p = _line; *p; p++) {
		if (*p == ' ')
			*p = 0;
	}
	if (strcmp(_line, "nat_timeout") == 0)
		vnet_nat_timeout = atoi(value);
}
/* Run every line of the options file `net_opts` through __parse_vnet_opts.
 * A non-zero result from parse_opts_file is reported with perror(). */
static void set_vnet_opts(char *net_opts)
{
	int failed = parse_opts_file(net_opts, __parse_vnet_opts);

	if (failed)
		perror("parse opts file");
}
/* Parse func: given a line of text, it builds any vnet port forwardings.
 *
 * Expected form: port:<proto>:<host port>:<guest port>
 * Lines whose first ':'-separated field is not "port" are ignored.  A "port"
 * line with a missing field is reported on stderr and skipped.  The line is
 * tokenized in place.
 */
static void __parse_vnet_port_fwds(char *_line)
{
	char *state = 0;
	char *keyword, *proto, *host_port, *guest_port;

	keyword = strtok_r(_line, ":", &state);
	if (!keyword || strcmp(keyword, "port") != 0)
		return;

	proto = strtok_r(NULL, ":", &state);
	if (!proto) {
		fprintf(stderr, "%s, port with no proto!", __func__);
		return;
	}

	host_port = strtok_r(NULL, ":", &state);
	if (!host_port) {
		fprintf(stderr, "%s, port with no host port!", __func__);
		return;
	}

	guest_port = strtok_r(NULL, ":", &state);
	if (!guest_port) {
		fprintf(stderr, "%s, port with no guest port!", __func__);
		return;
	}

	vnet_port_forward(proto, host_port, guest_port);
}
/* Run every line of the options file `net_opts` through
 * __parse_vnet_port_fwds.  A non-zero result from parse_opts_file is reported
 * with perror(). */
static void set_vnet_port_fwds(char *net_opts)
{
	int failed = parse_opts_file(net_opts, __parse_vnet_port_fwds);

	if (failed)
		perror("parse opts file");
}
/* Boot a guest kernel image inside a single-core VM.
 *
 * Usage: vmrunkernel [options] vmimage
 *
 * Steps, in order:
 *   1. map and seed the APIC page, parse the command line options;
 *   2. map guest RAM at memstart and load the ELF image into it;
 *   3. map the low 1 MiB and build the fake ACPI tables at 0xe0000, followed
 *      by the posted-interrupt page, the virtual APIC page, the boot_params
 *      page and the kernel command line page;
 *   4. describe memory to the guest through the e820 map;
 *   5. register the virtio MMIO devices and advertise them on the guest
 *      command line;
 *   6. initialize the VM, build identity-mapped boot page tables right after
 *      guest RAM, start the timer task and the guest core.
 *
 * Exits with status 1 on any setup failure.  Does not return once the guest
 * has been started.
 */
int main(int argc, char **argv)
{
	struct boot_params *bp;
	char cmdline_default[512] = {0};
	char *cmdline_extra = "\0";
	char *cmdline;
	/* Remember the program name: argv is advanced past the options below. */
	char *progname = argv[0];
	void *a = (void *)0xe0000;
	struct acpi_table_rsdp *r;
	struct acpi_table_fadt *f;
	struct acpi_table_madt *m;
	struct acpi_table_xsdt *x;
	int vmmflags = 0;	// Disabled probably forever. VMM_VMCALL_PRINTF;
	uint64_t entry = 0;
	int ret;
	uint8_t csum;
	void *a_page;
	struct vm_trapframe *vm_tf;
	uint64_t tsc_freq_khz;
	char *cmdlinep;
	int cmdlinesz, len, cmdline_fd;
	char *disk_image_file = NULL;
	int c;
	struct stat stat_result;
	int num_read;
	int option_index;
	char *smbiostable = NULL;
	int nptp, npml4, npml3, npml2;
	char *net_opts = NULL;
	static struct option long_options[] = {
		{"debug",         no_argument,       0, 'd'},
		{"vmm_vmcall",    no_argument,       0, 'v'},
		{"maxresume",     required_argument, 0, 'R'},
		{"memsize",       required_argument, 0, 'm'},
		{"memstart",      required_argument, 0, 'M'},
		{"stack",         required_argument, 0, 'S'},
		{"cmdline_extra", required_argument, 0, 'c'},
		{"greedy",        no_argument,       0, 'g'},
		{"scp",           no_argument,       0, 's'},
		{"image_file",    required_argument, 0, 'f'},
		{"cmdline",       required_argument, 0, 'k'},
		{"net",           required_argument, 0, 'n'},
		{"smbiostable",   required_argument, 0, 't'},
		{"help",          no_argument,       0, 'h'},
		{0, 0, 0, 0}
	};

	/* These are integer constants; convert them so they match %p. */
	fprintf(stderr, "%p %p %p %p\n", (void *)(uintptr_t)PGSIZE,
	        (void *)(uintptr_t)PGSHIFT, (void *)(uintptr_t)PML1_SHIFT,
	        (void *)(uintptr_t)PML1_PTE_REACH);

	if ((uintptr_t)__procinfo.program_end >= MinMemory) {
		fprintf(stderr,
		        "Panic: vmrunkernel binary extends into guest memory\n");
		exit(1);
	}

	vm->low4k = malloc(PGSIZE);
	if (vm->low4k == NULL) {
		perror("Could not allocate low 4k page");
		exit(1);
	}
	memset(vm->low4k, 0xff, PGSIZE);
	vm->low4k[0x40e] = 0;
	vm->low4k[0x40f] = 0;

	//Place mmap(Gan)
	a_page = mmap((void *)0xfee00000, PGSIZE, PROT_READ | PROT_WRITE,
	              MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	fprintf(stderr, "a_page mmap pointer %p\n", a_page);

	if (a_page != (void *)0xfee00000) {
		perror("Could not mmap APIC");
		exit(1);
	}
	if (((uint64_t)a_page & 0xfff) != 0) {
		perror("APIC page mapping is not page aligned");
		exit(1);
	}

	((uint32_t *)a_page)[0x30/4] = 0x01060015;
	//((uint32_t *)a_page)[0x30/4] = 0xDEADBEEF;

	while ((c = getopt_long(argc, argv, "dvm:M:S:c:gsf:k:n:t:hR:",
	                        long_options, &option_index)) != -1) {
		switch (c) {
		case 'd':
			debug++;
			break;
		case 'v':
			vmmflags |= VMM_VMCALL_PRINTF;
			break;
		case 'm':
			memsize = strtoull(optarg, 0, 0);
			break;
		case 'M':
			memstart = strtoull(optarg, 0, 0);
			break;
		case 'S':
			stack = strtoull(optarg, 0, 0);
			break;
		case 'R':
			maxresume = strtoull(optarg, 0, 0);
			break;
		case 'c':
			cmdline_extra = optarg;
			/* Without this break, -c also switched on greedy mode. */
			break;
		case 'g':	/* greedy */
			parlib_never_yield = TRUE;
			break;
		case 's':	/* scp */
			parlib_wants_to_be_mcp = FALSE;
			break;
		case 'f':	/* file to pass to blk_init */
			disk_image_file = optarg;
			break;
		case 'k':	/* specify file to get cmdline args from */
			cmdline_fd = open(optarg, O_RDONLY);
			if (cmdline_fd < 0) {
				fprintf(stderr, "failed to open file: %s\n", optarg);
				exit(1);
			}
			if (stat(optarg, &stat_result) == -1) {
				fprintf(stderr, "stat of %s failed\n", optarg);
				exit(1);
			}
			/* The buffer is used as a C string later, so the file
			 * must leave room for the terminating NUL. */
			if (stat_result.st_size < 0 ||
			    (size_t)stat_result.st_size >= sizeof(cmdline_default)) {
				fprintf(stderr,
				        "command line options exceed %zu bytes!\n",
				        sizeof(cmdline_default) - 1);
				exit(1);
			}
			len = stat_result.st_size;
			/* Start clean in case -k was given more than once. */
			memset(cmdline_default, 0, sizeof(cmdline_default));
			num_read = read(cmdline_fd, cmdline_default, len);
			if (num_read != len) {
				fprintf(stderr, "read failed len was : %d, num_read was: %d\n",
				        len, num_read);
				exit(1);
			}
			close(cmdline_fd);
			break;
		case 't':
			smbiostable = optarg;
			break;
		case 'n':
			net_opts = optarg;
			break;
		case 'h':
		default:
			// Sadly, the getopt_long struct does
			// not have a pointer to help text.
			for (size_t i = 0;
			     i < sizeof(long_options)/sizeof(long_options[0]) - 1;
			     i++) {
				struct option *l = &long_options[i];

				fprintf(stderr, "%s or %c%s\n", l->name, l->val,
				        l->has_arg ? " <arg>" : "");
			}
			exit(0);
		}
	}

	if (strlen(cmdline_default) == 0) {
		fprintf(stderr, "WARNING: No command line parameter file specified.\n");
	}
	argc -= optind;
	argv += optind;
	if (argc < 1) {
		/* argv[0] is no longer the program name here (it is NULL when
		 * no image was given), so use the saved name. */
		fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)]\n",
		        progname);
		exit(1);
	}

	if ((uintptr_t)(memstart + memsize) >= (uintptr_t)BRK_START) {
		fprintf(stderr,
		        "memstart 0x%lx memsize 0x%lx -> 0x%lx is too large; overlaps BRK_START at %p\n",
		        memstart, memsize, memstart + memsize, BRK_START);
		exit(1);
	}

	/* Guest RAM, at the same address in this process as in the guest. */
	kernel = mmap((void *)memstart, memsize,
	              PROT_READ | PROT_WRITE | PROT_EXEC,
	              MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	if (kernel != (void *)memstart) {
		fprintf(stderr, "Could not mmap 0x%lx bytes at 0x%lx\n",
		        memsize, memstart);
		exit(1);
	}

	entry = load_kernel(argv[0]);
	if (entry == 0) {
		fprintf(stderr, "Unable to load kernel %s\n", argv[0]);
		exit(1);
	}

	// The low 1m so we can fill in bullshit like ACPI. */
	// And, sorry, due to the STUPID format of the RSDP for now we need the low 1M.
	low1m = mmap((int*)4096, MiB-4096, PROT_READ | PROT_WRITE,
	             MAP_ANONYMOUS, -1, 0);
	if (low1m != (void *)4096) {
		perror("Unable to mmap low 1m");
		exit(1);
	}

	/* ACPI tables are laid out back to back starting at 0xe0000; `a` is
	 * the running write cursor. */
	r = a;
	fprintf(stderr, "install rsdp to %p\n", r);
	*r = rsdp;
	a += sizeof(*r);
	r->xsdt_physical_address = (uint64_t)a;
	gencsum(&r->checksum, r, ACPI_RSDP_CHECKSUM_LENGTH);
	if ((csum = acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_CHECKSUM_LENGTH)) != 0) {
		fprintf(stderr, "RSDP has bad checksum; summed to %x\n", csum);
		exit(1);
	}

	/* Check extended checksum if table version >= 2 */
	gencsum(&r->extended_checksum, r, ACPI_RSDP_XCHECKSUM_LENGTH);
	if ((rsdp.revision >= 2) &&
	    (acpi_tb_checksum((uint8_t *) r, ACPI_RSDP_XCHECKSUM_LENGTH) != 0)) {
		fprintf(stderr, "RSDP has bad checksum v2\n");
		exit(1);
	}

	/* just leave a bunch of space for the xsdt. */
	/* we need to zero the area since it has pointers. */
	x = a;
	a += sizeof(*x) + 8*sizeof(void *);
	memset(x, 0, a - (void *)x);
	fprintf(stderr, "install xsdt to %p\n", x);
	*x = xsdt;
	x->table_offset_entry[0] = 0;
	x->table_offset_entry[1] = 0;
	x->header.length = a - (void *)x;

	f = a;
	fprintf(stderr, "install fadt to %p\n", f);
	*f = fadt;
	x->table_offset_entry[0] = (uint64_t)f; // fadt MUST be first in xsdt!
	a += sizeof(*f);
	f->header.length = a - (void *)f;

	/* Only the 36-byte DSDT header is copied in, right behind the FADT. */
	f->Xdsdt = (uint64_t) a;
	fprintf(stderr, "install dsdt to %p\n", a);
	memcpy(a, &DSDT_DSDTTBL_Header, 36);
	a += 36;

	gencsum(&f->header.checksum, f, f->header.length);
	if (acpi_tb_checksum((uint8_t *)f, f->header.length) != 0) {
		fprintf(stderr, "fadt has bad checksum v2\n");
		exit(1);
	}

	m = a;
	*m = madt;
	/* XSDT entries 1 and 2 are left zero. */
	x->table_offset_entry[3] = (uint64_t) m;
	a += sizeof(*m);
	fprintf(stderr, "install madt to %p\n", m);
	memmove(a, &Apic0, sizeof(Apic0));
	a += sizeof(Apic0);
	memmove(a, &Apic1, sizeof(Apic1));
	a += sizeof(Apic1);
	memmove(a, &X2Apic0, sizeof(X2Apic0));
	a += sizeof(X2Apic0);
	memmove(a, &isor, sizeof(isor));
	a += sizeof(isor);
	m->header.length = a - (void *)m;

	gencsum(&m->header.checksum, m, m->header.length);
	if (acpi_tb_checksum((uint8_t *) m, m->header.length) != 0) {
		fprintf(stderr, "madt has bad checksum v2\n");
		exit(1);
	}

	/* The XSDT is checksummed last because its entries were filled in
	 * while the other tables were being placed. */
	gencsum(&x->header.checksum, x, x->header.length);
	if ((csum = acpi_tb_checksum((uint8_t *) x, x->header.length)) != 0) {
		fprintf(stderr, "XSDT has bad checksum; summed to %x\n", csum);
		exit(1);
	}

	fprintf(stderr, "allchecksums ok\n");

	hexdump(stdout, r, a-(void *)r);

	/* Round the cursor up to a page; the next pages hold the posted IRQ
	 * descriptor and the virtual APIC page. */
	a = (void *)(((unsigned long)a + 0xfff) & ~0xfff);
	gpci.posted_irq_desc = a;
	memset(a, 0, 4096);
	a += 4096;
	gpci.vapic_addr = a;
	memset(a, 0, 4096);
	((uint32_t *)a)[0x30/4] = 0x01060014;
	// set up apic values? do we need to?
	// qemu does this.
	//((uint8_t *)a)[4] = 1;
	a += 4096;
	gpci.apic_addr = (void*)0xfee00000;

	/* Allocate memory for, and zero the bootparams
	 * page before writing to it, or Linux thinks
	 * we're talking crazy.
	 */
	a += 4096;
	bp = a;
	memset(bp, 0, 4096);

	/* Put the e820 memory region information in the boot_params */
	int e820i = 0;

	/* Give it just a tiny bit of memory -- 60k -- at low memory. */
	bp->e820_map[e820i].addr = 0;
	bp->e820_map[e820i].size = 4 * 1024;
	bp->e820_map[e820i++].type = E820_RESERVED;

	bp->e820_map[e820i].addr = 4 * 1024;
	bp->e820_map[e820i].size = 64 * 1024 - 4 * 1024;
	bp->e820_map[e820i++].type = E820_RAM;

	bp->e820_map[e820i].addr = 64 * 1024;
	bp->e820_map[e820i].size = memstart - 64 * 1024;
	bp->e820_map[e820i++].type = E820_RESERVED;

	bp->e820_map[e820i].addr = memstart;
	bp->e820_map[e820i].size = memsize;
	bp->e820_map[e820i++].type = E820_RAM;

	bp->e820_map[e820i].addr = 0xf0000000;
	bp->e820_map[e820i].size = 0x10000000;
	bp->e820_map[e820i++].type = E820_RESERVED;

	/* Report exactly as many entries as were filled in above. */
	bp->e820_entries = e820i;

	/* The MMIO address of the console device is really the address of an
	 * unbacked EPT page: accesses to this page will cause a page fault that
	 * traps to the host, which will examine the fault, see it was for the
	 * known MMIO address, and fulfill the MMIO read or write on the guest's
	 * behalf accordingly. We place the virtio space at 512 GB higher than the
	 * guest physical memory to avoid a full page table walk. */
	uint64_t virtio_mmio_base_addr = ROUNDUP((bp->e820_map[e820i - 1].addr +
	                                          bp->e820_map[e820i - 1].size),
	                                         512 * GiB);

	cons_mmio_dev.addr =
		virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_CONSOLE_DEV;
	cons_mmio_dev.vqdev = &cons_vqdev;
	vm->virtio_mmio_devices[VIRTIO_MMIO_CONSOLE_DEV] = &cons_mmio_dev;

	net_mmio_dev.addr =
		virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_NETWORK_DEV;
	net_mmio_dev.vqdev = &net_vqdev;
	vm->virtio_mmio_devices[VIRTIO_MMIO_NETWORK_DEV] = &net_mmio_dev;

	/* The block device only exists when a disk image was given (-f). */
	if (disk_image_file != NULL) {
		blk_mmio_dev.addr =
			virtio_mmio_base_addr + PGSIZE * VIRTIO_MMIO_BLOCK_DEV;
		blk_mmio_dev.vqdev = &blk_vqdev;
		vm->virtio_mmio_devices[VIRTIO_MMIO_BLOCK_DEV] = &blk_mmio_dev;
		blk_init_fn(&blk_vqdev, disk_image_file);
	}

	set_vnet_opts(net_opts);
	vnet_init(vm, &net_vqdev);
	set_vnet_port_fwds(net_opts);

	/* Set the kernel command line parameters */
	a += 4096;
	cmdline = a;
	a += 4096;

	if (smbiostable) {
		if (a > (void *)0xf0000) {
			fprintf(stderr, "No room for SMBIOS table: current table pointer is %p\n",
			        a);
			exit(1);
		}
		a = (void *)0xf0000;
		fprintf(stderr, "Using SMBIOS table %s\n", smbiostable);
		a += smbios(smbiostable, a);
	}

	bp->hdr.cmd_line_ptr = (uintptr_t) cmdline;

	tsc_freq_khz = get_tsc_freq()/1000;
	len = snprintf(cmdline, 4096, "%s tscfreq=%lld %s", cmdline_default,
	               (long long)tsc_freq_khz, cmdline_extra);
	/* snprintf reports the length it wanted to write; anything that does
	 * not fit the page would make the remaining-size arithmetic go
	 * negative below. */
	if (len < 0 || len >= 4096) {
		fprintf(stderr, "Too many arguments to the linux command line.");
		exit(1);
	}

	cmdlinesz = 4096 - len;
	cmdlinep = cmdline + len;

	for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
		if (vm->virtio_mmio_devices[i] == NULL)
			continue;

		/* Append all the virtio mmio base addresses. */

		/* Since the lower number irqs are no longer being used, the irqs
		 * can now be assigned starting from 0.
		 */
		vm->virtio_mmio_devices[i]->irq = i;
		len = snprintf(cmdlinep, cmdlinesz,
		               " virtio_mmio.device=1K@0x%llx:%lld",
		               (unsigned long long)vm->virtio_mmio_devices[i]->addr,
		               (long long)vm->virtio_mmio_devices[i]->irq);
		if (len < 0 || len >= cmdlinesz) {
			fprintf(stderr, "Too many arguments to the linux command line.");
			exit(1);
		}
		cmdlinesz -= len;
		cmdlinep += len;
	}

	vm->nr_gpcs = 1;
	vm->gpcis = &gpci;
	ret = vmm_init(vm, vmmflags);
	assert(!ret);

	/* How many page table pages do we need?  We conservatively
	 * assume that we are in low memory, and hence assume a
	 * 0-based range.  Note that in many cases, kernels will
	 * immediately set up their own map. But for "dune" like
	 * applications, it's necessary. Note also that in most cases,
	 * the total number of pages will be < 16 or so. */
	npml4 = DIV_ROUND_UP(memstart + memsize, PML4_REACH);
	nptp = npml4;

	npml3 = DIV_ROUND_UP(memstart + memsize, PML3_REACH);
	nptp += npml3;

	/* and 1 for each 2 MiB of memory */
	npml2 = DIV_ROUND_UP(memstart + memsize, PML2_REACH);
	nptp += npml2;

	fprintf(stderr, "Memstart + memsize is %llx; %d pml4 %d pml3 %d pml2\n",
	        memstart + memsize, npml4, npml3, npml2);

	/* Place these page tables right after VM memory. We
	 * used to use posix_memalign but that puts them
	 * outside EPT-accessible space on some CPUs. */
	p512 = mmap((void *)memstart + memsize, nptp * 4096, PROT_READ | PROT_WRITE,
	            MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	if (p512 == MAP_FAILED) {
		perror("page table page alloc");
		exit(1);
	}
	p1 = &p512[npml4];
	p2m = &p1[npml3];

	/* Set up a 1:1 ("identity") page mapping from guest virtual
	 * to guest physical using the (host virtual)
	 * `kerneladdress`. This mapping may be used for only a short
	 * time, until the guest sets up its own page tables. Be aware
	 * that the values stored in the table are physical addresses.
	 * This is subtle and mistakes are easily disguised due to the
	 * identity mapping, so take care when manipulating these
	 * mappings. */

	p2m->pte[PML2(0)] = (uint64_t)0 | PTE_KERN_RW | PTE_PS;

	fprintf(stderr, "Map %p for %zu bytes\n", memstart, memsize);
	/* NOTE(review): each inner loop runs to the end of guest memory rather
	 * than to the end of the region covered by the enclosing entry.  If
	 * PML3()/PML2() yield the index within one table, a range that crosses
	 * a table boundary would wrap around and overwrite entries in the same
	 * table -- confirm this is intended before relying on it for larger or
	 * unaligned memory layouts. */
	for (uintptr_t p4 = memstart; p4 < memstart + memsize;
	     p4 += PML4_PTE_REACH, p1++) {
		p512->pte[PML4(p4)] = (uint64_t)p1 | PTE_KERN_RW;
		if (debug)
			fprintf(stderr, "l4@%p: %p set index 0x%x to 0x%llx\n",
			        &p512->pte[PML4(p4)],
			        p4, PML4(p4), p512->pte[PML4(p4)]);
		for (uintptr_t p3 = p4; p3 < memstart + memsize;
		     p3 += PML3_PTE_REACH, p2m++) {
			p1->pte[PML3(p3)] = (uint64_t)p2m | PTE_KERN_RW;
			if (debug)
				fprintf(stderr, "\tl3@%p: %p set index 0x%x to 0x%llx\n",
				        &p1->pte[PML3(p3)],
				        p3, PML3(p3), p1->pte[PML3(p3)]);
			for (uintptr_t p2 = p3; p2 < memstart + memsize;
			     p2 += PML2_PTE_REACH) {
				p2m->pte[PML2(p2)] = (uint64_t)p2 | PTE_KERN_RW | PTE_PS;
				if (debug)
					fprintf(stderr, "\t\tl2@%p: %p set index 0x%x to 0x%llx\n",
					        &p2m->pte[PML2(p2)],
					        p2, PML2(p2), p2m->pte[PML2(p2)]);
			}
		}
	}

	vmm_run_task(vm, timer_thread, 0);

	/* Initial register state of guest core 0: page-table root, entry
	 * point, stack pointer, and the boot_params pointer in rsi. */
	vm_tf = gth_to_vmtf(vm->gths[0]);
	vm_tf->tf_cr3 = (uint64_t) p512;
	vm_tf->tf_rip = entry;
	vm_tf->tf_rsp = stack;
	vm_tf->tf_rsi = (uint64_t) bp;
	start_guest_thread(vm->gths[0]);

	uthread_sleep_forever();
	return 0;
}