/* user/vmm/decode.c - upstream - Git at Google */

 /*
  * Copyright 2015 Google Inc.
  *
  * This file is part of Akaros.
  *
  * Akaros is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, version 2 of the License.
  *
  * Akaros is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * Lesser GNU General Public License for more details.
  *
  * See COPYING.LESSER for details on the GNU Lesser General Public License.
  * See COPYING for details on the GNU General Public License.
  */

 #include <stdio.h>
 #include <sys/types.h>
 #include <pthread.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <parlib/arch/arch.h>
 #include <parlib/ros_debug.h>
 #include <unistd.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/uio.h>
 #include <stdint.h>
 #include <err.h>
 #include <sys/mman.h>
 #include <vmm/vmm.h>
 #include <vmm/virtio.h>
 #include <vmm/virtio_mmio.h>
 #include <vmm/virtio_ids.h>
 #include <vmm/virtio_config.h>
 #include <ros/arch/trapframe.h>

 /* Set to non-zero to make DPRINTF() print decoder trace output to stdout. */
 int debug_decode = 0;

 /*
  * printf-style trace helper; prints "decode: " followed by the formatted
  * message, but only when debug_decode is non-zero.
  *
  * Wrapped in do { } while (0) so that the macro behaves as a single statement
  * and can be used safely as the body of an unbraced if/else.
  */
 #define DPRINTF(fmt, ...) \
 	do { \
 		if (debug_decode) \
 			printf("decode: " fmt , ## __VA_ARGS__); \
 	} while (0)

 /*
  * Printable names of the general-purpose registers, indexed by the register
  * number that decode() reports in *destreg (0-7 from the ModRM reg field,
  * 8-15 when the 0x44 prefix is present).
  */
 static char *modrmreg[] = {
 	"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
 	"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 };

 // Since we at most have to decode less than half of each instruction, I'm trying to be dumb here.
 // Fortunately, for me, that's not hard.
 // I'm trying to avoid the whole Big Fun of full instruction decode, and in most of these
 // cases we only have to know register, address, operation size, and instruction length.
 // The ugly messiness of the SIB and all that are not yet needed. Maybe they
 // never will be.

 // Work out the operand size and direction of the instruction at insn.
 //
 // insn:  pointer to the first byte of the instruction.
 // store: out; set to 1 when bit 1 of the opcode is clear (the forms this
 //        decoder treats as writes to memory: 0x88, 0x89, 0x81) and to 0
 //        otherwise. It is always 0 for the two-byte 0x0f 0xb7 form and when
 //        the opcode is rejected.
 // Returns the operand size in bytes (1, 2 or 4), or -1 if the opcode is not
 // one this decoder knows about (a message is printed to stderr).
 //
 // Recognized encodings: an optional 0x66 prefix, an optional 0x44 prefix, then
 // one of 0x3a, 0x88, 0x8a, 0x89, 0x8b, 0x81, or (without the 0x44 prefix) the
 // two-byte opcode 0x0f 0xb7. 0x44 followed by 0x0f is rejected on purpose:
 // neither insize() nor decode() in this file handle that combination.
 //
 // TODO: 8-byte operands are not recognized at all.
 static int target(void *insn, int *store)
 {
 	uint8_t *byte = insn;
 	int s;

 	*store = 0;

 	if (*byte == 0x66) {
 		s = target(byte + 1, store);
 		// Do not turn a decode failure into a bogus size.
 		if (s < 0)
 			return s;
 		// The prefix narrows 4-byte operations to 2 bytes; sizes that
 		// are already 1 or 2 bytes are left alone.
 		return s == 4 ? 2 : s;
 	}

 	int rex = (*byte == 0x44);

 	if (rex)
 		byte++;

 	switch (*byte) {
 	case 0x3a:
 	case 0x8a:
 	case 0x88:
 		s = 1;
 		break;
 	case 0x89:
 	case 0x8b:
 		// TODO: To really know, for sure, that this is 32 bit, we'd likely have
 		//       to check the segment descriptor for the guest's current code
 		//       segment in its GDT. The D flag (bit 22) determines whether the
 		//       instruction is using 32 or 16-bit operand size. I'm just going
 		//       to assume the flag is set (meaning 32 bit operands) for now, in
 		//       order to make virtio work. But really we should check if we
 		//       want to know for sure. Note that this hack (changing the below
 		//       line) only applies to mov instructions.
 		//
 		//       And I think there's also a prefix you can use to switch the
 		//       instruction to 16-bit addressing
 		//       (address-size override prefix?)
 		s = 4;
 		break;
 	case 0x81:
 		s = 4;
 		break;
 	case 0x0f: {
 		// Assemble the two opcode bytes by hand (low byte first) rather
 		// than reading through a possibly unaligned uint16_t pointer.
 		uint16_t word = (uint16_t)(byte[0] | (byte[1] << 8));

 		if (rex || word != 0xb70f) {
 			fprintf(stderr, "can't get size of %02x/%04x @ %p\n", *byte, word,
 			        (void *)byte);
 			return -1;
 		}
 		// *store stays 0 for this form.
 		return 2;
 	}
 	default:
 		fprintf(stderr, "can't get size of %02x @ %p\n", *byte, (void *)byte);
 		return -1;
 	}

 	// Only the single-byte opcodes get here; bit 1 is their direction bit.
 	*store = !(*byte & 2);
 	return s;
 }

 // Return the printable name of register number reg, looked up in modrmreg.
 // Numbers beyond the end of the table yield "???" instead of reading past
 // the array.
 char *regname(uint8_t reg)
 {
 	if (reg >= sizeof(modrmreg) / sizeof(modrmreg[0]))
 		return "???";
 	return modrmreg[reg];
 }

 // Compute the length, in bytes, of the instruction at rip for the opcodes
 // that target() accepts.
 //
 // The length is: prefixes (an optional 0x66, then an optional 0x44) + opcode
 // bytes (2 for the 0x0f forms, otherwise 1) + the ModRM byte + an optional SIB
 // byte + displacement + immediate (opcode 0x81 only: 4 bytes, or 2 with the
 // 0x66 prefix).
 //
 // ModRM rules applied here (32/64-bit addressing forms):
 //   mod 3            register operand, nothing follows
 //   rm 4, mod != 3   a SIB byte follows
 //   mod 1            8-bit displacement
 //   mod 2            32-bit displacement
 //   mod 0, rm 5      32-bit displacement
 //   mod 0, SIB base 5  32-bit displacement
 //
 // The address-size prefix (0x67) and prefix values other than 0x66/0x44 are
 // not recognized. For an unknown opcode a "BUG!" message is printed to stderr
 // and 3 is returned.
 static int insize(void *rip)
 {
 	const uint8_t *kva = rip;
 	int prefixes = 0;
 	int opsize16 = 0;
 	int opcode_len = 1;
 	int imm_len = 0;

 	if (kva[0] == 0x66) {
 		opsize16 = 1;
 		prefixes++;
 		kva++;
 	}
 	if (kva[0] == 0x44) {
 		prefixes++;
 		kva++;
 	}

 	switch (kva[0]) {
 	case 0x0f:
 		opcode_len = 2;
 		break;
 	case 0x81:
 		imm_len = opsize16 ? 2 : 4;
 		break;
 	case 0x3a:
 	case 0x8a:
 	case 0x88:
 	case 0x89:
 	case 0x8b:
 		break;
 	default:
 		fprintf(stderr, "BUG! %s got 0x%x\n", __func__, kva[0]);
 		return 3;
 	}

 	/* the dreaded mod/rm byte. */
 	const uint8_t *modrm = kva + opcode_len;
 	int mod = modrm[0] >> 6;
 	int rm = modrm[0] & 7;
 	int operand_len = 1;

 	if (mod != 3) {
 		if (rm == 4) {
 			operand_len++;
 			if (mod == 0 && (modrm[1] & 7) == 5)
 				operand_len += 4;
 		} else if (mod == 0 && rm == 5) {
 			operand_len += 4;
 		}
 		if (mod == 1)
 			operand_len += 1;
 		else if (mod == 2)
 			operand_len += 4;
 	}
 	return prefixes + opcode_len + operand_len + imm_len;
 }

 // Decode just enough of the guest instruction at the trapframe's RIP to let
 // the caller emulate a single memory access.
 //
 // This is a very limited function. Per the original author's notes it exists
 // to manage virtio-mmio and low memory pointer loads, and is entered with an
 // EPT fault from a region that is deliberately left unbacked by any memory.
 // Because the trapframe already carries the faulting guest physical address,
 // the decode only needs to find the register, how many bytes to move, and how
 // big the instruction is.
 //
 // vm_thread: guest thread whose VM trapframe is examined.
 // gpa:       out; receives tf_guest_pa from the trapframe.
 // destreg:   out; register number 0-15 taken from the ModRM reg field (plus 8
 //            when the 0x44 prefix is present). Usable for printing.
 // regp:      out; points at that register's slot in the trapframe, from
 //            which to load or into which to store a result.
 // store:     out; direction flag as computed by target().
 // size:      out; operand size in bytes as computed by target().
 // advance:   out; instruction length as computed by insize().
 //
 // Returns 0 on success. Returns -1 if target() cannot size the instruction;
 // in that case destreg, regp and advance are not written.
 //
 // NOTE: for opcode 0x81 the x86 encoding uses the ModRM reg field as an
 // opcode extension rather than a register number, so destreg/regp do not
 // name an operand register for that opcode.
 int decode(struct guest_thread *vm_thread, uint64_t *gpa, uint8_t *destreg,
            uint64_t **regp, int *store, int *size, int *advance)
 {
 	struct vm_trapframe *vm_tf = &(vm_thread->uthread.u_ctx.tf.vm_tf);

 	DPRINTF("v is %p\n", vm_tf);

 	// This is the guest physical address of the access. Knowing it means
 	// we never have to work out the memory operand from registers; we only
 	// need the register holding or receiving the value.
 	*gpa = vm_tf->tf_guest_pa;
 	DPRINTF("gpa is %p\n", (void *)(uintptr_t)*gpa);

 	// To find out what to do, we have to look at RIP. Technically, we
 	// should read RIP, walk the page tables to find the PA, and read that.
 	// We take a shortcut for now: use the low 30 bits of RIP directly as
 	// an address we can read. Hokey. Works.
 	uint8_t *kva = (void *)(vm_tf->tf_rip & 0x3fffffff);

 	DPRINTF("kva is %p\n", kva);

 	// fail fast. If we can't get the size we're done.
 	*size = target(kva, store);
 	if (*size < 0)
 		return -1;

 	*advance = insize(kva);

 	// Step over the prefixes target() accepts (0x66, then 0x44) and the
 	// 0x0f escape byte, so that op[0] is the last opcode byte and op[1]
 	// is the ModRM byte.
 	const uint8_t *op = kva;

 	if (*op == 0x66)
 		op++;

 	int rex_r = (*op == 0x44);

 	if (rex_r)
 		op++;
 	if (*op == 0x0f)
 		op++;

 	// Same value the trace has always shown (opcode in the low byte, ModRM
 	// in the high byte), built bytewise to avoid an unaligned 16-bit read.
 	uint16_t ins = (uint16_t)(op[0] | (op[1] << 8));

 	DPRINTF("ins is %04x\n", ins);

 	// ModRM bits 5:3 select the register; the 0x44 prefix selects r8-r15.
 	*destreg = (uint8_t)(((ins >> 11) & 7) + 8 * rex_r);

 	// Our primitive approach wins big here. We don't have to decode the
 	// register or the offset used in the address computation; that was
 	// done by the CPU and is the gpa. All we need to know is which
 	// destination or source register it is.
 	uint64_t *const regs[16] = {
 		&vm_tf->tf_rax, &vm_tf->tf_rcx, &vm_tf->tf_rdx, &vm_tf->tf_rbx,
 		&vm_tf->tf_rsp, &vm_tf->tf_rbp, &vm_tf->tf_rsi, &vm_tf->tf_rdi,
 		&vm_tf->tf_r8,  &vm_tf->tf_r9,  &vm_tf->tf_r10, &vm_tf->tf_r11,
 		&vm_tf->tf_r12, &vm_tf->tf_r13, &vm_tf->tf_r14, &vm_tf->tf_r15,
 	};

 	*regp = regs[*destreg];
 	return 0;
 }
	/*
	* Copyright 2015 Google Inc.
	*
	* This file is part of Akaros.
	*
	* Akaros is free software: you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation, version 2 of the License.
	*
	* Akaros is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* Lesser GNU General Public License for more details.
	*
	* See COPYING.LESSER for details on the GNU Lesser General Public License.
	* See COPYING for details on the GNU General Public License.
	*/

	#include <stdio.h>
	#include <sys/types.h>
	#include <pthread.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <parlib/arch/arch.h>
	#include <parlib/ros_debug.h>
	#include <unistd.h>
	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>
	#include <stdint.h>
	#include <err.h>
	#include <sys/mman.h>
	#include <vmm/vmm.h>
	#include <vmm/virtio.h>
	#include <vmm/virtio_mmio.h>
	#include <vmm/virtio_ids.h>
	#include <vmm/virtio_config.h>
	#include <ros/arch/trapframe.h>

	/* Set to non-zero to make DPRINTF() print decoder trace output to stdout. */
	int debug_decode = 0;

	/*
	 * printf-style trace helper; prints "decode: " followed by the formatted
	 * message, but only when debug_decode is non-zero.
	 *
	 * Wrapped in do { } while (0) so that the macro behaves as a single
	 * statement and can be used safely as the body of an unbraced if/else.
	 */
	#define DPRINTF(fmt, ...) \
		do { \
			if (debug_decode) \
				printf("decode: " fmt , ## __VA_ARGS__); \
		} while (0)

	/*
	 * Printable names of the general-purpose registers, indexed by the
	 * register number that decode() reports in *destreg (0-7 from the ModRM
	 * reg field, 8-15 when the 0x44 prefix is present).
	 */
	static char *modrmreg[] = {
		"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
		"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	};

	// Since we at most have to decode less than half of each instruction, I'm trying to be dumb here.
	// Fortunately, for me, that's not hard.
	// I'm trying to avoid the whole Big Fun of full instruction decode, and in most of these
	// cases we only have to know register, address, operation size, and instruction length.
	// The ugly messiness of the SIB and all that are not yet needed. Maybe they
	// never will be.

	// Work out the operand size and direction of the instruction at insn.
	//
	// insn:  pointer to the first byte of the instruction.
	// store: out; set to 1 when bit 1 of the opcode is clear (the forms this
	//        decoder treats as writes to memory: 0x88, 0x89, 0x81) and to 0
	//        otherwise. It is always 0 for the two-byte 0x0f 0xb7 form and
	//        when the opcode is rejected.
	// Returns the operand size in bytes (1, 2 or 4), or -1 if the opcode is
	// not one this decoder knows about (a message is printed to stderr).
	//
	// Recognized encodings: an optional 0x66 prefix, an optional 0x44 prefix,
	// then one of 0x3a, 0x88, 0x8a, 0x89, 0x8b, 0x81, or (without the 0x44
	// prefix) the two-byte opcode 0x0f 0xb7. 0x44 followed by 0x0f is rejected
	// on purpose: neither insize() nor decode() in this file handle that
	// combination.
	//
	// TODO: 8-byte operands are not recognized at all.
	static int target(void *insn, int *store)
	{
		uint8_t *byte = insn;
		int s;

		*store = 0;

		if (*byte == 0x66) {
			s = target(byte + 1, store);
			// Do not turn a decode failure into a bogus size.
			if (s < 0)
				return s;
			// The prefix narrows 4-byte operations to 2 bytes; sizes
			// that are already 1 or 2 bytes are left alone.
			return s == 4 ? 2 : s;
		}

		int rex = (*byte == 0x44);

		if (rex)
			byte++;

		switch (*byte) {
		case 0x3a:
		case 0x8a:
		case 0x88:
			s = 1;
			break;
		case 0x89:
		case 0x8b:
			// TODO: To really know, for sure, that this is 32 bit, we'd
			//       likely have to check the segment descriptor for the
			//       guest's current code segment in its GDT. The D flag
			//       (bit 22) determines whether the instruction is using
			//       32 or 16-bit operand size. I'm just going to assume
			//       the flag is set (meaning 32 bit operands) for now, in
			//       order to make virtio work. But really we should check
			//       if we want to know for sure. Note that this hack
			//       (changing the below line) only applies to mov
			//       instructions.
			//
			//       And I think there's also a prefix you can use to
			//       switch the instruction to 16-bit addressing
			//       (address-size override prefix?)
			s = 4;
			break;
		case 0x81:
			s = 4;
			break;
		case 0x0f: {
			// Assemble the two opcode bytes by hand (low byte first)
			// rather than reading through a possibly unaligned
			// uint16_t pointer.
			uint16_t word = (uint16_t)(byte[0] | (byte[1] << 8));

			if (rex || word != 0xb70f) {
				fprintf(stderr, "can't get size of %02x/%04x @ %p\n", *byte,
				        word, (void *)byte);
				return -1;
			}
			// *store stays 0 for this form.
			return 2;
		}
		default:
			fprintf(stderr, "can't get size of %02x @ %p\n", *byte, (void *)byte);
			return -1;
		}

		// Only the single-byte opcodes get here; bit 1 is their direction bit.
		*store = !(*byte & 2);
		return s;
	}

	// Return the printable name of register number reg, looked up in
	// modrmreg. Numbers beyond the end of the table yield "???" instead of
	// reading past the array.
	char *regname(uint8_t reg)
	{
		if (reg >= sizeof(modrmreg) / sizeof(modrmreg[0]))
			return "???";
		return modrmreg[reg];
	}

	// Compute the length, in bytes, of the instruction at rip for the opcodes
	// that target() accepts.
	//
	// The length is: prefixes (an optional 0x66, then an optional 0x44) +
	// opcode bytes (2 for the 0x0f forms, otherwise 1) + the ModRM byte + an
	// optional SIB byte + displacement + immediate (opcode 0x81 only: 4 bytes,
	// or 2 with the 0x66 prefix).
	//
	// ModRM rules applied here (32/64-bit addressing forms):
	//   mod 3            register operand, nothing follows
	//   rm 4, mod != 3   a SIB byte follows
	//   mod 1            8-bit displacement
	//   mod 2            32-bit displacement
	//   mod 0, rm 5      32-bit displacement
	//   mod 0, SIB base 5  32-bit displacement
	//
	// The address-size prefix (0x67) and prefix values other than 0x66/0x44
	// are not recognized. For an unknown opcode a "BUG!" message is printed to
	// stderr and 3 is returned.
	static int insize(void *rip)
	{
		const uint8_t *kva = rip;
		int prefixes = 0;
		int opsize16 = 0;
		int opcode_len = 1;
		int imm_len = 0;

		if (kva[0] == 0x66) {
			opsize16 = 1;
			prefixes++;
			kva++;
		}
		if (kva[0] == 0x44) {
			prefixes++;
			kva++;
		}

		switch (kva[0]) {
		case 0x0f:
			opcode_len = 2;
			break;
		case 0x81:
			imm_len = opsize16 ? 2 : 4;
			break;
		case 0x3a:
		case 0x8a:
		case 0x88:
		case 0x89:
		case 0x8b:
			break;
		default:
			fprintf(stderr, "BUG! %s got 0x%x\n", __func__, kva[0]);
			return 3;
		}

		/* the dreaded mod/rm byte. */
		const uint8_t *modrm = kva + opcode_len;
		int mod = modrm[0] >> 6;
		int rm = modrm[0] & 7;
		int operand_len = 1;

		if (mod != 3) {
			if (rm == 4) {
				operand_len++;
				if (mod == 0 && (modrm[1] & 7) == 5)
					operand_len += 4;
			} else if (mod == 0 && rm == 5) {
				operand_len += 4;
			}
			if (mod == 1)
				operand_len += 1;
			else if (mod == 2)
				operand_len += 4;
		}
		return prefixes + opcode_len + operand_len + imm_len;
	}

	// This is a very limited function. It's only here to manage virtio-mmio and low memory
	// pointer loads. I am hoping it won't grow with time. The intent is that we enter it with
	// and EPT fault from a region that is deliberately left unbacked by any memory. We return
	// enough info to let you emulate the operation if you want. Because we have the failing physical
	// address (gpa) the decode is far simpler because we only need to find the register, how many bytes
	// to move, and how big the instruction is. I thought about bringing in emulate.c from kvm from xen,
	// but it has way more stuff than we need.
	// gpa is a pointer to the gpa.
	// int is the reg index which we can use for printing info.
	// regp points to the register in hw_trapframe from which
	// to load or store a result.
	int decode(struct guest_thread vm_thread, uint64_t gpa, uint8_t *destreg,
	uint64_t *regp, int store, int size, int advance)
	{
	struct vm_trapframe *vm_tf = &(vm_thread->uthread.u_ctx.tf.vm_tf);

	DPRINTF("v is %p\n", vm_tf);

	// Duh, which way did he go George? Which way did he go?
	// First hit on Google gets you there!
	// This is the guest physical address of the access.
	// This is nice, because if we ever go with more complete
	// instruction decode, knowing this gpa reduces our work:
	// we don't have to find the source address in registers,
	// only the register holding or receiving the value.
	*gpa = vm_tf->tf_guest_pa;
	DPRINTF("gpa is %p\n", *gpa);

	// To find out what to do, we have to look at
	// RIP. Technically, we should read RIP, walk the page tables
	// to find the PA, and read that. But we're in the kernel, so
	// we take a shortcut for now: read the low 30 bits and use
	// that as the kernel PA, or our VA, and see what's
	// there. Hokey. Works.
	uint8_t kva = (void )(vm_tf->tf_rip & 0x3fffffff);
	DPRINTF("kva is %p\n", kva);

	// fail fast. If we can't get the size we're done.
	*size = target(kva, store);
	if (*size < 0)
	return -1;

	*advance = insize(kva);

	uint16_t ins = (uint16_t )(kva + (kva[0] == 0x44) + (kva[0] == 0x0f));
	DPRINTF("ins is %04x\n", ins);

	*destreg = (ins>>11) & 7;
	destreg += 8(kva[0] == 0x44);
	// Our primitive approach wins big here.
	// We don't have to decode the register or the offset used
	// in the computation; that was done by the CPU and is the gpa.
	// All we need to know is which destination or source register it is.
	switch (*destreg) {
	case 0:
	*regp = &vm_tf->tf_rax;
	break;
	case 1:
	*regp = &vm_tf->tf_rcx;
	break;
	case 2:
	*regp = &vm_tf->tf_rdx;
	break;
	case 3:
	*regp = &vm_tf->tf_rbx;
	break;
	case 4:
	*regp = &vm_tf->tf_rsp; // uh, right.
	break;
	case 5:
	*regp = &vm_tf->tf_rbp;
	break;
	case 6:
	*regp = &vm_tf->tf_rsi;
	break;
	case 7:
	*regp = &vm_tf->tf_rdi;
	break;
	case 8:
	*regp = &vm_tf->tf_r8;
	break;
	case 9:
	*regp = &vm_tf->tf_r9;
	break;
	case 10:
	*regp = &vm_tf->tf_r10;
	break;
	case 11:
	*regp = &vm_tf->tf_r11;
	break;
	case 12:
	*regp = &vm_tf->tf_r12;
	break;
	case 13:
	*regp = &vm_tf->tf_r13;
	break;
	case 14:
	*regp = &vm_tf->tf_r14;
	break;
	case 15:
	*regp = &vm_tf->tf_r15;
	break;
	}
	return 0;
	}