/* Copyright (c) 2019, 2020 Google, Inc.
*
 * Driver for the Intel IOMMU (VT-d)
*
* Aditya Basu <mitthu@google.com>
* Barret Rhoden <brho@cs.berkeley.edu>
*
 * Lock ordering: acquire (1) before (2):
 * (1) proc->dev_qlock => (2) iommu->iommu_lock
 * (1) proc->dev_qlock => (2) pdev->qlock
*
* TODO
* ====
* - In iommu_map_pci_devices() assign the correct iommu for scoped DRHD. Right
* now the default iommu is assigned to all devices.
* - IOMMU_DID_DEFAULT = 1; this means pid = 1 cannot have a device passthru
* because we use the pid as "did" or domain ID.
*
* lifecycle of CTE entries:
* - at boot, every CTE (per pdev on an iommu) is set to non-translating. In
* essence, an identity map.
* - pci devices are initially assigned to the kernel.
* - when devices are unassigned, their cte mapping is destroyed.
* - when they are reassigned, their mapping is set to either an identity map
* (kernel) or a process's page table.
*
* - On the topic of disabling the IOMMU, we used to have an option to just
* unset it completely. Disable TE, clear the root pointer. Though the code
* we had was hokey and broken. Even then, if we have a device behind an
* IOMMU and disable the IOMMU, that would just fuck everything up. Maybe if
* we had identity mapped pages in the IPT, so that when translation turned
* off, the device would still work. Seems like a mess.
*
* - We ought to do a domain-selective, context-cache invalidation whenever we
* reuse DIDs. aka, whenever there is a new IPT for a pid, which is every 65k
* processes. Or maybe every 16k, depending on how many pids we have.
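 *
 * Userspace drives attach/detach through the #iommu device (the exact write
 * formats are documented in iommuread() below), e.g.:
 *	$ echo 00:1f.2 13 >\#iommu/attach
 *	$ echo 00:1f.2 13 >\#iommu/detach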
*/
#include <stdio.h>
#include <error.h>
#include <common.h>
#include <net/ip.h>
#include <atomic.h>
#include <acpi.h>
#include <arch/intel-iommu.h>
#include <env.h>
#include <arch/pci.h>
#include <linux_compat.h>
#define IOMMU "iommu: "
#define BUFFERSZ 8192
struct dev iommudevtab;
static struct iommu_list_tq iommu_list = TAILQ_HEAD_INITIALIZER(iommu_list);
static bool iommu_is_supported;
/* QID Path */
enum {
Qdir = 0,
Qmappings = 1,
Qadddev = 2,
Qremovedev = 3,
Qinfo = 4,
};
static struct dirtab iommudir[] = {
{".", {Qdir, 0, QTDIR}, 0, 0555},
{"mappings", {Qmappings, 0, QTFILE}, 0, 0755},
{"attach", {Qadddev, 0, QTFILE}, 0, 0755},
{"detach", {Qremovedev, 0, QTFILE}, 0, 0755},
{"info", {Qinfo, 0, QTFILE}, 0, 0755},
};
/* OK, we never actually use these, since we won't support any IOMMU that
* requires RWBF (Required Write Buffer Flushing).
*
* On older hardware, if we updated data structures from software, the IOMMU
* wouldn't necessarily see it. The software write would get held up at various
* write buffers. See 6.8.
*
* Certain operations, such as ctx cache and iotlb flushes, were OK. The HW
* would implicitly do a write buffer flush. Other operations, like changing an
* IPT PTE, which do not necessarily require a command flush, would need the
* WBF.
*
* This is different than caching mode (CM). In CM, hardware (or more often a
* virtual IOMMU) caches negative PTEs, and you need to poke the IOMMU whenever
* changing any PTE. This RWBF isn't about caching old values; it's about not
* seeing new values due to buffering.
*
* Just about any time you want to do a CM operation, you'd also want to check
* for RWBF. Though note that we do not use the IOMMU if it requires either CM
* or RWBF. */
static inline void write_buffer_flush(struct iommu *iommu)
{
uint32_t cmd, status;
if (!iommu->rwbf)
return;
cmd = read32(iommu->regio + DMAR_GCMD_REG) | DMA_GCMD_WBF;
write32(cmd, iommu->regio + DMAR_GCMD_REG);
do {
status = read32(iommu->regio + DMAR_GSTS_REG);
} while (status & DMA_GSTS_WBFS);
}
/* OK, read and write draining on flush. At first I thought this was about
* ops that queued up, but hadn't gone through the IOMMU yet. Instead, this is
* about ops that made it through the IOMMU, but have not made it to main
* memory. i.e., the IOMMU translated to a physical address, but the write to
* that paddr hasn't made it to RAM. The reason we ask for a TLB flush is
* typically to make sure the PTE / translation is no longer in use. Undrained
* operations that made it past the IOMMU are still using the old translation.
* Thus we should always read/write drain. */
static void __iotlb_flush_global(struct iommu *iommu)
{
write64(DMA_TLB_IVT | DMA_TLB_READ_DRAIN | DMA_TLB_WRITE_DRAIN |
DMA_TLB_GLOBAL_FLUSH,
iommu->regio + iommu->iotlb_cmd_offset);
while (read64(iommu->regio + iommu->iotlb_cmd_offset) & DMA_TLB_IVT)
cpu_relax();
}
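/* Domain-selective IOTLB invalidation: only flushes translations tagged with
 * 'did' (for us, the owning process's pid or IOMMU_DID_DEFAULT). */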
static void iotlb_flush(struct iommu *iommu, uint16_t did)
{
write64(DMA_TLB_IVT | DMA_TLB_READ_DRAIN | DMA_TLB_WRITE_DRAIN |
DMA_TLB_DSI_FLUSH | DMA_TLB_DID(did),
iommu->regio + iommu->iotlb_cmd_offset);
while (read64(iommu->regio + iommu->iotlb_cmd_offset) & DMA_TLB_IVT)
cpu_relax();
}
static inline struct root_entry *get_root_entry(physaddr_t paddr)
{
return (struct root_entry *) KADDR(paddr);
}
static inline struct context_entry *get_context_entry(physaddr_t paddr)
{
return (struct context_entry *) KADDR(paddr);
}
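/* Pass-through context entry: TT = 10b means DMA requests bypass second-level
 * translation entirely, so the device sees an identity map (IOVA == PA). */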
static void __cte_set_identity_pgtbl(struct context_entry *cte)
{
cte->hi = 0
| (IOMMU_DID_DEFAULT << CTX_HI_DID_SHIFT) // DID bit: 72 to 87
| (CTX_AW_L4 << CTX_HI_AW_SHIFT); // AW
cte->lo = 0 /* assumes page alignment */
| (0x2 << CTX_LO_TRANS_SHIFT)
| (0x1 << CTX_LO_FPD_SHIFT) // disable faults
| (0x1 << CTX_LO_PRESENT_SHIFT); /* mark present */
}
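/* Translating context entry: TT = 00b sends DMA through the proc's
 * second-level (EPT-formatted) page tables, tagged with DID = pid. */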
static void __cte_set_proc_pgtbl(struct context_entry *cte, struct proc *p)
{
/* TODO: need to limit PID to 16 bits or come up with an alternative */
warn_on(p->pid & ~0xffff);
cte->hi = 0
| ((uint16_t)p->pid << CTX_HI_DID_SHIFT) // DID bit: 72 to 87
| (CTX_AW_L4 << CTX_HI_AW_SHIFT); // AW
/* The only difference here is PGDIR and the LO_TRANS_SHIFT */
cte->lo = PTE_ADDR(p->env_pgdir.eptp)
| (0x0 << CTX_LO_TRANS_SHIFT)
| (0x1 << CTX_LO_FPD_SHIFT) // disable faults
| (0x1 << CTX_LO_PRESENT_SHIFT); /* mark present */
}
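/* Get a new context table for one bus: 256 entries (32 devs * 8 funcs), each
 * initially set to the identity (pass-through) map. */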
static physaddr_t ct_init(void)
{
struct context_entry *cte;
physaddr_t ct;
cte = (struct context_entry *) kpage_zalloc_addr();
ct = PADDR(cte);
for (int i = 0; i < 32 * 8; i++, cte++) // device * func
__cte_set_identity_pgtbl(cte);
return ct;
}
/* Get a new root_entry table. Allocates all context entries. */
static physaddr_t rt_init(void)
{
struct root_entry *rte;
physaddr_t rt;
physaddr_t ct;
/* Page Align = 0x1000 */
rte = (struct root_entry *) kpage_zalloc_addr();
rt = PADDR(rte);
/* create context table */
for (int i = 0; i < 256; i++, rte++) {
ct = ct_init();
rte->hi = 0;
rte->lo = 0
| ct
| (0x1 << RT_LO_PRESENT_SHIFT);
}
return rt;
}
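/* Two-level lookup: the root table is indexed by bus, and each root entry
 * points to a context table indexed by devfn (dev * 8 + func). */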
static struct context_entry *get_ctx_for(struct iommu *iommu,
struct pci_device *pdev)
{
struct root_entry *rte;
physaddr_t cte_phy;
struct context_entry *cte;
uint32_t offset = 0;
rte = get_root_entry(iommu->roottable) + pdev->bus;
cte_phy = rte->lo & 0xFFFFFFFFFFFFF000;
cte = get_context_entry(cte_phy);
offset = (pdev->dev * 8) + pdev->func;
cte += offset;
return cte;
}
static void __iommu_clear_pgtbl(struct pci_device *pdev, uint16_t did)
{
struct iommu *iommu = pdev->iommu;
struct context_entry *cte = get_ctx_for(iommu, pdev);
	cte->lo &= ~0x1; /* clear the present bit */
spin_lock_irqsave(&iommu->iommu_lock);
iotlb_flush(iommu, did);
spin_unlock_irqsave(&iommu->iommu_lock);
}
/* Hold the proc's dev_qlock. This returns the linkage for p and i, inserting
 * one if it didn't already exist. */
static struct iommu_proc_link *__get_linkage(struct proc *p, struct iommu *i)
{
struct iommu_proc_link *l;
list_for_each_entry(l, &p->iommus, link) {
if (l->i == i)
return l;
}
l = kmalloc(sizeof(struct iommu_proc_link), MEM_WAIT);
l->i = i;
l->p = p;
l->nr_devices = 0;
list_add_rcu(&l->link, &p->iommus);
return l;
}
/* Caller holds the pdev->qlock and if proc, the proc->dev_qlock.
* Careful, this can throw. */
void __iommu_device_assign(struct pci_device *pdev, struct proc *proc)
{
struct iommu *iommu = pdev->iommu;
struct iommu_proc_link *l;
if (!proc) {
__cte_set_identity_pgtbl(get_ctx_for(pdev->iommu, pdev));
return;
}
/* Lockless peek. We hold the dev_qlock, so if we are concurrently
* dying, proc_destroy() will come behind us and undo this. If
* proc_destroy() already removed all devices, we would see DYING. */
if (proc_is_dying(proc))
error(EINVAL, "process is dying");
l = __get_linkage(proc, iommu);
l->nr_devices++;
TAILQ_INSERT_TAIL(&proc->pci_devs, pdev, proc_link);
__cte_set_proc_pgtbl(get_ctx_for(pdev->iommu, pdev), proc);
}
/* Caller holds the pdev->qlock and if proc, the proc->dev_qlock. */
void __iommu_device_unassign(struct pci_device *pdev, struct proc *proc)
{
	struct iommu *iommu = pdev->iommu;
	struct iommu_proc_link *l;
if (!proc) {
__iommu_clear_pgtbl(pdev, IOMMU_DID_DEFAULT);
return;
}
l = __get_linkage(proc, iommu);
__iommu_clear_pgtbl(pdev, proc->pid);
l->nr_devices--;
if (!l->nr_devices) {
list_del_rcu(&l->link);
kfree_rcu(l, rcu);
}
TAILQ_REMOVE(&proc->pci_devs, pdev, proc_link);
}
void iommu_unassign_all_devices(struct proc *p)
{
struct pci_device *pdev, *tp;
qlock(&p->dev_qlock);
/* If you want to get clever and try to batch up the iotlb flushes, it's
* probably not worth it. The big concern is that the moment you unlock
* the pdev, it can be reassigned. If you didn't flush the iotlb yet,
* it might have old entries. Note that when we flush, we pass the DID
* (p->pid), which the next user of the pdev won't know. I don't know
* if you need to flush the old DID entry or not before reusing a CTE,
* though probably. */
TAILQ_FOREACH_SAFE(pdev, &p->pci_devs, proc_link, tp) {
qlock(&pdev->qlock);
pci_device_unassign_known(pdev, p);
qunlock(&pdev->qlock);
}
qunlock(&p->dev_qlock);
}
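/* Flush this proc's IOTLB entries (DID == pid) on every IOMMU it has devices
 * on, e.g. after changes to the proc's page tables. */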
void proc_iotlb_flush(struct proc *p)
{
struct iommu_proc_link *l;
rcu_read_lock();
list_for_each_entry_rcu(l, &p->iommus, link) {
spin_lock_irqsave(&l->i->iommu_lock);
iotlb_flush(l->i, p->pid);
spin_unlock_irqsave(&l->i->iommu_lock);
}
rcu_read_unlock();
}
static void __set_root_table(struct iommu *iommu, physaddr_t roottable)
{
write64(roottable, iommu->regio + DMAR_RTADDR_REG);
write32(DMA_GCMD_SRTP, iommu->regio + DMAR_GCMD_REG);
/* Unlike the write-buffer-flush status and ICC completion check,
* hardware *sets* the bit to 1 when it is done */
while (!(read32(iommu->regio + DMAR_GSTS_REG) & DMA_GSTS_RTPS))
cpu_relax();
}
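/* Globally invalidate the context cache; HW clears ICC when it is done. */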
static void __inval_ctx_cache_global(struct iommu *iommu)
{
write64(DMA_CCMD_ICC | DMA_CCMD_GLOBAL_INVL,
iommu->regio + DMAR_CCMD_REG);
while (read64(iommu->regio + DMAR_CCMD_REG) & DMA_CCMD_ICC)
cpu_relax();
}
static void __enable_translation(struct iommu *iommu)
{
/* see 10.4.4 for some concerns if we want to update multiple fields.
* (read status, mask the one-shot commands we don't want on, then set
* the ones we do want). */
write32(DMA_GCMD_TE, iommu->regio + DMAR_GCMD_REG);
while (!(read32(iommu->regio + DMAR_GSTS_REG) & DMA_GSTS_TES))
cpu_relax();
}
/* Given an iommu with a root table, enable translation. The default root table
 * (from rt_init()) is set up to not translate, i.e. IOVA == PA. */
static void iommu_enable_translation(struct iommu *iommu)
{
spin_lock_irqsave(&iommu->iommu_lock);
__set_root_table(iommu, iommu->roottable);
__inval_ctx_cache_global(iommu);
__iotlb_flush_global(iommu);
__enable_translation(iommu);
spin_unlock_irqsave(&iommu->iommu_lock);
}
/* Iterate over all IOMMUs and make sure the register base addresses ("rba")
 * reported in the DRHDs are unique. */
static bool iommu_assert_unique_regio(void)
{
struct iommu *outer, *inner;
uint64_t rba;
bool result = true;
TAILQ_FOREACH(outer, &iommu_list, iommu_link) {
rba = outer->rba;
TAILQ_FOREACH(inner, &iommu_list, iommu_link) {
if (outer != inner && rba == inner->rba) {
outer->supported = false;
result = false;
}
}
}
return result;
}
static bool iommu_has_required_capabilities(struct iommu *iommu)
{
uint64_t cap, ecap;
bool support, result = true;
cap = read64(iommu->regio + DMAR_CAP_REG);
ecap = read64(iommu->regio + DMAR_ECAP_REG);
	/* SAGAW bit 2: 4-level (48-bit) second-level page tables */
	support = (cap_sagaw(cap) & 0x4) >> 2;
if (!support) {
printk(IOMMU "%p: unsupported paging level: 0x%x\n",
iommu, cap_sagaw(cap));
result = false;
}
support = cap_super_page_val(cap) & 0x1;
if (!support) {
printk(IOMMU "%p: 1GB super pages not supported\n", iommu);
result = false;
}
if (cap_rwbf(cap)) {
printk(IOMMU "%p: HW requires RWBF, will abort\n", iommu);
result = false;
}
if (cap_caching_mode(cap)) {
printk(IOMMU "%p: HW requires caching_mode, will abort\n",
iommu);
result = false;
}
support = ecap_pass_through(ecap);
if (!support) {
printk(IOMMU "%p: pass-through translation type in context entries not supported\n", iommu);
result = false;
}
/* max gaw/haw reported by iommu. It's fine if these differ. Spec says
* MGAW must be at least the HAW. It's OK to be more. */
iommu->haw_cap = cap_mgaw(cap);
if (iommu->haw_cap < iommu->haw_dmar) {
printk(IOMMU "%p: HAW mismatch; DMAR reports %d, CAP reports %d, check CPUID\n",
iommu, iommu->haw_dmar, iommu->haw_cap);
}
return result;
}
/* All or nothing */
static bool have_iommu_support(void)
{
struct iommu *iommu;
if (TAILQ_EMPTY(&iommu_list))
return false;
TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
if (!iommu->supported)
return false;
}
return true;
}
/* Run this function after all individual IOMMUs are initialized. */
void iommu_enable_all(void)
{
struct iommu *iommu;
static bool once = false;
	if (once)
		warn(IOMMU "iommu_enable_all() called more than once!");
once = true;
	if (!iommu_assert_unique_regio())
warn(IOMMU "same register base addresses detected");
iommu_is_supported = have_iommu_support();
if (!iommu_is_supported) {
printk("No supported IOMMUs detected\n");
return;
}
TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
printk("IOMMU: enabling translation on %p\n", iommu);
iommu_enable_translation(iommu);
}
}
static bool _iommu_is_enabled(struct iommu *iommu)
{
uint32_t status = 0;
/* Arguably we don't need the lock when reading. */
spin_lock_irqsave(&iommu->iommu_lock);
status = read32(iommu->regio + DMAR_GSTS_REG);
spin_unlock_irqsave(&iommu->iommu_lock);
return status & DMA_GSTS_TES;
}
static bool iommu_some_is_enabled(void)
{
struct iommu *iommu;
TAILQ_FOREACH(iommu, &iommu_list, iommu_link)
if (_iommu_is_enabled(iommu))
return true;
return false;
}
/* grabs the iommu of the first DRHD with INCLUDE_PCI_ALL */
struct iommu *get_default_iommu(void)
{
struct Dmar *dt;
/* dmar is a global variable; see acpi.h */
if (dmar == NULL)
return NULL;
dt = dmar->tbl;
for (int i = 0; i < dmar->nchildren; i++) {
struct Atable *at = dmar->children[i];
struct Drhd *drhd = at->tbl;
if (drhd->all & 1)
return &drhd->iommu;
}
return NULL;
}
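/* Assign every PCI device to the default IOMMU; per the TODO above, scoped
 * DRHDs are not handled yet. */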
void iommu_map_pci_devices(void)
{
struct pci_device *pci_iter;
struct iommu *iommu = get_default_iommu();
if (!iommu)
return;
/* set the default iommu */
STAILQ_FOREACH(pci_iter, &pci_devices, all_dev) {
pci_iter->iommu = iommu;
TAILQ_INSERT_TAIL(&iommu->pci_devs, pci_iter, iommu_link);
}
}
/* This is called from acpi.c to initialize an iommu. */
void iommu_acpi_init(struct iommu *iommu, uint8_t haw, uint64_t rba)
{
uint64_t cap, ecap;
TAILQ_INIT(&iommu->pci_devs);
spinlock_init_irqsave(&iommu->iommu_lock);
iommu->rba = rba;
iommu->regio = (void __iomem *) vmap_pmem_nocache(rba, VTD_PAGE_SIZE);
	if (!iommu->regio) {
		warn("Unable to map the iommu, aborting!");
		return;
	}
iommu->haw_dmar = haw;
iommu->supported = iommu_has_required_capabilities(iommu);
cap = read64(iommu->regio + DMAR_CAP_REG);
ecap = read64(iommu->regio + DMAR_ECAP_REG);
/* Creates a root table for non-translating identity maps, but it is not
* enabled / turned on yet. */
iommu->roottable = rt_init();
	/* ECAP reports one 16-byte IOTLB register block: the invalidate-address
	 * reg at the offset, the IOTLB command reg 8 bytes after it. */
	iommu->iotlb_cmd_offset = ecap_iotlb_offset(ecap) + 8;
	iommu->iotlb_addr_offset = ecap_iotlb_offset(ecap);
iommu->rwbf = cap_rwbf(cap);
iommu->device_iotlb = ecap_dev_iotlb_support(ecap);
/* add the iommu to the list of all discovered iommu */
TAILQ_INSERT_TAIL(&iommu_list, iommu, iommu_link);
}
static void assign_device(int bus, int dev, int func, pid_t pid)
{
ERRSTACK(1);
int tbdf = MKBUS(BusPCI, bus, dev, func);
struct pci_device *pdev = pci_match_tbdf(tbdf);
struct proc *p;
if (!pdev)
error(EIO, "cannot find dev %x:%x.%x\n", bus, dev, func);
if (!pid) {
pci_device_assign(pdev, NULL);
return;
}
if (pid == 1)
error(EIO, "device passthru not supported for pid = 1");
p = pid2proc(pid);
if (!p)
error(EIO, "cannot find pid %d\n", pid);
if (waserror()) {
proc_decref(p);
nexterror();
}
pci_device_assign(pdev, p);
proc_decref(p);
poperror();
}
static void unassign_device(int bus, int dev, int func, pid_t pid)
{
ERRSTACK(1);
int tbdf = MKBUS(BusPCI, bus, dev, func);
struct pci_device *pdev = pci_match_tbdf(tbdf);
struct proc *p;
if (!pdev)
error(EIO, "cannot find dev %x:%x.%x\n", bus, dev, func);
if (!pid) {
pci_device_unassign(pdev, NULL);
return;
}
p = pid2proc(pid);
if (!p)
error(EIO, "cannot find pid %d\n", pid);
if (waserror()) {
proc_decref(p);
nexterror();
}
pci_device_unassign(pdev, p);
proc_decref(p);
poperror();
}
static struct sized_alloc *open_mappings(void)
{
	struct iommu *iommu;
	bool has_dev;
	struct pci_device *pdev;
	struct sized_alloc *sza = sized_kzmalloc(BUFFERSZ, MEM_WAIT);
	TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
		has_dev = false;
sza_printf(sza, "Mappings for iommu@%p\n", iommu);
spin_lock_irqsave(&iommu->iommu_lock);
TAILQ_FOREACH(pdev, &iommu->pci_devs, iommu_link) {
if (!pdev->proc_owner)
continue;
has_dev = true;
sza_printf(sza, "\tdevice %02x:%02x.%x, PID %u\n",
pdev->bus, pdev->dev, pdev->func,
pdev->proc_owner->pid);
}
spin_unlock_irqsave(&iommu->iommu_lock);
if (!has_dev)
sza_printf(sza, "\t<empty>\n");
}
return sza;
}
static void _open_info(struct iommu *iommu, struct sized_alloc *sza)
{
uint64_t value;
sza_printf(sza, "\niommu@%p\n", iommu);
sza_printf(sza, "\trba = %p\n", iommu->rba);
sza_printf(sza, "\tsupported = %s\n", iommu->supported ? "yes" : "no");
sza_printf(sza, "\tregspace = %p\n", iommu->regio);
sza_printf(sza, "\thost addr width (dmar) = %d\n", iommu->haw_dmar);
sza_printf(sza, "\thost addr width (cap[mgaw]) = %d\n",
iommu->haw_cap);
value = read32(iommu->regio + DMAR_VER_REG);
sza_printf(sza, "\tversion = 0x%x\n", value);
value = read64(iommu->regio + DMAR_CAP_REG);
sza_printf(sza, "\tcapabilities = %p\n", value);
sza_printf(sza, "\t\tmgaw: %d\n", cap_mgaw(value));
sza_printf(sza, "\t\tsagaw (paging level): 0x%x\n", cap_sagaw(value));
sza_printf(sza, "\t\tcaching mode: %s (%d)\n", cap_caching_mode(value) ?
"yes" : "no", cap_caching_mode(value));
sza_printf(sza, "\t\tzlr: 0x%x\n", cap_zlr(value));
sza_printf(sza, "\t\trwbf: %s\n", cap_rwbf(value) ? "required"
: "not required");
sza_printf(sza, "\t\tnum domains: %d\n", cap_ndoms(value));
sza_printf(sza, "\t\tsupports protected high-memory region: %s\n",
cap_phmr(value) ? "yes" : "no");
sza_printf(sza, "\t\tsupports Protected low-memory region: %s\n",
cap_plmr(value) ? "yes" : "no");
value = read64(iommu->regio + DMAR_ECAP_REG);
sza_printf(sza, "\text. capabilities = %p\n", value);
sza_printf(sza, "\t\tpass through: %s\n",
ecap_pass_through(value) ? "yes" : "no");
sza_printf(sza, "\t\tdevice iotlb: %s\n",
ecap_dev_iotlb_support(value) ? "yes" : "no");
sza_printf(sza, "\t\tiotlb register offset: 0x%x\n",
ecap_iotlb_offset(value));
sza_printf(sza, "\t\tsnoop control: %s\n",
ecap_sc_support(value) ? "yes" : "no");
sza_printf(sza, "\t\tcoherency: %s\n",
ecap_coherent(value) ? "yes" : "no");
sza_printf(sza, "\t\tqueue invalidation support: %s\n",
ecap_qis(value) ? "yes" : "no");
sza_printf(sza, "\t\tinterrupt remapping support: %s\n",
ecap_ir_support(value) ? "yes" : "no");
sza_printf(sza, "\t\textended interrupt mode: 0x%x\n",
ecap_eim_support(value));
value = read32(iommu->regio + DMAR_GSTS_REG);
sza_printf(sza, "\tglobal status = 0x%x\n", value);
sza_printf(sza, "\t\ttranslation: %s\n",
value & DMA_GSTS_TES ? "enabled" : "disabled");
sza_printf(sza, "\t\troot table: %s\n",
value & DMA_GSTS_RTPS ? "set" : "not set");
value = read64(iommu->regio + DMAR_RTADDR_REG);
sza_printf(sza, "\troot entry table = %p (phy) or %p (vir)\n",
value, KADDR(value));
}
static struct sized_alloc *open_info(void)
{
	struct sized_alloc *sza = sized_kzmalloc(BUFFERSZ, MEM_WAIT);
	struct iommu *iommu;
	sza_printf(sza, "driver info:\n");
	sza_printf(sza, "\tdefault did = %d\n", IOMMU_DID_DEFAULT);
sza_printf(sza, "\tstatus = %s\n",
iommu_some_is_enabled() ? "enabled" : "disabled");
TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
_open_info(iommu, sza);
}
return sza;
}
static char *devname(void)
{
return iommudevtab.name;
}
static struct chan *iommuattach(char *spec)
{
return devattach(devname(), spec);
}
static struct walkqid *iommuwalk(struct chan *c, struct chan *nc, char **name,
unsigned int nname)
{
return devwalk(c, nc, name, nname, iommudir,
ARRAY_SIZE(iommudir), devgen);
}
static size_t iommustat(struct chan *c, uint8_t *dp, size_t n)
{
return devstat(c, dp, n, iommudir, ARRAY_SIZE(iommudir), devgen);
}
static struct chan *iommuopen(struct chan *c, int omode)
{
switch (c->qid.path) {
case Qmappings:
c->synth_buf = open_mappings();
break;
case Qinfo:
c->synth_buf = open_info();
break;
case Qadddev:
case Qremovedev:
case Qdir:
default:
break;
}
return devopen(c, omode, iommudir, ARRAY_SIZE(iommudir), devgen);
}
/*
* All files are synthetic. Hence we do not need to implement any close
* function.
*/
static void iommuclose(struct chan *c)
{
switch (c->qid.path) {
case Qmappings:
case Qinfo:
kfree(c->synth_buf);
c->synth_buf = NULL;
break;
case Qadddev:
case Qremovedev:
case Qdir:
default:
break;
}
}
static size_t iommuread(struct chan *c, void *va, size_t n, off64_t offset)
{
struct sized_alloc *sza = c->synth_buf;
switch (c->qid.path) {
case Qdir:
return devdirread(c, va, n, iommudir,
ARRAY_SIZE(iommudir), devgen);
case Qadddev:
return readstr(offset, va, n,
"write format: xx:yy.z pid\n"
" xx = bus (in hex)\n"
" yy = device (in hex)\n"
" z = function (in hex)\n"
" pid = process pid\n"
"\nexample:\n"
"$ echo 00:1f.2 13 >\\#iommu/attach\n");
	case Qremovedev:
		return readstr(offset, va, n,
			       "write format: xx:yy.z pid\n"
			       " xx = bus (in hex)\n"
			       " yy = device (in hex)\n"
			       " z = function (in hex)\n"
			       " pid = process pid\n"
			       "\nexample:\n"
			       "$ echo 00:1f.2 13 >\\#iommu/detach\n");
case Qmappings:
case Qinfo:
return readstr(offset, va, n, sza->buf);
default:
error(EIO, "read: qid %d is impossible", c->qid.path);
}
return -1; /* not reached */
}
static void get_bdf_pid(struct cmdbuf *cb, int *bus, int *dev, int *func,
pid_t *pid)
{
int err;
if (cb->nf < 2)
error(EFAIL, "bb:dd.f pid");
err = sscanf(cb->f[0], "%x:%x.%x", bus, dev, func);
if (err != 3)
error(EIO,
IOMMU "error parsing bdf %s; nr parsed: %d", cb->f[0], err);
*pid = strtoul(cb->f[1], 0, 0);
}
static void write_add_dev(struct chan *c, struct cmdbuf *cb)
{
int bus, dev, func;
pid_t pid;
get_bdf_pid(cb, &bus, &dev, &func, &pid);
if (pid == 1)
error(EIO, IOMMU "device passthru not supported for pid = 1");
assign_device(bus, dev, func, pid);
}
static void write_remove_dev(struct chan *c, struct cmdbuf *cb)
{
int bus, dev, func;
pid_t pid;
get_bdf_pid(cb, &bus, &dev, &func, &pid);
unassign_device(bus, dev, func, pid);
}
static size_t iommuwrite(struct chan *c, void *va, size_t n, off64_t offset)
{
ERRSTACK(1);
struct cmdbuf *cb = parsecmd(va, n);
if (waserror()) {
kfree(cb);
nexterror();
}
switch (c->qid.path) {
case Qadddev:
if (!iommu_is_supported)
error(EROFS, IOMMU "not supported");
write_add_dev(c, cb);
break;
case Qremovedev:
if (!iommu_is_supported)
error(EROFS, IOMMU "not supported");
write_remove_dev(c, cb);
break;
case Qmappings:
case Qinfo:
case Qdir:
error(EROFS, IOMMU "cannot modify");
default:
error(EIO, "write: qid %d is impossible", c->qid.path);
}
kfree(cb);
poperror();
return n;
}
struct dev iommudevtab __devtab = {
.name = "iommu",
.reset = devreset,
.init = devinit,
.shutdown = devshutdown,
.attach = iommuattach,
.walk = iommuwalk,
.stat = iommustat,
.open = iommuopen,
.create = devcreate,
.close = iommuclose,
.read = iommuread,
.bread = devbread,
.write = iommuwrite,
.bwrite = devbwrite,
.remove = devremove,
.wstat = devwstat,
};