/* Copyright (c) 2019, 2020 Google, Inc.
*
 * Driver for the Intel IOMMU (VT-d)
*
* Aditya Basu <mitthu@google.com>
* Barret Rhoden <brho@cs.berkeley.edu>
*
 * Lock ordering: acquire (1) before (2):
 * (1) proc->dev_qlock => (2) iommu->iommu_lock
 * (1) proc->dev_qlock => (2) pdev->qlock
*
* TODO
* ====
* - In iommu_map_pci_devices() assign the correct iommu for scoped DRHD. Right
* now the default iommu is assigned to all devices.
* - IOMMU_DID_DEFAULT = 1; this means pid = 1 cannot have a device passthru
* because we use the pid as "did" or domain ID.
*
* lifecycle of CTE entries:
* - at boot, every CTE (per pdev on an iommu) is set to non-translating. In
* essence, an identity map.
* - pci devices are initially assigned to the kernel.
* - when devices are unassigned, their cte mapping is destroyed.
* - when they are reassigned, their mapping is set to either an identity map
* (kernel) or a process's page table.
*
* - On the topic of disabling the IOMMU, we used to have an option to just
* unset it completely. Disable TE, clear the root pointer. Though the code
* we had was hokey and broken. Even then, if we have a device behind an
* IOMMU and disable the IOMMU, that would just fuck everything up. Maybe if
* we had identity mapped pages in the IPT, so that when translation turned
* off, the device would still work. Seems like a mess.
*
* - We ought to do a domain-selective, context-cache invalidation whenever we
* reuse DIDs. aka, whenever there is a new IPT for a pid, which is every 65k
* processes. Or maybe every 16k, depending on how many pids we have.
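 *
 * Userspace drives attach/detach through the #iommu device (the exact write
 * formats are documented in iommuread() below), e.g.:
 *	$ echo 00:1f.2 13 >\#iommu/attach
 *	$ echo 00:1f.2 13 >\#iommu/detach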
*/
#include <stdio.h>
#include <error.h>
#include <common.h>
#include <net/ip.h>
#include <atomic.h>
#include <acpi.h>
#include <arch/intel-iommu.h>
#include <env.h>
#include <arch/pci.h>
#include <linux_compat.h>
#define IOMMU "iommu: "
#define BUFFERSZ 8192
struct dev iommudevtab;
static struct iommu_list_tq iommu_list = TAILQ_HEAD_INITIALIZER(iommu_list);
static bool iommu_is_supported;
/* QID Path */
enum {
Qdir = 0,
Qmappings = 1,
Qadddev = 2,
Qremovedev = 3,
Qinfo = 4,
};
static struct dirtab iommudir[] = {
{".", {Qdir, 0, QTDIR}, 0, 0555},
{"mappings", {Qmappings, 0, QTFILE}, 0, 0755},
{"attach", {Qadddev, 0, QTFILE}, 0, 0755},
{"detach", {Qremovedev, 0, QTFILE}, 0, 0755},
{"info", {Qinfo, 0, QTFILE}, 0, 0755},
};
/* OK, we never actually use these, since we won't support any IOMMU that
* requires RWBF (Required Write Buffer Flushing).
*
* On older hardware, if we updated data structures from software, the IOMMU
* wouldn't necessarily see it. The software write would get held up at various
* write buffers. See 6.8.
*
* Certain operations, such as ctx cache and iotlb flushes, were OK. The HW
* would implicitly do a write buffer flush. Other operations, like changing an
* IPT PTE, which do not necessarily require a command flush, would need the
* WBF.
*
* This is different than caching mode (CM). In CM, hardware (or more often a
* virtual IOMMU) caches negative PTEs, and you need to poke the IOMMU whenever
* changing any PTE. This RWBF isn't about caching old values; it's about not
* seeing new values due to buffering.
*
* Just about any time you want to do a CM operation, you'd also want to check
* for RWBF. Though note that we do not use the IOMMU if it requires either CM
* or RWBF. */
static inline void write_buffer_flush(struct iommu *iommu)
{
uint32_t cmd, status;
if (!iommu->rwbf)
return;
cmd = read32(iommu->regio + DMAR_GCMD_REG) | DMA_GCMD_WBF;
write32(cmd, iommu->regio + DMAR_GCMD_REG);
do {
status = read32(iommu->regio + DMAR_GSTS_REG);
} while (status & DMA_GSTS_WBFS);
}
/* OK, read and write draining on flush. At first I thought this was about
* ops that queued up, but hadn't gone through the IOMMU yet. Instead, this is
* about ops that made it through the IOMMU, but have not made it to main
* memory. i.e., the IOMMU translated to a physical address, but the write to
* that paddr hasn't made it to RAM. The reason we ask for a TLB flush is
* typically to make sure the PTE / translation is no longer in use. Undrained
* operations that made it past the IOMMU are still using the old translation.
* Thus we should always read/write drain. */
static void __iotlb_flush_global(struct iommu *iommu)
{
write64(DMA_TLB_IVT | DMA_TLB_READ_DRAIN | DMA_TLB_WRITE_DRAIN |
DMA_TLB_GLOBAL_FLUSH,
iommu->regio + iommu->iotlb_cmd_offset);
while (read64(iommu->regio + iommu->iotlb_cmd_offset) & DMA_TLB_IVT)
cpu_relax();
}
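/* Domain-selective IOTLB invalidation: only flushes translations tagged with
 * 'did' (for us, the owning process's pid or IOMMU_DID_DEFAULT). */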
static void iotlb_flush(struct iommu *iommu, uint16_t did)
{
write64(DMA_TLB_IVT | DMA_TLB_READ_DRAIN | DMA_TLB_WRITE_DRAIN |
DMA_TLB_DSI_FLUSH | DMA_TLB_DID(did),
iommu->regio + iommu->iotlb_cmd_offset);
while (read64(iommu->regio + iommu->iotlb_cmd_offset) & DMA_TLB_IVT)
cpu_relax();
}
static inline struct root_entry *get_root_entry(physaddr_t paddr)
{
return (struct root_entry *) KADDR(paddr);
}
static inline struct context_entry *get_context_entry(physaddr_t paddr)
{
return (struct context_entry *) KADDR(paddr);
}
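/* Pass-through context entry: TT = 10b means DMA requests bypass second-level
 * translation entirely, so the device sees an identity map (IOVA == PA). */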
static void __cte_set_identity_pgtbl(struct context_entry *cte)
{
cte->hi = 0
| (IOMMU_DID_DEFAULT << CTX_HI_DID_SHIFT) // DID bit: 72 to 87
| (CTX_AW_L4 << CTX_HI_AW_SHIFT); // AW
cte->lo = 0 /* assumes page alignment */
| (0x2 << CTX_LO_TRANS_SHIFT)
| (0x1 << CTX_LO_FPD_SHIFT) // disable faults
| (0x1 << CTX_LO_PRESENT_SHIFT); /* mark present */
}
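/* Translating context entry: TT = 00b sends DMA through the proc's
 * second-level (EPT-formatted) page tables, tagged with DID = pid. */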
static void __cte_set_proc_pgtbl(struct context_entry *cte, struct proc *p)
{
/* TODO: need to limit PID to 16 bits or come up with an alternative */
warn_on(p->pid & ~0xffff);
cte->hi = 0
| ((uint16_t)p->pid << CTX_HI_DID_SHIFT) // DID bit: 72 to 87
| (CTX_AW_L4 << CTX_HI_AW_SHIFT); // AW
/* The only difference here is PGDIR and the LO_TRANS_SHIFT */
cte->lo = PTE_ADDR(p->env_pgdir.eptp)
| (0x0 << CTX_LO_TRANS_SHIFT)
| (0x1 << CTX_LO_FPD_SHIFT) // disable faults
| (0x1 << CTX_LO_PRESENT_SHIFT); /* mark present */
}
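/* Get a new context table for one bus: 256 entries (32 devs * 8 funcs), each
 * initially set to the identity (pass-through) map. */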
static physaddr_t ct_init(void)
{
struct context_entry *cte;
physaddr_t ct;
cte = (struct context_entry *) kpage_zalloc_addr();
ct = PADDR(cte);
for (int i = 0; i < 32 * 8; i++, cte++) // device * func
__cte_set_identity_pgtbl(cte);
return ct;
}
/* Get a new root_entry table. Allocates all context entries. */
static physaddr_t rt_init(void)
{
struct root_entry *rte;
physaddr_t rt;
physaddr_t ct;
/* Page Align = 0x1000 */
rte = (struct root_entry *) kpage_zalloc_addr();
rt = PADDR(rte);
/* create context table */
for (int i = 0; i < 256; i++, rte++) {
ct = ct_init();
rte->hi = 0;
rte->lo = 0
| ct
| (0x1 << RT_LO_PRESENT_SHIFT);
}
return rt;
}
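/* Two-level lookup: the root table is indexed by bus, and each root entry
 * points to a context table indexed by devfn (dev * 8 + func). */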
static struct context_entry *get_ctx_for(struct iommu *iommu,
struct pci_device *pdev)
{
struct root_entry *rte;
physaddr_t cte_phy;
struct context_entry *cte;
uint32_t offset = 0;
rte = get_root_entry(iommu->roottable) + pdev->bus;
cte_phy = rte->lo & 0xFFFFFFFFFFFFF000;
cte = get_context_entry(cte_phy);
offset = (pdev->dev * 8) + pdev->func;
cte += offset;
return cte;
}
static void __iommu_clear_pgtbl(struct pci_device *pdev, uint16_t did)
{
struct iommu *iommu = pdev->iommu;
struct context_entry *cte = get_ctx_for(iommu, pdev);
	cte->lo &= ~0x1; /* clear the present bit */
spin_lock_irqsave(&iommu->iommu_lock);
iotlb_flush(iommu, did);
spin_unlock_irqsave(&iommu->iommu_lock);
}
/* Hold the proc's dev_qlock. This returns the linkage for p and i, inserting
 * one if it didn't already exist. */
static struct iommu_proc_link *__get_linkage(struct proc *p, struct iommu *i)
{
struct iommu_proc_link *l;
list_for_each_entry(l, &p->iommus, link) {
if (l->i == i)
return l;
}
l = kmalloc(sizeof(struct iommu_proc_link), MEM_WAIT);
l->i = i;
l->p = p;
l->nr_devices = 0;
list_add_rcu(&l->link, &p->iommus);
return l;
}
/* Caller holds the pdev->qlock and if proc, the proc->dev_qlock.
* Careful, this can throw. */
void __iommu_device_assign(struct pci_device *pdev, struct proc *proc)
{
struct iommu *iommu = pdev->iommu;
struct iommu_proc_link *l;
if (!proc) {
__cte_set_identity_pgtbl(get_ctx_for(pdev->iommu, pdev));
return;
}
/* Lockless peek. We hold the dev_qlock, so if we are concurrently
* dying, proc_destroy() will come behind us and undo this. If
* proc_destroy() already removed all devices, we would see DYING. */
if (proc_is_dying(proc))
error(EINVAL, "process is dying");
l = __get_linkage(proc, iommu);
l->nr_devices++;
TAILQ_INSERT_TAIL(&proc->pci_devs, pdev, proc_link);
__cte_set_proc_pgtbl(get_ctx_for(pdev->iommu, pdev), proc);
}
/* Caller holds the pdev->qlock and if proc, the proc->dev_qlock. */
void __iommu_device_unassign(struct pci_device *pdev, struct proc *proc)
{
	struct iommu *iommu = pdev->iommu;
	struct iommu_proc_link *l;
if (!proc) {
__iommu_clear_pgtbl(pdev, IOMMU_DID_DEFAULT);
return;
}
l = __get_linkage(proc, iommu);
__iommu_clear_pgtbl(pdev, proc->pid);
l->nr_devices--;
if (!l->nr_devices) {
list_del_rcu(&l->link);
kfree_rcu(l, rcu);
}
TAILQ_REMOVE(&proc->pci_devs, pdev, proc_link);
}
void iommu_unassign_all_devices(struct proc *p)
{
struct pci_device *pdev, *tp;
qlock(&p->dev_qlock);
/* If you want to get clever and try to batch up the iotlb flushes, it's
* probably not worth it. The big concern is that the moment you unlock
* the pdev, it can be reassigned. If you didn't flush the iotlb yet,
* it might have old entries. Note that when we flush, we pass the DID
* (p->pid), which the next user of the pdev won't know. I don't know
* if you need to flush the old DID entry or not before reusing a CTE,
* though probably. */
TAILQ_FOREACH_SAFE(pdev, &p->pci_devs, proc_link, tp) {
qlock(&pdev->qlock);
pci_device_unassign_known(pdev, p);
qunlock(&pdev->qlock);
}
qunlock(&p->dev_qlock);
}
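/* Flush this proc's IOTLB entries (DID == pid) on every IOMMU it has devices
 * on, e.g. after changes to the proc's page tables. */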
void proc_iotlb_flush(struct proc *p)
{
struct iommu_proc_link *l;
rcu_read_lock();
list_for_each_entry_rcu(l, &p->iommus, link) {
spin_lock_irqsave(&l->i->iommu_lock);
iotlb_flush(l->i, p->pid);
spin_unlock_irqsave(&l->i->iommu_lock);
}
rcu_read_unlock();
}
static void __set_root_table(struct iommu *iommu, physaddr_t roottable)
{
write64(roottable, iommu->regio + DMAR_RTADDR_REG);
write32(DMA_GCMD_SRTP, iommu->regio + DMAR_GCMD_REG);
/* Unlike the write-buffer-flush status and ICC completion check,
* hardware *sets* the bit to 1 when it is done */
while (!(read32(iommu->regio + DMAR_GSTS_REG) & DMA_GSTS_RTPS))
cpu_relax();
}
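/* Globally invalidate the context cache; HW clears ICC when it is done. */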
static void __inval_ctx_cache_global(struct iommu *iommu)
{
write64(DMA_CCMD_ICC | DMA_CCMD_GLOBAL_INVL,
iommu->regio + DMAR_CCMD_REG);
while (read64(iommu->regio + DMAR_CCMD_REG) & DMA_CCMD_ICC)
cpu_relax();
}
static void __enable_translation(struct iommu *iommu)
{
/* see 10.4.4 for some concerns if we want to update multiple fields.
* (read status, mask the one-shot commands we don't want on, then set
* the ones we do want). */
write32(DMA_GCMD_TE, iommu->regio + DMAR_GCMD_REG);
while (!(read32(iommu->regio + DMAR_GSTS_REG) & DMA_GSTS_TES))
cpu_relax();
}
/* Given an iommu with a root table, enable translation. The default root table
 * (from rt_init()) is set up to not translate, i.e. IOVA == PA. */
static void iommu_enable_translation(struct iommu *iommu)
{
spin_lock_irqsave(&iommu->iommu_lock);
__set_root_table(iommu, iommu->roottable);
__inval_ctx_cache_global(iommu);
__iotlb_flush_global(iommu);
__enable_translation(iommu);
spin_unlock_irqsave(&iommu->iommu_lock);
}
/* Iterate over all IOMMUs and make sure the register base addresses ("rba")
 * reported in the DRHDs are unique. */
static bool iommu_assert_unique_regio(void)
{
struct iommu *outer, *inner;
uint64_t rba;
bool result = true;
TAILQ_FOREACH(outer, &iommu_list, iommu_link) {
rba = outer->rba;
TAILQ_FOREACH(inner, &iommu_list, iommu_link) {
if (outer != inner && rba == inner->rba) {
outer->supported = false;
result = false;
}
}
}
return result;
}
static bool iommu_has_required_capabilities(struct iommu *iommu)
{
uint64_t cap, ecap;
bool support, result = true;
cap = read64(iommu->regio + DMAR_CAP_REG);
ecap = read64(iommu->regio + DMAR_ECAP_REG);
	/* SAGAW bit 2: 4-level (48-bit) second-level page tables */
	support = (cap_sagaw(cap) & 0x4) >> 2;
if (!support) {
printk(IOMMU "%p: unsupported paging level: 0x%x\n",
iommu, cap_sagaw(cap));
result = false;
}
support = cap_super_page_val(cap) & 0x1;
if (!support) {
printk(IOMMU "%p: 1GB super pages not supported\n", iommu);
result = false;
}
if (cap_rwbf(cap)) {
printk(IOMMU "%p: HW requires RWBF, will abort\n", iommu);
result = false;
}
if (cap_caching_mode(cap)) {
printk(IOMMU "%p: HW requires caching_mode, will abort\n",
iommu);
result = false;
}
support = ecap_pass_through(ecap);
if (!support) {
printk(IOMMU "%p: pass-through translation type in context entries not supported\n", iommu);
result = false;
}
/* max gaw/haw reported by iommu. It's fine if these differ. Spec says
* MGAW must be at least the HAW. It's OK to be more. */
iommu->haw_cap = cap_mgaw(cap);
if (iommu->haw_cap < iommu->haw_dmar) {
printk(IOMMU "%p: HAW mismatch; DMAR reports %d, CAP reports %d, check CPUID\n",
iommu, iommu->haw_dmar, iommu->haw_cap);
}
return result;
}
/* All or nothing */
static bool have_iommu_support(void)
{
struct iommu *iommu;
if (TAILQ_EMPTY(&iommu_list))
return false;
TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
if (!iommu->supported)
return false;
}
return true;
}
/* Run this function after all individual IOMMUs are initialized. */
void iommu_enable_all(void)
{
struct iommu *iommu;
static bool once = false;
	if (once)
		warn(IOMMU "iommu_enable_all() called more than once!");
once = true;
	if (!iommu_assert_unique_regio())
warn(IOMMU "same register base addresses detected");
iommu_is_supported = have_iommu_support();
if (!iommu_is_supported) {
printk("No supported IOMMUs detected\n");
return;
}
TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
printk("IOMMU: enabling translation on %p\n", iommu);
iommu_enable_translation(iommu);
}
}
static bool _iommu_is_enabled(struct iommu *iommu)
{
uint32_t status = 0;
/* Arguably we don't need the lock when reading. */
spin_lock_irqsave(&iommu->iommu_lock);
status = read32(iommu->regio + DMAR_GSTS_REG);
spin_unlock_irqsave(&iommu->iommu_lock);
return status & DMA_GSTS_TES;
}
static bool iommu_some_is_enabled(void)
{
struct iommu *iommu;
TAILQ_FOREACH(iommu, &iommu_list, iommu_link)
if (_iommu_is_enabled(iommu))
return true;
return false;
}
/* grabs the iommu of the first DRHD with INCLUDE_PCI_ALL */
struct iommu *get_default_iommu(void)
{
struct Dmar *dt;
/* dmar is a global variable; see acpi.h */
if (dmar == NULL)
return NULL;
dt = dmar->tbl;
for (int i = 0; i < dmar->nchildren; i++) {
struct Atable *at = dmar->children[i];
struct Drhd *drhd = at->tbl;
if (drhd->all & 1)
return &drhd->iommu;
}
return NULL;
}
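/* Assign every PCI device to the default IOMMU; per the TODO above, scoped
 * DRHDs are not handled yet. */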
void iommu_map_pci_devices(void)
{
struct pci_device *pci_iter;
struct iommu *iommu = get_default_iommu();
if (!iommu)
return;
/* set the default iommu */
STAILQ_FOREACH(pci_iter, &pci_devices, all_dev) {
pci_iter->iommu = iommu;
TAILQ_INSERT_TAIL(&iommu->pci_devs, pci_iter, iommu_link);
}
}
/* This is called from acpi.c to initialize an iommu. */
void iommu_acpi_init(struct iommu *iommu, uint8_t haw, uint64_t rba)
{
uint64_t cap, ecap;
TAILQ_INIT(&iommu->pci_devs);
spinlock_init_irqsave(&iommu->iommu_lock);
iommu->rba = rba;
iommu->regio = (void __iomem *) vmap_pmem_nocache(rba, VTD_PAGE_SIZE);
	if (!iommu->regio) {
		warn("Unable to map the iommu, aborting!");
		return;
	}
iommu->haw_dmar = haw;
iommu->supported = iommu_has_required_capabilities(iommu);
cap = read64(iommu->regio + DMAR_CAP_REG);
ecap = read64(iommu->regio + DMAR_ECAP_REG);
/* Creates a root table for non-translating identity maps, but it is not
* enabled / turned on yet. */
iommu->roottable = rt_init();
	/* ECAP reports one 16-byte IOTLB register block: the invalidate-address
	 * reg at the offset, the IOTLB command reg 8 bytes after it. */
	iommu->iotlb_cmd_offset = ecap_iotlb_offset(ecap) + 8;
	iommu->iotlb_addr_offset = ecap_iotlb_offset(ecap);
iommu->rwbf = cap_rwbf(cap);
iommu->device_iotlb = ecap_dev_iotlb_support(ecap);
/* add the iommu to the list of all discovered iommu */
TAILQ_INSERT_TAIL(&iommu_list, iommu, iommu_link);
}
static void assign_device(int bus, int dev, int func, pid_t pid)
{
ERRSTACK(1);
int tbdf = MKBUS(BusPCI, bus, dev, func);
struct pci_device *pdev = pci_match_tbdf(tbdf);
struct proc *p;
if (!pdev)
error(EIO, "cannot find dev %x:%x.%x\n", bus, dev, func);
if (!pid) {
pci_device_assign(pdev, NULL);
return;
}
if (pid == 1)
error(EIO, "device passthru not supported for pid = 1");
p = pid2proc(pid);
if (!p)
error(EIO, "cannot find pid %d\n", pid);
if (waserror()) {
proc_decref(p);
nexterror();
}
pci_device_assign(pdev, p);
proc_decref(p);
poperror();
}
static void unassign_device(int bus, int dev, int func, pid_t pid)
{
ERRSTACK(1);
int tbdf = MKBUS(BusPCI, bus, dev, func);
struct pci_device *pdev = pci_match_tbdf(tbdf);
struct proc *p;
if (!pdev)
error(EIO, "cannot find dev %x:%x.%x\n", bus, dev, func);
if (!pid) {
pci_device_unassign(pdev, NULL);
return;
}
p = pid2proc(pid);
if (!p)
error(EIO, "cannot find pid %d\n", pid);
if (waserror()) {
proc_decref(p);
nexterror();
}
pci_device_unassign(pdev, p);
proc_decref(p);
poperror();
}
static struct sized_alloc *open_mappings(void)
{
	struct iommu *iommu;
	bool has_dev;
	struct pci_device *pdev;
	struct sized_alloc *sza = sized_kzmalloc(BUFFERSZ, MEM_WAIT);
	TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
		has_dev = false;
sza_printf(sza, "Mappings for iommu@%p\n", iommu);
spin_lock_irqsave(&iommu->iommu_lock);
TAILQ_FOREACH(pdev, &iommu->pci_devs, iommu_link) {
if (!pdev->proc_owner)
continue;
has_dev = true;
sza_printf(sza, "\tdevice %02x:%02x.%x, PID %u\n",
pdev->bus, pdev->dev, pdev->func,
pdev->proc_owner->pid);
}
spin_unlock_irqsave(&iommu->iommu_lock);
if (!has_dev)
sza_printf(sza, "\t<empty>\n");
}
return sza;
}
static void _open_info(struct iommu *iommu, struct sized_alloc *sza)
{
uint64_t value;
sza_printf(sza, "\niommu@%p\n", iommu);
sza_printf(sza, "\trba = %p\n", iommu->rba);
sza_printf(sza, "\tsupported = %s\n", iommu->supported ? "yes" : "no");
sza_printf(sza, "\tregspace = %p\n", iommu->regio);
sza_printf(sza, "\thost addr width (dmar) = %d\n", iommu->haw_dmar);
sza_printf(sza, "\thost addr width (cap[mgaw]) = %d\n",
iommu->haw_cap);
value = read32(iommu->regio + DMAR_VER_REG);
sza_printf(sza, "\tversion = 0x%x\n", value);
value = read64(iommu->regio + DMAR_CAP_REG);
sza_printf(sza, "\tcapabilities = %p\n", value);
sza_printf(sza, "\t\tmgaw: %d\n", cap_mgaw(value));
sza_printf(sza, "\t\tsagaw (paging level): 0x%x\n", cap_sagaw(value));
sza_printf(sza, "\t\tcaching mode: %s (%d)\n", cap_caching_mode(value) ?
"yes" : "no", cap_caching_mode(value));
sza_printf(sza, "\t\tzlr: 0x%x\n", cap_zlr(value));
sza_printf(sza, "\t\trwbf: %s\n", cap_rwbf(value) ? "required"
: "not required");
sza_printf(sza, "\t\tnum domains: %d\n", cap_ndoms(value));
sza_printf(sza, "\t\tsupports protected high-memory region: %s\n",
cap_phmr(value) ? "yes" : "no");
sza_printf(sza, "\t\tsupports Protected low-memory region: %s\n",
cap_plmr(value) ? "yes" : "no");
value = read64(iommu->regio + DMAR_ECAP_REG);
sza_printf(sza, "\text. capabilities = %p\n", value);
sza_printf(sza, "\t\tpass through: %s\n",
ecap_pass_through(value) ? "yes" : "no");
sza_printf(sza, "\t\tdevice iotlb: %s\n",
ecap_dev_iotlb_support(value) ? "yes" : "no");
sza_printf(sza, "\t\tiotlb register offset: 0x%x\n",
ecap_iotlb_offset(value));
sza_printf(sza, "\t\tsnoop control: %s\n",
ecap_sc_support(value) ? "yes" : "no");
sza_printf(sza, "\t\tcoherency: %s\n",
ecap_coherent(value) ? "yes" : "no");
sza_printf(sza, "\t\tqueue invalidation support: %s\n",
ecap_qis(value) ? "yes" : "no");
sza_printf(sza, "\t\tinterrupt remapping support: %s\n",
ecap_ir_support(value) ? "yes" : "no");
sza_printf(sza, "\t\textended interrupt mode: 0x%x\n",
ecap_eim_support(value));
value = read32(iommu->regio + DMAR_GSTS_REG);
sza_printf(sza, "\tglobal status = 0x%x\n", value);
sza_printf(sza, "\t\ttranslation: %s\n",
value & DMA_GSTS_TES ? "enabled" : "disabled");
sza_printf(sza, "\t\troot table: %s\n",
value & DMA_GSTS_RTPS ? "set" : "not set");
value = read64(iommu->regio + DMAR_RTADDR_REG);
sza_printf(sza, "\troot entry table = %p (phy) or %p (vir)\n",
value, KADDR(value));
}
static struct sized_alloc *open_info(void)
{
	struct sized_alloc *sza = sized_kzmalloc(BUFFERSZ, MEM_WAIT);
	struct iommu *iommu;
	sza_printf(sza, "driver info:\n");
	sza_printf(sza, "\tdefault did = %d\n", IOMMU_DID_DEFAULT);
sza_printf(sza, "\tstatus = %s\n",
iommu_some_is_enabled() ? "enabled" : "disabled");
TAILQ_FOREACH(iommu, &iommu_list, iommu_link) {
_open_info(iommu, sza);
}
return sza;
}
static char *devname(void)
{
return iommudevtab.name;
}
static struct chan *iommuattach(char *spec)
{
return devattach(devname(), spec);
}
static struct walkqid *iommuwalk(struct chan *c, struct chan *nc, char **name,
unsigned int nname)
{
return devwalk(c, nc, name, nname, iommudir,
ARRAY_SIZE(iommudir), devgen);
}
static size_t iommustat(struct chan *c, uint8_t *dp, size_t n)
{
return devstat(c, dp, n, iommudir, ARRAY_SIZE(iommudir), devgen);
}
static struct chan *iommuopen(struct chan *c, int omode)
{
switch (c->qid.path) {
case Qmappings:
c->synth_buf = open_mappings();
break;
case Qinfo:
c->synth_buf = open_info();
break;
case Qadddev:
case Qremovedev:
case Qdir:
default:
break;
}
return devopen(c, omode, iommudir, ARRAY_SIZE(iommudir), devgen);
}
/*
* All files are synthetic. Hence we do not need to implement any close
* function.
*/
static void iommuclose(struct chan *c)
{
switch (c->qid.path) {
case Qmappings:
case Qinfo:
kfree(c->synth_buf);
c->synth_buf = NULL;
break;
case Qadddev:
case Qremovedev:
case Qdir:
default:
break;
}
}
static size_t iommuread(struct chan *c, void *va, size_t n, off64_t offset)
{
struct sized_alloc *sza = c->synth_buf;
switch (c->qid.path) {
case Qdir:
return devdirread(c, va, n, iommudir,
ARRAY_SIZE(iommudir), devgen);
case Qadddev:
return readstr(offset, va, n,
"write format: xx:yy.z pid\n"
" xx = bus (in hex)\n"
" yy = device (in hex)\n"
" z = function (in hex)\n"
" pid = process pid\n"
"\nexample:\n"
"$ echo 00:1f.2 13 >\\#iommu/attach\n");
	case Qremovedev:
		return readstr(offset, va, n,
			       "write format: xx:yy.z pid\n"
			       " xx = bus (in hex)\n"
			       " yy = device (in hex)\n"
			       " z = function (in hex)\n"
			       " pid = process pid\n"
			       "\nexample:\n"
			       "$ echo 00:1f.2 13 >\\#iommu/detach\n");
case Qmappings:
case Qinfo:
return readstr(offset, va, n, sza->buf);
default:
error(EIO, "read: qid %d is impossible", c->qid.path);
}
return -1; /* not reached */
}
static void get_bdf_pid(struct cmdbuf *cb, int *bus, int *dev, int *func,
pid_t *pid)
{
int err;
if (cb->nf < 2)
error(EFAIL, "bb:dd.f pid");
err = sscanf(cb->f[0], "%x:%x.%x", bus, dev, func);
if (err != 3)
error(EIO,
IOMMU "error parsing bdf %s; nr parsed: %d", cb->f[0], err);
*pid = strtoul(cb->f[1], 0, 0);
}
static void write_add_dev(struct chan *c, struct cmdbuf *cb)
{
int bus, dev, func;
pid_t pid;
get_bdf_pid(cb, &bus, &dev, &func, &pid);
if (pid == 1)
error(EIO, IOMMU "device passthru not supported for pid = 1");
assign_device(bus, dev, func, pid);
}
static void write_remove_dev(struct chan *c, struct cmdbuf *cb)
{
int bus, dev, func;
pid_t pid;
get_bdf_pid(cb, &bus, &dev, &func, &pid);
unassign_device(bus, dev, func, pid);
}
static size_t iommuwrite(struct chan *c, void *va, size_t n, off64_t offset)
{
ERRSTACK(1);
struct cmdbuf *cb = parsecmd(va, n);
if (waserror()) {
kfree(cb);
nexterror();
}
switch (c->qid.path) {
case Qadddev:
if (!iommu_is_supported)
error(EROFS, IOMMU "not supported");
write_add_dev(c, cb);
break;
case Qremovedev:
if (!iommu_is_supported)
error(EROFS, IOMMU "not supported");
write_remove_dev(c, cb);
break;
case Qmappings:
case Qinfo:
case Qdir:
error(EROFS, IOMMU "cannot modify");
default:
error(EIO, "write: qid %d is impossible", c->qid.path);
}
kfree(cb);
poperror();
return n;
}
struct dev iommudevtab __devtab = {
.name = "iommu",
.reset = devreset,
.init = devinit,
.shutdown = devshutdown,
.attach = iommuattach,
.walk = iommuwalk,
.stat = iommustat,
.open = iommuopen,
.create = devcreate,
.close = iommuclose,
.read = iommuread,
.bread = devbread,
.write = iommuwrite,
.bwrite = devbwrite,
.remove = devremove,
.wstat = devwstat,
};