/* Copyright (c) 2019-2020 Google Inc
* Aditya Basu <mitthu@google.com>
* Barret Rhoden <brho@google.com>
* See LICENSE for details.
*
* Useful resources:
* - Intel Xeon E7 2800/4800/8800 Datasheet Vol. 2
* - Purley Programmer's Guide
*
* Acronyms:
* - IOAT: (Intel) I/O Acceleration Technology
* - CDMA: Crystal Beach DMA
*
* TODO
* - Consider something lighter-weight than the qlock for ensuring the device
* doesn't get detached during operation. kref, perhaps. There's also an
* element of "stop new people from coming in", like we do with closing FDs.
* There's also stuff that the dmaengine does in linux. See dma_chan_get().
 * - Freeze or handle faults with VA->PA page mappings until the DMA is
 *   completed.  Right now, we could get IOMMU faults, which was the purpose of
 *   this whole thing.
 *   - The dmaengine has helpers for some of this.  dma_set_unmap() is an
 *     "unmap all these things when you're done" approach, called by __cleanup
 *     -> dma_descriptor_unmap().  The unmap struct is basically a todo list.
* - There's a lot of stuff we could do with the DMA engine to reduce the
* amount of device touches, contention, and other inefficiencies.
* issue_dma() is a minimalist one. No batching, etc. And with the pdev
* qlock, we have only a single request per PCI device, though there may be
* numerous channels.
*/
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <net/ip.h>
#include <linux_compat.h>
#include <arch/pci.h>
#include <page_alloc.h>
#include <pmap.h>
#include <arch/pci_regs.h>
#include <linux/dmaengine.h>
/* QID Path */
enum {
Qdir = 0,
Qcbdmaktest = 1,
Qcbdmaucopy = 2,
};
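/* Reading 'ktest' runs a kernel-driven DMA self-test; writing a struct ucbdma
 * to 'ucopy' issues a DMA on behalf of the device's owning process. */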
static struct dirtab cbdmadir[] = {
{".", {Qdir, 0, QTDIR}, 0, 0555},
{"ktest", {Qcbdmaktest, 0, QTFILE}, 0, 0555},
{"ucopy", {Qcbdmaucopy, 0, QTFILE}, 0, 0755},
};
/* TODO: this is a device/kernel ABI. ucbdma.c has a copy. It's probably not
* worth putting in its own header, since this is really cheap test code. */
struct ucbdma {
uint64_t dst_addr;
uint64_t src_addr;
uint32_t xfer_size;
char bdf_str[10];
} __attribute__((packed));
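/* A minimal userspace sketch of the ucopy ABI (assuming the device shows up
 * as #cbdma and that src/dst are addresses the IOMMU can resolve for the
 * owning process; names and helpers here are hypothetical):
 *
 *	struct ucbdma u = { .dst_addr = dst_dva, .src_addr = src_dva,
 *	                    .xfer_size = len };
 *	int fd;
 *
 *	strlcpy(u.bdf_str, "00:04.3", sizeof(u.bdf_str));
 *	fd = open("#cbdma/ucopy", O_WRITE);
 *	write(fd, &u, sizeof(u));	// must be exactly sizeof(struct ucbdma)
 */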
#define KTEST_SIZE 64
static struct {
char src[KTEST_SIZE];
char dst[KTEST_SIZE];
char srcfill;
char dstfill;
} ktest = {.srcfill = '0', .dstfill = 'X'};
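/* The chan's struct device is embedded in a struct pci_device as linux_dev, so
 * we can get back to the PCI device with container_of(). */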
static inline struct pci_device *dma_chan_to_pci_dev(struct dma_chan *dc)
{
return container_of(dc->device->dev, struct pci_device, linux_dev);
}
/* Filter function for finding a particular PCI device. If
* __dma_request_channel() asks for a particular device, we'll only give it that
* chan. If you don't care, pass NULL, and you'll get any free chan. */
static bool filter_pci_dev(struct dma_chan *dc, void *arg)
{
struct pci_device *pdev = dma_chan_to_pci_dev(dc);
if (arg)
return arg == pdev;
return true;
}
/* Addresses are device-physical. Caller holds the pdev qlock. */
static void issue_dma(struct pci_device *pdev, physaddr_t dst, physaddr_t src,
size_t len, bool async)
{
ERRSTACK(1);
struct dma_chan *dc;
dma_cap_mask_t mask;
struct dma_async_tx_descriptor *tx;
int flags;
struct completion cmp;
unsigned long tmo;
dma_cookie_t cookie;
/* dmaengine_get works for the non-DMA_PRIVATE devices. A lot
* of devices turn on DMA_PRIVATE, in which case they won't be in the
* general pool available to the dmaengine. Instead, we directly
* request DMA channels - particularly since we want specific devices to
* use with the IOMMU. */
dma_cap_zero(mask);
dma_cap_set(DMA_MEMCPY, mask);
dc = __dma_request_channel(&mask, filter_pci_dev, pdev);
if (!dc)
error(EFAIL, "Couldn't get a DMA channel");
if (waserror()) {
dma_release_channel(dc);
nexterror();
}
flags = 0;
if (async)
flags |= DMA_PREP_INTERRUPT;
if (!is_dma_copy_aligned(dc->device, dst, src, len))
error(EINVAL, "Bad copy alignment: %p %p %lu", dst, src, len);
tx = dmaengine_prep_dma_memcpy(dc, dst, src, len, flags);
	if (!tx)
		error(EFAIL, "Couldn't prep the memcpy!");
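	/* For the async path, have the engine's completion callback complete()
	 * our local struct completion, which we block on below. */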
if (async) {
async_tx_ack(tx);
init_completion(&cmp);
tx->callback = (dma_async_tx_callback)complete;
tx->callback_param = &cmp;
}
cookie = dmaengine_submit(tx);
if (cookie < 0)
error(EFAIL, "Failed to submit the DMA...");
/* You can poke this. dma_sync_wait() also calls this. */
dma_async_issue_pending(dc);
if (async) {
/* Giant warning: the polling methods, like
* dmaengine_tx_status(), might actually trigger the
* tx->callback. At least the IOAT driver does this. */
tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
if (tmo == 0 || dmaengine_tx_status(dc, cookie, NULL)
!= DMA_COMPLETE) {
error(ETIMEDOUT, "timeout or related spurious failure");
}
} else {
dma_wait_for_async_tx(tx);
}
dma_release_channel(dc);
poperror();
}
static void issue_dma_ucbdma(struct ucbdma *u)
{
ERRSTACK(1);
struct pci_device *pdev;
pdev = pci_match_string(u->bdf_str);
if (!pdev)
error(ENODEV, "No device %s", u->bdf_str);
/* The qlock prevents unassignment from happening during an operation.
* If that happened, the driver's reset method would be called while the
* op is ongoing. The driver might be able to handle that. Though when
* the iommu mappings are destroyed, the driver is likely to get wedged.
*
* A kref or something else might work better here, to allow multiple
* DMAs at a time. */
qlock(&pdev->qlock);
if (waserror()) {
qunlock(&pdev->qlock);
nexterror();
}
if (pdev->proc_owner != current)
error(EINVAL, "wrong proc_owner");
issue_dma(pdev, u->dst_addr, u->src_addr, u->xfer_size, true);
qunlock(&pdev->qlock);
poperror();
}
/* Runs a basic test from within the kernel on 0:4.3.
*
* One option would be to have write() set the sza buffer. It won't be static
* through the chan's lifetime (so you'd need to deal with syncing), but it'd
* let you set things. Another would be to have another chan/file for the BDF
* (and you'd sync on that). */
static struct sized_alloc *open_ktest(void)
{
ERRSTACK(2);
struct pci_device *pdev = pci_match_tbdf(MKBUS(0, 0, 4, 3));
struct sized_alloc *sza;
physaddr_t dst, src; /* device addrs */
char *dst_d, *src_d; /* driver addrs */
uintptr_t prev;
if (!pdev)
error(EINVAL, "no 00:04.3");
qlock(&pdev->qlock);
	/* We need to get into the address space of the device's owning
	 * process, which might be NULL if the device is the kernel's or is
	 * unassigned. */
prev = switch_to(pdev->proc_owner);
if (waserror()) {
switch_back(pdev->proc_owner, prev);
qunlock(&pdev->qlock);
nexterror();
}
if (pdev->state != DEV_STATE_ASSIGNED_KERNEL &&
pdev->state != DEV_STATE_ASSIGNED_USER)
error(EINVAL, "00:04.3 is unassigned (%d)", pdev->state);
dst_d = dma_alloc_coherent(&pdev->linux_dev, KTEST_SIZE, &dst,
MEM_WAIT);
src_d = dma_alloc_coherent(&pdev->linux_dev, KTEST_SIZE, &src,
MEM_WAIT);
if (waserror()) {
dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, dst_d, dst);
dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, src_d, src);
nexterror();
}
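	/* Bump the fill char each run so repeated reads visibly differ. */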
ktest.srcfill += 1;
	/* initialize the src and dst buffers */
memset(src_d, ktest.srcfill, KTEST_SIZE);
memset(dst_d, ktest.dstfill, KTEST_SIZE);
src_d[KTEST_SIZE-1] = '\0';
dst_d[KTEST_SIZE-1] = '\0';
issue_dma(pdev, dst, src, KTEST_SIZE, true);
sza = sized_kzmalloc(1024, MEM_WAIT);
sza_printf(sza, "\tCopy Size: %d (0x%x)\n", KTEST_SIZE, KTEST_SIZE);
sza_printf(sza, "\tsrcfill: %c (0x%x)\n", ktest.srcfill, ktest.srcfill);
sza_printf(sza, "\tdstfill: %c (0x%x)\n", ktest.dstfill, ktest.dstfill);
	/* %s on a user pointer causes a printfmt warning; also stop at 20
	 * chars for sanity. */
sza_printf(sza, "\tsrc_str (after copy): ");
for (int i = 0; i < 20; i++)
sza_printf(sza, "%c", src_d[i]);
sza_printf(sza, "\n");
sza_printf(sza, "\tdst_str (after copy): ");
for (int i = 0; i < 20; i++)
sza_printf(sza, "%c", dst_d[i]);
sza_printf(sza, "\n");
dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, dst_d, dst);
dma_free_coherent(&pdev->linux_dev, KTEST_SIZE, src_d, src);
poperror();
switch_back(pdev->proc_owner, prev);
qunlock(&pdev->qlock);
poperror();
return sza;
}
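/* Forward declaration so devname() can read the name from the devtab, which is
 * defined at the bottom of the file. */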
struct dev cbdmadevtab;
static char *devname(void)
{
return cbdmadevtab.name;
}
static struct chan *cbdmaattach(char *spec)
{
return devattach(devname(), spec);
}
struct walkqid *cbdmawalk(struct chan *c, struct chan *nc, char **name,
unsigned int nname)
{
return devwalk(c, nc, name, nname, cbdmadir,
ARRAY_SIZE(cbdmadir), devgen);
}
static size_t cbdmastat(struct chan *c, uint8_t *dp, size_t n)
{
return devstat(c, dp, n, cbdmadir, ARRAY_SIZE(cbdmadir), devgen);
}
static struct chan *cbdmaopen(struct chan *c, int omode)
{
switch (c->qid.path) {
case Qcbdmaktest:
c->synth_buf = open_ktest();
break;
case Qdir:
case Qcbdmaucopy:
break;
default:
error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
}
return devopen(c, omode, cbdmadir, ARRAY_SIZE(cbdmadir), devgen);
}
static void cbdmaclose(struct chan *c)
{
switch (c->qid.path) {
case Qcbdmaktest:
kfree(c->synth_buf);
c->synth_buf = NULL;
break;
case Qdir:
case Qcbdmaucopy:
break;
default:
error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
}
}
static size_t cbdmaread(struct chan *c, void *va, size_t n, off64_t offset)
{
struct sized_alloc *sza = c->synth_buf;
switch (c->qid.path) {
case Qcbdmaktest:
return readstr(offset, va, n, sza->buf);
case Qcbdmaucopy:
return readstr(offset, va, n,
"Write address of struct ucopy to issue DMA\n");
case Qdir:
return devdirread(c, va, n, cbdmadir, ARRAY_SIZE(cbdmadir),
devgen);
default:
error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
}
return -1; /* not reached */
}
static size_t cbdmawrite(struct chan *c, void *va, size_t n, off64_t offset)
{
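	/* Stack copy of the user's struct; the one-element array is just a
	 * convenient way to get a pointer. */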
struct ucbdma ucbdma[1];
switch (c->qid.path) {
case Qdir:
error(EPERM, "writing not permitted");
case Qcbdmaktest:
error(EPERM, ERROR_FIXME);
case Qcbdmaucopy:
		if (n != sizeof(struct ucbdma))
			error(EINVAL, "Bad ucbdma size %lu (%lu)", n,
			      sizeof(struct ucbdma));
if (copy_from_user(ucbdma, va, sizeof(struct ucbdma)))
error(EINVAL, "Bad ucbdma pointer");
issue_dma_ucbdma(ucbdma);
return n;
default:
error(EIO, "cbdma: qid 0x%x is impossible", c->qid.path);
}
return -1; /* not reached */
}
struct dev cbdmadevtab __devtab = {
.name = "cbdma",
.reset = devreset,
.init = devinit,
.shutdown = devshutdown,
.attach = cbdmaattach,
.walk = cbdmawalk,
.stat = cbdmastat,
.open = cbdmaopen,
.create = devcreate,
.close = cbdmaclose,
.read = cbdmaread,
.bread = devbread,
.write = cbdmawrite,
.bwrite = devbwrite,
.remove = devremove,
.wstat = devwstat,
};