ioat: support device reset

This isn't a full PCI reset; it's more of a driver reset.  PCI things,
like BARs and MSI-X tables, are still maintained.  The driver undoes its
own setup, returning to where it was when the system booted (ideally).

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/drivers/dma/ioat/init.c b/kern/drivers/dma/ioat/init.c
index ef5fb28..1ab1a5f 100644
--- a/kern/drivers/dma/ioat/init.c
+++ b/kern/drivers/dma/ioat/init.c
@@ -439,17 +439,24 @@
 		irq_h = register_irq(0 /* ignored for msi(x)! */,
 				     ioat_dma_do_interrupt_msix, ioat_chan,
 				     pci_to_tbdf(pdev));
-		/* TODO: this is a mess - we also don't know if we're actually
-		 * MSIX or not!  We don't even know our vector... */
 		if (!irq_h) {
-			warn("MSIX failed (cnt %d), leaking vectors etc!", i);
+			warn("MSIX setup failed (cnt %d)!", i);
 			for (j = 0; j < i; j++) {
 				msix = &ioat_dma->msix_entries[j];
 				ioat_chan = ioat_chan_by_index(ioat_dma, j);
-				//devm_free_irq(dev, msix->vector, ioat_chan);
+				deregister_irq(msix->vector, pci_to_tbdf(pdev));
 			}
 			goto err_no_irq;
 		}
+		/* TODO: this is ugly.  Really, we need register_irq() to not
+		 * fall back on its own here.  This case is when we did get an
+		 * irq_h, but it wasn't the type we wanted, and this driver has
+		 * different IRQ handlers for different types. */
+		if (strcmp("msi-x", irq_h->type)) {
+			warn("IRQ setup didn't get an MSIX!");
+			goto err_no_irq;
+		}
+		msix->vector = irq_h->apic_vector;
 	}
 	intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
 	ioat_dma->irq_mode = IOAT_MSIX;
@@ -1459,26 +1466,6 @@
 	return 0;
 }
 
-/* In lieu of a decent PCI processing system... */
-static void __init ioat_init(void)
-{
-	struct pci_device *p;
-
-	STAILQ_FOREACH(p, &pci_devices, all_dev) {
-		if (p->ven_id != PCI_VENDOR_ID_INTEL)
-			continue;
-		for (int i = 0; ioat_pci_tbl[i].device; i++) {
-			if (p->dev_id == ioat_pci_tbl[i].device) {
-				ioat_pci_probe(p, &ioat_pci_tbl[i]);
-				break;
-			}
-		}
-	}
-}
-/* The 'arch_initcall' setup functions run at level 2. */
-init_func_3(ioat_init);
-
-#if 0 // AKAROS
 static void ioat_remove(struct pci_device *pdev)
 {
 	struct ioatdma_device *device = pci_get_drvdata(pdev);
@@ -1488,15 +1475,133 @@
 
 	dev_err(&pdev->linux_dev, "Removing dma and dca services\n");
 	if (device->dca) {
+#if 0 // AKAROS
 		unregister_dca_provider(device->dca, &pdev->linux_dev);
 		free_dca_provider(device->dca);
+#else
+		warn("Unexpected dca on PCI %x:%x.%x", pdev->bus, pdev->dev,
+		     pdev->func);
+#endif
 		device->dca = NULL;
 	}
 
 	pci_disable_pcie_error_reporting(pdev);
 	ioat_dma_remove(device);
 }
-#endif
+
+/* TODO (DEVM): Akaros doesn't do the 'managed' part of devm_kzalloc and
+ * friends.  This helper cleans up the things I noticed were allocated in
+ * this manner.  The list was made manually, so YMMV.
+ *
+ * Note that dmaengine.c has a dmam_device_release set up that calls
+ * dma_async_device_unregister, but this driver doesn't use the 'managed'
+ * dmaenginem_async_device_register(). */
+static void devm_cleanup(struct pci_device *pdev)
+{
+	struct ioatdma_device *ioat_dma = pci_get_drvdata(pdev);
+	struct ioatdma_chan *ioat_chan;
+
+	if (!ioat_dma)
+		return;
+	pci_set_drvdata(pdev, NULL);
+	for (int i = 0; i < IOAT_MAX_CHANS; i++) {
+		ioat_chan = ioat_dma->idx[i];
+		if (!ioat_chan)
+			continue;
+		kfree(ioat_chan);
+	}
+	kfree(ioat_dma);
+}
+
+/* TODO (DEVM): Akaros doesn't do any of the 'managed' pci/dev stuff, so we'll
+ * have to free things if probe fails. */
+static int ioat_pci_probe_wrapper(struct pci_device *pdev,
+				  const struct pci_device_id *id)
+{
+	int ret;
+
+	ret = ioat_pci_probe(pdev, id);
+	if (ret < 0) {
+		devm_cleanup(pdev);
+		/* Might be a bug in the linux driver, but there are error paths
+		 * that happen after BME is set. */
+		pci_clr_bus_master(pdev);
+	}
+	return ret;
+}
+
+/* In lieu of a decent PCI processing system... */
+static bool ioat_pci_init(struct pci_device *pdev)
+{
+	const struct pci_device_id *pci_id;
+
+	pci_id = srch_linux_pci_tbl(ioat_pci_tbl, pdev);
+	if (!pci_id)
+		return false;
+	if (ioat_pci_probe_wrapper(pdev, pci_id) < 0)
+		return false;
+	return true;
+}
+
+/* We have support to stop individual IRQs, but the device is still somewhat
+ * initialized from a PCI perspective.  It's not torn down completely:
+ *
+ * We do:
+ * - Turn off and free specific MSI-X vectors.
+ * - Deregister and free the IRQ handler
+ * - Clear bus master enabled
+ * We do not:
+ * - Tear down pci_msi stuff, which is managed by the PCI layer.  Like the msix
+ *   table, or the msix_ready flag
+ * - Tear down the BAR mmio mappings.  Those are managed by the PCI layer.
+ */
+static bool ioat_pci_reset(struct pci_device *pdev)
+{
+	struct ioatdma_device *ioat_dma = pci_get_drvdata(pdev);
+	int msixcnt = ioat_dma->dma_dev.chancnt;
+	struct msix_entry *msix;
+
+	ioat_shutdown(pdev);
+	ioat_remove(pdev);
+
+	/* Assuming MSIX, which is enforced elsewhere.
+	 *
+	 * In Linux, devm resources are freed in reverse order, so the IRQs are
+	 * freed before the channels are freed.
+	 *
+	 * I'm a little reluctant to do this in devm_cleanup, since probe
+	 * failures clean up their own IRQs already.  (Or at least warn if they
+	 * need to.) */
+	for (int i = 0; i < msixcnt; i++) {
+		msix = &ioat_dma->msix_entries[i];
+		deregister_irq(msix->vector, pci_to_tbdf(pdev));
+	}
+	devm_cleanup(pdev);
+	pci_clr_bus_master(pdev);
+	return true;
+}
+
+static struct pci_ops ioat_pci_ops = {
+	.driver_name	= "ioat",
+	.init		= ioat_pci_init,
+	.reset		= ioat_pci_reset,
+};
+
+static void __init ioat_init(void)
+{
+	struct pci_device *p;
+	const struct pci_device_id *pci_id;
+
+	STAILQ_FOREACH(p, &pci_devices, all_dev) {
+		if (p->ven_id != PCI_VENDOR_ID_INTEL)
+			continue;
+		if (ioat_pci_init(p))
+			pci_set_ops(p, &ioat_pci_ops,
+				    DEV_STATE_ASSIGNED_KERNEL);
+	}
+}
+/* The 'arch_initcall' setup functions run at level 2. */
+init_func_3(ioat_init);
 
 static int __init ioat_init_module(void)
 {