/*
 * Copyright (c) 2016 Google Inc
 * Author: Kanoj Sarcar <kanoj@google.com>
 * See LICENSE for details.
 */

#include <err.h>
#include <kmalloc.h>
#include <kref.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <smp.h>
#include <linux/rdma/ib_user_verbs.h>
#include "uverbs.h"
#include <ros/procinfo.h>

static unsigned long pgprot_val(int vmprot)
{
	unsigned long prot = PTE_P | PTE_U | PTE_A;

	if (vmprot & PROT_WRITE)
		prot |= PTE_W | PTE_D;
	return prot;
}

unsigned long pgprot_noncached(int vmprot)
{
	return pgprot_val(vmprot) | PTE_NOCACHE;
}

unsigned long pgprot_writecombine(int vmprot)
{
	return pgprot_val(vmprot) | PTE_WRITECOMB;
}

/*
 * Our version knocked off from kern/src/mm.c version + uncaching logic from
 * vmap_pmem_nocache(). This routine is expected to be invoked as part of
 * mmap() handler.
 */
int map_upage_at_addr(struct proc *p, physaddr_t paddr, uintptr_t addr,
		      int pteprot, int dolock)
{
	pte_t pte;
	int rv = -1;
	struct page *pp;

	/* __vmr_free_pgs() assumes mapped pte is backed by "struct page" */
	if (paddr > max_paddr) {
		printk("[akaros]: map_upage_at_addr(): paddr=0x%llx "
		       "max_paddr=0x%llx\n", paddr, max_paddr);
		return -1;
	}

	pp = pa2page(paddr);

	/* __vmr_free_pgs() refcnt's pagemap pages differently */
	if (atomic_read(&pp->pg_flags) & PG_PAGEMAP) {
		printk("[akaros]: map_upage_at_addr(): mapPA=0x%llx\n", paddr);
		return -1;
	}

	spin_lock(&p->pte_lock);

	/*
	 * Free any existing page backing uva, drop in this page, and
	 * acquire refcnt on page on behalf of user. Note though that we
	 * do not expect an existing page, since we are invoked in mmap
	 * path (page_insert() does not handle PG_PAGEMAP refcnt's).
	 */
	rv = page_insert(p->env_pgdir, pp, (void *)addr, pteprot);
	spin_unlock(&p->pte_lock);
	return rv;
}
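
/*
 * Usage sketch (hypothetical mmap-handler caller; the names and the source of
 * the physical address below are illustrative, not an existing Akaros API):
 *
 *	physaddr_t bar_pa = ...;	// e.g. a device BAR or DMA page
 *
 *	if (map_upage_at_addr(p, bar_pa, uva,
 *			      pgprot_noncached(PROT_READ | PROT_WRITE), 0))
 *		return -1;
 */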

void set_page_dirty_lock(struct page *pagep)
{
	atomic_or(&pagep->pg_flags, PG_DIRTY);
}

void put_page(struct page *pagep)
{
	if (atomic_read(&pagep->pg_flags) & PG_PAGEMAP)
		printk("[akaros]: put_page() on pagemap page!!!\n");
	page_decref(pagep);
}

int get_user_page(struct proc *p, unsigned long uvastart, int write, int force,
		  struct page **plist)
{
	pte_t pte;
	int ret = -1;
	struct page *pp;

	spin_lock(&p->pte_lock);
	pte = pgdir_walk(p->env_pgdir, (void *)uvastart, TRUE);
	if (!pte_walk_okay(pte))
		goto err1;
	if (!pte_is_present(pte)) {
		unsigned long prot = PTE_P | PTE_U | PTE_A | PTE_W | PTE_D;
#if 0
		printk("[akaros]: get_user_page() uva=0x%llx pte absent\n",
		       uvastart);
#endif
		/*
		 * TODO: ok to allocate with pte_lock? "prot" needs to be
		 * based on VMR writability, refer to pgprot_noncached().
		 */
		if (upage_alloc(p, &pp, 0))
			goto err1;
		pte_write(pte, page2pa(pp), prot);
	} else {
		pp = pa2page(pte_get_paddr(pte));

		/* __vmr_free_pgs() refcnt's pagemap pages differently */
		if (atomic_read(&pp->pg_flags) & PG_PAGEMAP) {
			printk("[akaros]: get_user_page(): uva=0x%llx\n",
			       uvastart);
			goto err1;
		}
	}
	if (write && (!pte_has_perm_urw(pte))) {
		/* TODO: How is Linux using the "force" parameter */
		printk("[akaros]: get_user_page() uva=0x%llx pte ro\n",
		       uvastart);
		goto err1;
	}

	/* TODO (GUP): change the interface such that devices provide the memory
	 * and the user mmaps it, instead of trying to pin arbitrary user
	 * memory. */
	warn_once("Extremely unsafe, unpinned memory mapped! If your process dies, you might scribble on RAM!");

	plist[0] = pp;
	ret = 1;
err1:
	spin_unlock(&p->pte_lock);
	return ret;
}

int sg_alloc_table(struct sg_table *ptr, unsigned int npages, gfp_t mask)
{
	ptr->sgl = kmalloc(sizeof(struct scatterlist) * npages, mask);
	if (!ptr->sgl)
		return -ENOMEM;
	ptr->nents = ptr->orig_nents = npages;
	sg_init_table(ptr->sgl, npages);
	return 0;
}

void sg_free_table(struct sg_table *ptr)
{
	kfree(ptr->sgl);
}

void idr_remove(struct idr *idp, int id)
{
	BUG_ON((id < 0) || (id >= MAXITEMS));
	idp->values[id] = NULL;
}

void *idr_find(struct idr *idp, int id)
{
	BUG_ON((id < 0) || (id >= MAXITEMS));
	BUG_ON(idp->values[id] == NULL);
	return idp->values[id];
}

int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask)
{
	int i;

	/* We use values[] == NULL as an indicator that slot is free */
	BUG_ON(ptr == NULL);

	spin_lock_irqsave(&idp->lock);
	for (i = 0; i < MAXITEMS; i++) {
		if (idp->values[i] == NULL) {
			idp->values[i] = ptr;
			goto done;
		}
	}
	i = -1;	/* error return */
done:
	spin_unlock_irqsave(&idp->lock);
	return i;
}
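
/*
 * Usage sketch for this simplified idr (hypothetical caller, names are
 * illustrative only):
 *
 *	struct idr foo_idr;
 *	int id = idr_alloc(&foo_idr, obj, 0, 0, 0);
 *
 *	if (id >= 0)
 *		obj = idr_find(&foo_idr, id);
 *	idr_remove(&foo_idr, id);
 *
 * The start/end/gfp_mask arguments are accepted for Linux API compatibility
 * but ignored by this fixed-size (MAXITEMS) implementation.
 */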

/* START: Linux /sys support for lib/apps */

/* Callers must pass in null terminated strings */
static ssize_t sysfs_read(char __user *buf, size_t ucount, loff_t *pos,
			  char *src)
{
	int slen = strlen(src) + 1;	/* + 1 for terminating null */
	unsigned long off = *pos, nb;

	if (off >= slen)
		return 0;
	nb = slen - off;
	/* Do not copy more than the caller asked for */
	if (nb > ucount)
		nb = ucount;
	if (copy_to_user(buf, (src + off), nb))
		return -EFAULT;
	*pos += nb;
	return nb;
}

static ssize_t ib_api_ver_read(struct file *filp, char __user *buf,
			       size_t count, loff_t *pos)
{
	char src[4] = { 0, 0, 0, 0 };

	src[0] = '0' + IB_USER_VERBS_ABI_VERSION;
	return sysfs_read(buf, count, pos, src);
}

#if 0 // AKAROS_PORT (9ns hooks)
static const struct file_operations ib_api_ver = {
	.read		= ib_api_ver_read,
	.open		= kfs_open,
	.release	= kfs_release,
};
#endif

static ssize_t mlx4_mgm_read(struct file *filp, char __user *buf, size_t count,
			     loff_t *pos)
{
#if CONFIG_MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE == -1
	char src[4] = { '-', '1', 0, 0 };
#else
	char src[4] = { '1', '0', 0, 0 };
#endif

	return sysfs_read(buf, count, pos, src);
}

#if 0 // AKAROS_PORT
static const struct file_operations mlx4_mgm = {
	.read		= mlx4_mgm_read,
	.open		= kfs_open,
	.release	= kfs_release,
};
#endif

#if 0
static void stradd(char *dest, int val, int num)
{
	int tval = val, i = 0, fac = 1;

	while (tval) {
		tval /= 10;
		fac *= 10;
		i++;
	}
	fac /= 10;
	tval = val;
	while (tval && num) {
		int dig = tval / fac;

		*dest++ = dig + '0';
		tval -= (dig * fac);
		fac /= 10;
		num--;
	}
}

static ssize_t cpu_read(struct file *filp, char __user *buf, size_t count,
			loff_t *pos)
{
	char cpu_info_str[128];
	long freq = __proc_global_info.tsc_freq, idx;

	strncpy(cpu_info_str, "cpu MHz\t\t: ", 16);
	idx = strlen(cpu_info_str);
	stradd(cpu_info_str + idx, freq / 1000000, 4);
	idx += 4;
	strncpy(cpu_info_str + idx, ".", 1);
	idx++;
	stradd(cpu_info_str + idx, freq % 1000000, 3);
	idx += 3;
	cpu_info_str[idx] = 0;
	return sysfs_read(buf, count, pos, cpu_info_str);
}

static const struct file_operations cpuinfo = {
	.read		= cpu_read,
	.open		= kfs_open,
	.release	= kfs_release,
};
#endif

void sysfs_init(void)
{
#if 1 // AKAROS_PORT
	warn("mlx4: udrvr stuff requires various files, implement for 9ns!");
#else
	do_mkdir("/dev_vfs/infiniband", S_IRWXU | S_IRWXG | S_IRWXO);
	do_mkdir("/sys", S_IRWXU | S_IRWXG | S_IRWXO);
	do_mkdir("/sys/class", S_IRWXU | S_IRWXG | S_IRWXO);
	do_mkdir("/sys/class/infiniband_verbs", S_IRWXU | S_IRWXG | S_IRWXO);
	do_mkdir("/sys/class/infiniband", S_IRWXU | S_IRWXG | S_IRWXO);
	make_device("/sys/class/infiniband_verbs/abi_version",
		    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
		    __S_IFCHR, (struct file_operations *)&ib_api_ver);
	do_mkdir("/sys/module", S_IRWXU | S_IRWXG | S_IRWXO);
	do_mkdir("/sys/module/mlx4_core", S_IRWXU | S_IRWXG | S_IRWXO);
	do_mkdir("/sys/module/mlx4_core/parameters", S_IRWXU | S_IRWXG |
		 S_IRWXO);
	make_device("/sys/module/mlx4_core/parameters/log_num_mgm_entry_size",
		    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
		    __S_IFCHR, (struct file_operations *)&mlx4_mgm);

#if 0
	/* Do this thru init scripts */
	do_mkdir("/proc", S_IRWXU | S_IRWXG | S_IRWXO);
	make_device("/proc/cpuinfo", S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR |
		    S_IRGRP | S_IROTH, __S_IFCHR,
		    (struct file_operations *)&cpuinfo);
#endif
#endif
}

static ssize_t dver_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct ib_uverbs_device *uvp;
	char src[4] = { 0, 0, 0, 0 };

	uvp = (struct ib_uverbs_device *)get_fs_info(filp);
	src[0] = '0' + uvp->ib_dev->uverbs_abi_ver;
	return sysfs_read(buf, count, pos, src);
}

static ssize_t dname_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct ib_uverbs_device *uvp;

	uvp = (struct ib_uverbs_device *)get_fs_info(filp);
	return sysfs_read(buf, count, pos, uvp->ib_dev->name);
}

static ssize_t ntype_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	char src[] = "1";	/* IB node type 1 == channel adapter (CA) */

	return sysfs_read(buf, count, pos, src);
}

static ssize_t ddev_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	char src[] = "0x1003";	/* PCI device id: Mellanox ConnectX-3 */

	return sysfs_read(buf, count, pos, src);
}

static ssize_t dven_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	char src[] = "0x15b3";	/* PCI vendor id: Mellanox */

	return sysfs_read(buf, count, pos, src);
}

static ssize_t vsd_read(struct file *filp, char __user *buf, size_t count,
			loff_t *pos)
{
	char *src = "puma20_A1-10.2.3.0";

	return sysfs_read(buf, count, pos, src);
}

#if 0 // AKAROS_PORT
static const struct file_operations dver_fops = {
	.read		= dver_read,
	.open		= kfs_open,
	.release	= kfs_release,
};

static const struct file_operations dname_fops = {
	.read		= dname_read,
	.open		= kfs_open,
	.release	= kfs_release,
};

static const struct file_operations ddev_fops = {
	.read		= ddev_read,
	.open		= kfs_open,
	.release	= kfs_release,
};

static const struct file_operations dven_fops = {
	.read		= dven_read,
	.open		= kfs_open,
	.release	= kfs_release,
};

static const struct file_operations ntype_fops = {
	.read		= ntype_read,
	.open		= kfs_open,
	.release	= kfs_release,
};

static const struct file_operations vsd_fops = {
	.read		= vsd_read,
	.open		= kfs_open,
	.release	= kfs_release,
};
#endif

void sysfs_create(int devnum, const struct file_operations *verb_fops,
		  void *ptr)
{
#if 1 // AKAROS_PORT
	warn("mlx4: udrvr stuff requires various files, implement for 9ns!");
#else
	char sysname[256] = "/sys/class/infiniband_verbs/uverbs0";
	char devname[] = "/dev_vfs/infiniband/uverbs0";
	char drvname[64] = "/sys/class/infiniband/";
	int sysnameidx = strlen(sysname), drvidx;
	struct file *fp;
	struct ib_uverbs_device *uvp = (struct ib_uverbs_device *)ptr;

	/* Create correct name */
	if (devnum > 9)
		panic("Too many devs");
	devname[strlen(devname) - 1] = '0' + devnum;
	sysname[sysnameidx - 1] = '0' + devnum;

	/* Foll fops need to come from caller */
	fp = make_device(devname,
			 S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
			 __S_IFCHR, (struct file_operations *)verb_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband/mlx4_0 */
	strncpy((drvname + strlen(drvname)), uvp->ib_dev->name, 12);
	do_mkdir(drvname, S_IRWXU | S_IRWXG | S_IRWXO);
	drvidx = strlen(drvname);

	/* /sys/class/infiniband/mlx4_0/node_type */
	strncpy(drvname + drvidx, "/node_type", 11);
	make_device(drvname,
		    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
		    __S_IFCHR, (struct file_operations *)&ntype_fops);

	/* /sys/class/infiniband/mlx4_0/vsd */
	strncpy(drvname + drvidx, "/vsd", 5);
	fp = make_device(drvname,
			 S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
			 __S_IFCHR, (struct file_operations *)&vsd_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0 */
	do_mkdir(sysname, S_IRWXU | S_IRWXG | S_IRWXO);

	/* /sys/class/infiniband_verbs/uverbs0/device */
	strncpy(sysname + sysnameidx, "/device", 16);
	do_mkdir(sysname, S_IRWXU | S_IRWXG | S_IRWXO);

	/* /sys/class/infiniband_verbs/uverbs0/device/device */
	strncpy(sysname + sysnameidx, "/device/device", 16);
	fp = make_device(sysname,
			 S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
			 __S_IFCHR, (struct file_operations *)&ddev_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0/device/vendor */
	strncpy(sysname + sysnameidx, "/device/vendor", 16);
	fp = make_device(sysname,
			 S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
			 __S_IFCHR, (struct file_operations *)&dven_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0/ibdev */
	strncpy(sysname + sysnameidx, "/ibdev", 16);
	fp = make_device(sysname,
			 S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
			 __S_IFCHR, (struct file_operations *)&dname_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0/abi_version */
	strncpy(sysname + sysnameidx, "/abi_version", 16);
	fp = make_device(sysname,
			 S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
			 __S_IFCHR, (struct file_operations *)&dver_fops);
	set_fs_info(fp, ptr);
#endif
}

/* END: Linux /sys support for lib/apps */

/* START: Support older version of libibverbs */

/* in_words and provider_in_words are in terms of 4-byte words, not 8-byte */
struct ib_uverbs_ex_cmd_hdr_compat {
	__u16 provider_in_words;
	__u16 provider_out_words;
	__u32 cmd_hdr_reserved;
	__u32 comp_mask;
	/* __u32 dummy; */
	__u64 response;
	__u32 qp_handle;
};
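
/*
 * Layout note: assuming the compiler naturally aligns the __u64 'response'
 * field (the 4-byte pad that the commented-out 'dummy' stands for), an old-ABI
 * extended command written by libibverbs looks like this, with byte offsets
 * from the start of the write() buffer:
 *
 *	 0: struct ib_uverbs_cmd_hdr (8 bytes)
 *	 8: provider_in_words / provider_out_words (4)
 *	12: cmd_hdr_reserved (4)
 *	16: comp_mask (4), then 4 bytes of padding
 *	24: response (8)
 *	32: qp_handle (4)
 *	36: command-specific payload (e.g. flow_attr for create_flow)
 *
 * These offsets are what the buf+36/buf+56 arithmetic in compat_ex() and the
 * tmpbuf[3] (byte 24) accesses in compat_query() below rely on.
 */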

static ssize_t compat_ex(struct ib_uverbs_file *file, size_t count,
			 const char __user *buf)
{
	struct ib_uverbs_cmd_hdr hdr;
	struct ib_uverbs_ex_cmd_hdr_compat ex_hdr;
	struct ib_udata ucore;
	struct ib_udata uhw;
	__u32 command;
	int err;
	unsigned long tmpbuf[16];
	struct ib_uverbs_create_flow *ptr;

	if (copy_from_user(&hdr, buf, sizeof hdr))
		return -EFAULT;

	command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
	/* Old libibverbs command numbers are offset by 2 from the current
	 * EX command values (52/53 -> create/destroy flow). */
	command -= 2;
	if (command == IB_USER_VERBS_EX_CMD_DESTROY_FLOW) {
		INIT_UDATA_BUF_OR_NULL(&ucore, buf + 8, 0, 8, 0);
		err = ib_uverbs_ex_destroy_flow(file, &ucore, &uhw);
		goto next;
	}

	/*
	 * "struct ibv_create_flow" is 56 bytes, "struct ibv_kern_spec" is
	 * 48 bytes, so at a minimum we expect 56 + (n x 48), n >= 1.
	 */
	if (count < 104)
		return -EINVAL;

	if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
		return -EFAULT;

	if ((hdr.in_words + ex_hdr.provider_in_words) * 4 != count)
		return -EINVAL;

	if (ex_hdr.cmd_hdr_reserved)
		return -EINVAL;

	if (ex_hdr.comp_mask)
		return -EINVAL;

	if (ex_hdr.response) {
		if (!hdr.out_words && !ex_hdr.provider_out_words)
			return -EINVAL;
		if (!access_ok(VERIFY_WRITE,
			       (void __user *) (unsigned long) ex_hdr.response,
			       (hdr.out_words + ex_hdr.provider_out_words) * 4))
			return -EFAULT;
	} else {
		if (hdr.out_words || ex_hdr.provider_out_words)
			return -EINVAL;
	}

	ptr = (struct ib_uverbs_create_flow *)tmpbuf;
	ptr->comp_mask = 0;	/* user input already validated above */
	ptr->qp_handle = ex_hdr.qp_handle;

	if ((count - 36) > 120)
		BUG();

	/* Copy 16 bytes worth "struct ibv_kern_flow_attr" */
	if (copy_from_user(&tmpbuf[1], buf + 36,
			   sizeof(struct ib_uverbs_flow_attr)))
		return -EFAULT;
	ptr->flow_attr.size -= 56;	/* Comes in as 96 = 56 + 40 */

	/* Copy "struct ibv_kern_spec"s */
	if (copy_from_user(&tmpbuf[3], buf + 56, count - 56))
		return -EFAULT;

	/*
	 * Copy : count-56 "struct ibv_kern_spec"s,
	 * 16 bytes "struct ibv_kern_flow_attr", 16 bytes comp_mask/qp_handle.
	 */
	copy_to_user((char __user *)buf, tmpbuf, count - 24);

	INIT_UDATA_BUF_OR_NULL(&ucore, buf,
			       (unsigned long) ex_hdr.response, count - 24,
			       hdr.out_words * 4);

	err = ib_uverbs_ex_create_flow(file, &ucore, &uhw);
next:
	if (err)
		return err;

	return count;
}

static ssize_t compat(struct ib_uverbs_file *file, size_t count,
		      const char __user *buf)
{
	unsigned long tmpbuf[17];
	struct ib_uverbs_cmd_hdr *p = (struct ib_uverbs_cmd_hdr *)tmpbuf;
	char __user *dst = (char __user *)buf;
	int insz, outsz;

	/*
	 * User "struct ibv_qp_dest" is 40 bytes, passes in 136 bytes.
	 * Kernel "struct ib_uverbs_qp_dest" is 32 bytes, expects 120.
	 * Last 8 bytes of user "struct ibv_qp_dest" not used by kernel.
	 * Kernel expects this layout:
	 *	struct ib_uverbs_cmd_hdr (8)
	 *	struct ib_uverbs_qp_dest (32 <- 40)
	 *	struct ib_uverbs_qp_dest (32 <- 40)
	 *	Rest of qp_mod inputs (48)
	 */
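
	/*
	 * In terms of the 8-byte words of tmpbuf: tmpbuf[0] is the command
	 * header, tmpbuf[1] (byte 8) is the first 40-byte user ibv_qp_dest,
	 * tmpbuf[6] (byte 48) is the second, and tmpbuf[11] (byte 88) is the
	 * remaining 48 bytes of modify_qp input.  The copy_to_user() calls
	 * below repack these back into the user buffer as two 32-byte
	 * ib_uverbs_qp_dest structs followed by the rest, which is the
	 * layout ib_uverbs_modify_qp() parses.
	 */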

	if (count > 136)
		BUG();

	if (copy_from_user(tmpbuf, buf, count))
		return -EFAULT;

	insz = p->in_words * 4;
	outsz = p->out_words * 4;

	copy_to_user(dst, &tmpbuf[1], sizeof(struct ib_uverbs_qp_dest));
	dst += sizeof(struct ib_uverbs_qp_dest);
	copy_to_user(dst, &tmpbuf[6], sizeof(struct ib_uverbs_qp_dest));
	dst += sizeof(struct ib_uverbs_qp_dest);
	copy_to_user(dst, &tmpbuf[11], 48);

	return ib_uverbs_modify_qp(file, buf, insz, outsz);
}

/*
 * Request structure is:
 *	ib_uverbs_cmd_hdr :: (almost) ib_uverbs_ex_cmd_hdr_compat.
 * Response structure is:
 *	8B comp_mask :: ib_uverbs_query_device_resp :: 8B timestamp_mask ::
 *	8B hca_core_clock
 */
static ssize_t compat_query(struct ib_uverbs_file *file, size_t count,
			    const char __user *buf)
{
	unsigned long tmpbuf[17], tval = 0;
	struct ib_uverbs_cmd_hdr *p = (struct ib_uverbs_cmd_hdr *)tmpbuf;
	char __user *dst = (char __user *)buf;
	int insz, outsz;

	/* Do not let an oversized request overflow the on-stack tmpbuf */
	if (count > sizeof(tmpbuf))
		return -EINVAL;

	if (copy_from_user(tmpbuf, buf, count))
		return -EFAULT;

	insz = p->in_words * 4;
	outsz = p->out_words * 4;
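
	/*
	 * Per the request layout described above, tmpbuf[3] (byte offset 24)
	 * should hold the user-supplied 'response' pointer from the compat
	 * extended header; the first 8 bytes it points at are the response
	 * comp_mask, which this kernel does not produce.
	 */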
	/* Zero out expected comp_mask field in response */
	copy_to_user((void *)tmpbuf[3], &tval, 8);

	/* Kernel writes out after expected comp_mask field */
	tmpbuf[3] += 8;

	/* Move "response" upwards to "buf" */
	copy_to_user(dst, &tmpbuf[3], sizeof(struct ib_uverbs_query_device));

	return ib_uverbs_query_device(file, buf, insz, outsz);
}

/*
 * Compat hack for applications/libraries we care about. Retrofit Linux 3.12
 * style APIs.
 */
ssize_t check_old_abi(struct file *filp, const char __user *buf, size_t count)
{
	struct ib_uverbs_cmd_hdr hdr;
	int tmp;
	struct ib_uverbs_file *file = filp->private_data;

	if (copy_from_user(&hdr, buf, sizeof hdr))
		return -EFAULT;

	tmp = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
	if ((tmp >= 52) && (tmp <= 53)) {
		/* Old-ABI create_flow/destroy_flow */
		return compat_ex(file, count, buf);
	} else if (tmp == IB_USER_VERBS_CMD_MODIFY_QP) {
		return compat(file, count, buf);
	} else if (tmp == 56) {
		/* Old-ABI extended query_device */
		return compat_query(file, count, buf);
	} else if (tmp == IB_USER_VERBS_CMD_QUERY_QP) {
		panic("query_qp API difference not handled\n");
	}

	/* Continue with processing this command */
	return 0;
}

/* END: Support older version of libibverbs */