/*
 * Copyright (c) 2016 Google Inc
 * Author: Kanoj Sarcar <kanoj@google.com>
 * See LICENSE for details.
 */

#include <err.h>
#include <kmalloc.h>
#include <kref.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <smp.h>
#include <devfs.h>
#include <linux/rdma/ib_user_verbs.h>
#include "uverbs.h"
#include <ros/procinfo.h>

/*
 * Translate mmap-style protection flags into PTE permission bits for a user
 * mapping. Every mapping gets present/user/accessed; PROT_WRITE additionally
 * turns on the writable and dirty bits.
 */
static unsigned long pgprot_val(int vmprot)
{
	const unsigned long	base = PTE_P | PTE_U | PTE_A;

	return (vmprot & PROT_WRITE) ? (base | PTE_W | PTE_D) : base;
}

/* PTE bits for 'vmprot' (see pgprot_val()) with PTE_NOCACHE added. */
unsigned long pgprot_noncached(int vmprot)
{
	unsigned long	bits = pgprot_val(vmprot);

	bits |= PTE_NOCACHE;
	return bits;
}

/* PTE bits for 'vmprot' (see pgprot_val()) with PTE_WRITECOMB added. */
unsigned long pgprot_writecombine(int vmprot)
{
	unsigned long	bits = pgprot_val(vmprot);

	bits |= PTE_WRITECOMB;
	return bits;
}

/*
 * Map the physical page at 'paddr' into process 'p' at user virtual address
 * 'addr' using PTE permission bits 'pteprot'.
 *
 * Our version knocked off from kern/src/mm.c version + uncaching logic from
 * vmap_pmem_nocache(). This routine is expected to be invoked as part of an
 * mmap() handler.
 *
 * The request is refused (returning -1, with a console message) when 'paddr'
 * is above max_paddr, when the page is currently in the free pool, or when the
 * page is flagged PG_PAGEMAP. Otherwise the return value of page_insert(),
 * called with p->pte_lock held, is passed back to the caller.
 *
 * 'dolock' is accepted for interface compatibility but is not consulted:
 * p->pte_lock is always taken here.
 */
int map_upage_at_addr(struct proc *p, physaddr_t paddr, uintptr_t addr, int pteprot, int dolock)
{
	int		rv;
	struct page	*pp;

	/* __vmr_free_pgs() assumes mapped pte is backed by "struct page" */
	if (paddr > max_paddr) {
		printk("[akaros]: map_upage_at_addr(): paddr=0x%llx "
		    "max_paddr=0x%llx\n", paddr, max_paddr);
		return -1;
	}

	/* Ensure kernel has not put such special pages into free pool */
	if (page_is_free(paddr >> PAGE_SHIFT)) {
		printk("[akaros]: map_upage_at_addr(): FreePA=0x%llx\n",
		    paddr);
		return -1;
	}

	pp = pa2page(paddr);

	/* __vmr_free_pgs() refcnt's pagemap pages differently */
	if (atomic_read(&pp->pg_flags) & PG_PAGEMAP) {
		printk("[akaros]: map_upage_at_addr(): mapPA=0x%llx\n",
		    paddr);
		return -1;
	}

	spin_lock(&p->pte_lock);

	/*
	 * Free any existing page backing uva, drop in this page, and
	 * acquire refcnt on page on behalf of user. Note though that we
	 * do not expect an existing page, since we are invoked in mmap
	 * path (page_insert() does not handle PG_PAGEMAP refcnt's).
	 */
	rv = page_insert(p->env_pgdir, pp, (void *)addr, pteprot);
	spin_unlock(&p->pte_lock);
	return rv;
}

/*
 * Mark 'pagep' dirty by atomically OR-ing PG_DIRTY into its pg_flags. No lock
 * is taken here despite the Linux-derived name.
 */
void set_page_dirty_lock(struct page *pagep)
{
	atomic_or(&pagep->pg_flags, PG_DIRTY);
}

/*
 * Drop one reference on 'pagep'. A PG_PAGEMAP page is not expected here; it
 * is reported on the console, but the reference is dropped regardless.
 */
void put_page(struct page *pagep)
{
	const int	from_pagemap =
	    (atomic_read(&pagep->pg_flags) & PG_PAGEMAP) != 0;

	if (from_pagemap)
		printk("[akaros]: put_page() on pagemap page!!!\n");

	page_decref(pagep);
}

/*
 * Find the page backing user virtual address 'uvastart' in process 'p',
 * allocating and mapping a fresh one if the PTE is not present, take an extra
 * reference on it, and store it in plist[0].
 *
 * Returns 1 on success (one page placed in 'plist') and -1 on any failure:
 * the page-table walk fails, page allocation fails, the existing page is a
 * PG_PAGEMAP page, or 'write' is set but the PTE does not grant user
 * read/write. The whole operation runs with p->pte_lock held.
 *
 * 'force' is accepted for Linux API compatibility but is not used.
 */
int get_user_page(struct proc *p, unsigned long uvastart, int write, int force,
    struct page **plist)
{
	pte_t		pte;
	int		ret = -1;
	struct page	*pp;

	spin_lock(&p->pte_lock);

	/* NOTE(review): TRUE presumably asks the walk to create missing
	 * intermediate tables -- confirm against pgdir_walk(). */
	pte = pgdir_walk(p->env_pgdir, (void*)uvastart, TRUE);

	if (!pte_walk_okay(pte))
		goto err1;

	if (!pte_is_present(pte)) {
		/* Nothing mapped yet: back the address with a new page that is
		 * always mapped user read/write (see TODO below). */
		unsigned long prot = PTE_P | PTE_U | PTE_A | PTE_W | PTE_D;
#if 0
		printk("[akaros]: get_user_page() uva=0x%llx pte absent\n",
		    uvastart);
#endif
		/*
		 * TODO: ok to allocate with pte_lock? "prot" needs to be
		 * based on VMR writability, refer to pgprot_noncached().
		 */
		if (upage_alloc(p, &pp, 0))
			goto err1;
		pte_write(pte, page2pa(pp), prot);
	} else {
		/* Already mapped: recover the struct page from the PTE */
		pp = pa2page(pte_get_paddr(pte));

		/* __vmr_free_pgs() refcnt's pagemap pages differently */
		if (atomic_read(&pp->pg_flags) & PG_PAGEMAP) {
			printk("[akaros]: get_user_page(): uva=0x%llx\n",
			    uvastart);
			goto err1;
		}
	}

	/* Caller wants to write but the mapping is not user read/write */
	if (write && (!pte_has_perm_urw(pte))) {
		/* TODO: How is Linux using the "force" parameter */
		printk("[akaros]: get_user_page() uva=0x%llx pte ro\n",
		    uvastart);
		goto err1;
	}

	/* Reference held on behalf of the caller; released via put_page() */
	page_incref(pp);
	plist[0] = pp;
	ret = 1;
err1:
	spin_unlock(&p->pte_lock);
	return ret;
}

/*
 * Allocate and initialise a scatterlist of 'npages' entries for 'ptr'.
 *
 * Returns 0 on success. If the allocation fails, 'ptr' is left empty
 * (sgl == NULL, nents == orig_nents == 0) and -ENOMEM is returned, so the
 * table is never handed to sg_init_table() with a NULL list and can still be
 * passed safely to sg_free_table().
 */
int sg_alloc_table(struct sg_table *ptr, unsigned int npages, gfp_t mask)
{
	struct scatterlist	*sgl;

	sgl = kmalloc((sizeof(struct scatterlist) * npages), mask);
	if (!sgl) {
		ptr->sgl = NULL;
		ptr->nents = ptr->orig_nents = 0;
		return -ENOMEM;
	}

	ptr->sgl = sgl;
	ptr->nents = ptr->orig_nents = npages;
	sg_init_table(ptr->sgl, npages);
	return 0;
}

void sg_free_table(struct sg_table *ptr)
{
	kfree(ptr->sgl);
}

/*
 * Release slot 'id' of 'idp' by resetting it to NULL, the free-slot marker.
 * An out-of-range 'id' is a fatal error.
 */
void idr_remove(struct idr *idp, int id)
{
	const int	in_range = (id >= 0) && (id < MAXITEMS);

	BUG_ON(!in_range);
	idp->values[id] = NULL;
}

/*
 * Return the pointer stored in slot 'id' of 'idp'. Both an out-of-range 'id'
 * and an empty (NULL) slot are treated as fatal errors, so this never returns
 * NULL.
 */
void *idr_find(struct idr *idp, int id)
{
	const int	in_range = (id >= 0) && (id < MAXITEMS);

	BUG_ON(!in_range);
	BUG_ON(idp->values[id] == NULL);

	return idp->values[id];
}

/*
 * Store 'ptr' in the lowest-numbered free slot of 'idp' and return that slot
 * index, or -1 when every slot is taken. The scan runs under idp->lock.
 *
 * 'ptr' must not be NULL, because NULL marks a free slot. 'start', 'end' and
 * 'gfp_mask' are accepted for Linux API compatibility but are not used.
 */
int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask)
{
	int	slot;
	int	found = -1;		/* error return */

	BUG_ON(ptr == NULL);

	/*
	 * NOTE(review): 'f' is not declared in this function; presumably the
	 * compat spin_lock_irqsave() macro discards its second argument --
	 * confirm.
	 */
	spin_lock_irqsave(&idp->lock, f);

	for (slot = 0; slot < MAXITEMS; slot++) {
		if (idp->values[slot] != NULL)
			continue;
		idp->values[slot] = ptr;
		found = slot;
		break;
	}

	spin_unlock_irqsave(&idp->lock);
	return found;
}

/* START: Linux /sys support for lib/apps */

/*
 * Common read helper for the emulated /sys attribute files.
 *
 * Copies the NUL-terminated string 'src' (terminator included) to the user
 * buffer 'buf', starting at file offset '*pos' and never writing more than
 * 'ucount' bytes. '*pos' is advanced by the number of bytes copied.
 *
 * Callers must pass in null terminated strings.
 *
 * Returns the number of bytes copied, 0 once '*pos' is at or beyond the end
 * of the string, or -EFAULT if the copy to user space fails.
 */
static ssize_t sysfs_read(char __user *buf, size_t ucount, loff_t *pos,
    char *src)
{
	unsigned long	slen = strlen(src) + 1;	/* + 1 for terminating null */
	unsigned long	off = *pos, nb;

	if (off >= slen)
		return 0;

	/* Never write beyond what the caller's buffer can hold */
	nb = slen - off;
	if (nb > ucount)
		nb = ucount;

	if (copy_to_user(buf, (src + off), nb))
		return -EFAULT;

	*pos += nb;
	return nb;
}

/*
 * read() handler reporting IB_USER_VERBS_ABI_VERSION as a single ASCII
 * character ('0' + version).
 */
static ssize_t ib_api_ver_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	char	version[4] = { '0' + IB_USER_VERBS_ABI_VERSION, 0, 0, 0 };

	return sysfs_read(buf, count, pos, version);
}

/* File operations for /sys/class/infiniband_verbs/abi_version. */
static const struct file_operations ib_api_ver = {
	.open		= kfs_open,
	.read		= ib_api_ver_read,
	.release	= kfs_release,
};

/*
 * read() handler for the mlx4_core log_num_mgm_entry_size parameter: reports
 * "-1" when CONFIG_MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE is -1, "10" otherwise.
 */
static ssize_t mlx4_mgm_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
#if CONFIG_MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE == -1
	char	value[4] = "-1";
#else
	char	value[4] = "10";
#endif

	return sysfs_read(buf, count, pos, value);
}

/*
 * File operations for
 * /sys/module/mlx4_core/parameters/log_num_mgm_entry_size.
 */
static const struct file_operations mlx4_mgm = {
	.open		= kfs_open,
	.read		= mlx4_mgm_read,
	.release	= kfs_release,
};

#if 0
/*
 * Write decimal digits of 'val' to 'dest', most significant first, emitting
 * at most 'num' characters. No NUL terminator is written. This code is
 * currently compiled out by the enclosing "#if 0".
 *
 * NOTE(review): the emit loop stops as soon as the remaining value reaches
 * zero, so trailing zero digits are dropped (2400 yields "24") and val == 0
 * writes nothing; fewer than 'num' characters may therefore be produced.
 * Revisit before re-enabling.
 */
static void stradd(char *dest, int val, int num)
{
	int	tval = val, i = 0, fac = 1;

	/* Count the digits; fac ends up as 10^(number of digits) */
	while (tval) {
		tval /= 10;
		fac *= 10;
		i++;
	}
	/* Scale back to the place value of the leading digit */
	fac /= 10;
	tval = val;
	while (tval && num) {
		int dig = tval / fac;
		*dest++ = dig + '0';
		tval -= (dig * fac);
		fac /= 10;
		num--;
	}
}

/*
 * read() handler intended to back a /proc/cpuinfo style "cpu MHz" line built
 * from __proc_global_info.tsc_freq. Currently compiled out by the enclosing
 * "#if 0".
 *
 * NOTE(review): idx is advanced by a fixed 4 and 3 characters regardless of
 * how many digits stradd() actually wrote, so the buffer may contain bytes
 * that were never filled in. Revisit before re-enabling.
 */
static ssize_t cpu_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	char cpu_info_str[128];
	long freq = __proc_global_info.tsc_freq, idx;

	strncpy(cpu_info_str, "cpu MHz\t\t: ", 16);
	idx = strlen(cpu_info_str);

	/* Whole MHz part, up to 4 digits */
	stradd(cpu_info_str + idx, freq / 1000000, 4);
	idx += 4;

	strncpy(cpu_info_str + idx, ".", 1);
	idx++;

	/* Up to 3 leading digits of the sub-MHz remainder */
	stradd(cpu_info_str + idx, freq % 1000000, 3);
	idx += 3;

	cpu_info_str[idx] = 0;

	return sysfs_read(buf, count, pos, cpu_info_str);
}

/* File operations for /proc/cpuinfo; compiled out together with cpu_read(). */
static const struct file_operations cpuinfo = {
	.read	= cpu_read,
	.open	= kfs_open,
	.release= kfs_release,
};
#endif

/* Directory nodes: read/write/execute for user, group and other */
#define IB_SYSFS_DIR_MODE	(S_IRWXU | S_IRWXG | S_IRWXO)
/* Attribute nodes: read/write for user, group and other */
#define IB_SYSFS_DEV_MODE						\
	(S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH)

/*
 * Create the device-independent part of the emulated Linux namespace used by
 * the InfiniBand user libraries: /dev/infiniband, the /sys/class/infiniband*
 * directories with the global abi_version attribute, and the mlx4_core
 * log_num_mgm_entry_size module parameter. Return values of the underlying
 * calls are not checked.
 */
void sysfs_init(void)
{
	do_mkdir("/dev/infiniband", IB_SYSFS_DIR_MODE);

	/* /sys/class hierarchy */
	do_mkdir("/sys", IB_SYSFS_DIR_MODE);
	do_mkdir("/sys/class", IB_SYSFS_DIR_MODE);
	do_mkdir("/sys/class/infiniband_verbs", IB_SYSFS_DIR_MODE);
	do_mkdir("/sys/class/infiniband", IB_SYSFS_DIR_MODE);

	make_device("/sys/class/infiniband_verbs/abi_version",
	    IB_SYSFS_DEV_MODE, __S_IFCHR,
	    (struct file_operations *)&ib_api_ver);

	/* /sys/module hierarchy */
	do_mkdir("/sys/module", IB_SYSFS_DIR_MODE);
	do_mkdir("/sys/module/mlx4_core", IB_SYSFS_DIR_MODE);
	do_mkdir("/sys/module/mlx4_core/parameters", IB_SYSFS_DIR_MODE);

	make_device("/sys/module/mlx4_core/parameters/log_num_mgm_entry_size",
	    IB_SYSFS_DEV_MODE, __S_IFCHR,
	    (struct file_operations *)&mlx4_mgm);

#if 0
	/* Do this thru init scripts */
	do_mkdir("/proc", IB_SYSFS_DIR_MODE);
	make_device("/proc/cpuinfo", IB_SYSFS_DEV_MODE, __S_IFCHR,
	    (struct file_operations *)&cpuinfo);
#endif
}

/*
 * read() handler reporting the uverbs ABI version of the ib_device attached
 * to this file as a single ASCII character ('0' + version).
 */
static ssize_t dver_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	struct ib_uverbs_device *dev =
	    (struct ib_uverbs_device *)get_fs_info(filp);
	char		version[4] = { 0 };

	version[0] = '0' + dev->ib_dev->uverbs_abi_ver;

	return sysfs_read(buf, count, pos, version);
}

/* read() handler reporting the name of the ib_device attached to this file. */
static ssize_t dname_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	struct ib_uverbs_device *dev =
	    (struct ib_uverbs_device *)get_fs_info(filp);

	return sysfs_read(buf, count, pos, dev->ib_dev->name);
}

/* read() handler for the node_type attribute; always reports "1". */
static ssize_t ntype_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	char	node_type[2] = { '1', 0 };

	return sysfs_read(buf, count, pos, node_type);
}

/* read() handler for the device/device attribute; always reports "0x1003". */
static ssize_t ddev_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	char	device_id[7] = "0x1003";

	return sysfs_read(buf, count, pos, device_id);
}

/* read() handler for the device/vendor attribute; always reports "0x15b3". */
static ssize_t dven_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	char	vendor_id[7] = "0x15b3";

	return sysfs_read(buf, count, pos, vendor_id);
}

/* read() handler for the vsd attribute; always reports a fixed string. */
static ssize_t vsd_read(struct file *filp, char __user *buf,
    size_t count, loff_t *pos)
{
	return sysfs_read(buf, count, pos, "puma20_A1-10.2.3.0");
}

/* File operations for the per-device uverbsN/abi_version attribute. */
static const struct file_operations dver_fops = {
	.open		= kfs_open,
	.read		= dver_read,
	.release	= kfs_release,
};

/* File operations for the per-device uverbsN/ibdev attribute. */
static const struct file_operations dname_fops = {
	.open		= kfs_open,
	.read		= dname_read,
	.release	= kfs_release,
};

/* File operations for the per-device uverbsN/device/device attribute. */
static const struct file_operations ddev_fops = {
	.open		= kfs_open,
	.read		= ddev_read,
	.release	= kfs_release,
};

/* File operations for the per-device uverbsN/device/vendor attribute. */
static const struct file_operations dven_fops = {
	.open		= kfs_open,
	.read		= dven_read,
	.release	= kfs_release,
};

/* File operations for the per-driver node_type attribute. */
static const struct file_operations ntype_fops = {
	.open		= kfs_open,
	.read		= ntype_read,
	.release	= kfs_release,
};

/* File operations for the per-driver vsd attribute. */
static const struct file_operations vsd_fops = {
	.open		= kfs_open,
	.read		= vsd_read,
	.release	= kfs_release,
};

/*
 * Create the per-device nodes for uverbs device number 'devnum':
 *   /dev/infiniband/uverbsN                     (uses caller's 'verb_fops')
 *   /sys/class/infiniband/<ib_dev name>/{node_type,vsd}
 *   /sys/class/infiniband_verbs/uverbsN/{device/device,device/vendor,
 *                                        ibdev,abi_version}
 *
 * 'ptr' is the struct ib_uverbs_device for this device; it is attached to
 * most of the created files via set_fs_info() so their read handlers can
 * find it. Only single-digit device numbers are supported; devnum > 9 panics.
 *
 * The path buffers are edited in place: each node name is written over the
 * tail of the previous one, so the order of the statements below matters.
 *
 * NOTE(review): a negative 'devnum' is not rejected, and the return values of
 * make_device()/do_mkdir() are not checked before use.
 */
void sysfs_create(int devnum, const struct file_operations *verb_fops,
    void *ptr)
{
	char		sysname[256] = "/sys/class/infiniband_verbs/uverbs0";
	char		devname[] = "/dev/infiniband/uverbs0";
	char		drvname[64] = "/sys/class/infiniband/";
	int		sysnameidx = strlen(sysname), drvidx;
	struct file	*fp;
	struct ib_uverbs_device *uvp = (struct ib_uverbs_device *)ptr;

	/* Create correct name: patch the trailing '0' with the device digit */
	if (devnum > 9)
		panic("Too many devs");
	devname[strlen(devname) - 1] = '0' + devnum;
	sysname[sysnameidx - 1] = '0' + devnum;

	/* The following fops need to come from the caller */
	fp = make_device(devname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)verb_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband/mlx4_0 */
	/* At most 12 bytes of the device name are appended */
	strncpy((drvname + strlen(drvname)), uvp->ib_dev->name, 12);
	do_mkdir(drvname, S_IRWXU | S_IRWXG | S_IRWXO);
	/* Remember where the per-attribute suffix starts */
	drvidx = strlen(drvname);

	/* /sys/class/infiniband/mlx4_0/node_type */
	/* No fs info is attached: ntype_read() does not consult it */
	strncpy(drvname + drvidx, "/node_type", 11);
	make_device(drvname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)&ntype_fops);

	/* /sys/class/infiniband/mlx4_0/vsd */
	/* The copied NUL terminator cuts off the rest of "/node_type" */
	strncpy(drvname + drvidx, "/vsd", 5);
	fp = make_device(drvname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)&vsd_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0 */
	do_mkdir(sysname, S_IRWXU | S_IRWXG | S_IRWXO);

	/*
	 * Each suffix below is copied with a length of 16, longer than any of
	 * the suffixes; assuming standard strncpy() zero padding, that wipes
	 * whatever the previous, possibly longer, suffix left behind.
	 */

	/* /sys/class/infiniband_verbs/uverbs0/device */
	strncpy(sysname + sysnameidx, "/device", 16);
	do_mkdir(sysname, S_IRWXU | S_IRWXG | S_IRWXO);

	/* /sys/class/infiniband_verbs/uverbs0/device/device */
	strncpy(sysname + sysnameidx, "/device/device", 16);
	fp = make_device(sysname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)&ddev_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0/device/vendor */
	strncpy(sysname + sysnameidx, "/device/vendor", 16);
	fp = make_device(sysname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)&dven_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0/ibdev */
	strncpy(sysname + sysnameidx, "/ibdev", 16);
	fp = make_device(sysname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)&dname_fops);
	set_fs_info(fp, ptr);

	/* /sys/class/infiniband_verbs/uverbs0/abi_version */
	strncpy(sysname + sysnameidx, "/abi_version", 16);
	fp = make_device(sysname,
	    S_IWUSR | S_IWGRP | S_IWOTH | S_IRUSR | S_IRGRP | S_IROTH,
	    __S_IFCHR, (struct file_operations *)&dver_fops);
	set_fs_info(fp, ptr);
}

/* END: Linux /sys support for lib/apps */

/* START: Support older version of libibverbs */

/*
 * Extended command header as sent by the older library; compat_ex() reads it
 * from user space immediately after struct ib_uverbs_cmd_hdr.
 *
 * in_words and provider_in_words are in terms of 4-byte words, not 8-byte.
 */
struct ib_uverbs_ex_cmd_hdr_compat {
	__u16 provider_in_words;	/* provider input size, 4-byte words */
	__u16 provider_out_words;	/* provider output size, 4-byte words */
	__u32 cmd_hdr_reserved;		/* compat_ex() requires this to be 0 */
	__u32 comp_mask;		/* compat_ex() requires this to be 0 */
	/* __u32 dummy; */
	__u64 response;			/* user address of the response buffer */
	__u32 qp_handle;		/* copied into the create_flow command */
};

/*
 * Handle the old-ABI extended flow commands (create flow / destroy flow).
 *
 * The old command number is two above the IB_USER_VERBS_EX_CMD_* value, so it
 * is rebased before dispatch. For destroy flow the payload at buf + 8 is
 * handed to ib_uverbs_ex_destroy_flow() unchanged. For create flow the user
 * buffer is re-packed in place into the struct ib_uverbs_create_flow layout
 * (comp_mask/qp_handle, flow attributes, flow specs) and then passed to
 * ib_uverbs_ex_create_flow().
 *
 * Returns 'count' on success, -EFAULT when user memory cannot be accessed,
 * -EINVAL for a malformed or oversized request, or the handler's error code.
 * Oversized requests are rejected rather than crashing the kernel, and the
 * staging buffer is zeroed so no stale stack bytes reach user space.
 */
static ssize_t compat_ex(struct ib_uverbs_file *file, size_t count,
    const char __user *buf)
{
	struct ib_uverbs_cmd_hdr hdr;
	struct ib_uverbs_ex_cmd_hdr_compat ex_hdr;
	struct ib_udata ucore;
	struct ib_udata uhw;
	__u32 command;
	int err;
	unsigned long	tmpbuf[16];
	struct ib_uverbs_create_flow *ptr;

	/*
	 * Neither path below fills in uhw, so give the handlers an empty one
	 * instead of stack garbage. tmpbuf is zeroed so that any byte not
	 * overwritten from user input is not leaked by the copy-out below.
	 */
	memset(&uhw, 0, sizeof(uhw));
	memset(tmpbuf, 0, sizeof(tmpbuf));

	if (copy_from_user(&hdr, buf, sizeof hdr))
		return -EFAULT;

	command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
	command -= 2;

	if (command == IB_USER_VERBS_EX_CMD_DESTROY_FLOW) {
		INIT_UDATA_BUF_OR_NULL(&ucore, buf + 8, 0, 8, 0);
		err = ib_uverbs_ex_destroy_flow(file, &ucore, &uhw);
		goto next;
	}

	/*
	 * "struct ibv_create_flow" is 56 bytes, "struct ibv_kern_spec" is
	 * 48 bytes, so at a minimum we expect 56 + (n x 48), n >= 1.
	 */
	if (count < 104)
		return -EINVAL;

	if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
		return -EFAULT;

	if ((hdr.in_words + ex_hdr.provider_in_words) * 4 != count)
		return -EINVAL;

	if (ex_hdr.cmd_hdr_reserved)
		return -EINVAL;

	if (ex_hdr.comp_mask)
		return -EINVAL;

	if (ex_hdr.response) {
		if (!hdr.out_words && !ex_hdr.provider_out_words)
			return -EINVAL;

		if (!access_ok(VERIFY_WRITE,
			       (void __user *) (unsigned long) ex_hdr.response,
			       (hdr.out_words + ex_hdr.provider_out_words) * 4))
			return -EFAULT;
	} else {
		if (hdr.out_words || ex_hdr.provider_out_words)
			return -EINVAL;
	}

	/*
	 * count - 24 bytes of tmpbuf are copied back out below, so the request
	 * must fit the staging buffer. count >= 104 was checked above, so the
	 * subtraction cannot wrap.
	 */
	if ((count - 24) > sizeof(tmpbuf))
		return -EINVAL;

	ptr = (struct ib_uverbs_create_flow *)tmpbuf;
	ptr->comp_mask = 0;	/* user input already validated above */
	ptr->qp_handle = ex_hdr.qp_handle;

	/* Copy 16 bytes worth "struct ibv_kern_flow_attr" */
	if (copy_from_user(&tmpbuf[1], buf+36,
	    sizeof(struct ib_uverbs_flow_attr)))
		return -EFAULT;

	ptr->flow_attr.size -= 56;		/* Comes in as 96 = 56 + 40 */

	/* Copy "struct ibv_kern_spec"s */
	if (copy_from_user(&tmpbuf[3], buf+56, count-56))
		return -EFAULT;

	/*
	 * Copy : count-56 "struct ibv_kern_spec"s,
	 * 16 bytes "struct ibv_kern_flow_attr", 16 bytes comp_mask/qp_handle.
	 */
	if (copy_to_user((char __user *)buf, tmpbuf, count-24))
		return -EFAULT;

	INIT_UDATA_BUF_OR_NULL(&ucore, buf,
	    (unsigned long) ex_hdr.response, count - 24,
	    hdr.out_words * 4);

	err = ib_uverbs_ex_create_flow(file, &ucore, &uhw);

next:
	if (err)
		return err;

	return count;
}

/*
 * Handle the old-ABI MODIFY_QP command by re-packing the user buffer in place
 * into the layout ib_uverbs_modify_qp() expects, then calling it.
 *
 * Returns whatever ib_uverbs_modify_qp() returns, -EINVAL if the request is
 * larger than the staging buffer, or -EFAULT if user memory cannot be
 * accessed.
 *
 * NOTE(review): the re-packed 112 bytes are written back to 'buf' even when
 * 'count' is smaller than that; a request shorter than the documented 136
 * bytes is padded with zeros rather than rejected.
 */
static ssize_t compat(struct ib_uverbs_file *file, size_t count,
    const char __user *buf)
{
	unsigned long			tmpbuf[17];
	struct ib_uverbs_cmd_hdr	*p = (struct ib_uverbs_cmd_hdr *)tmpbuf;
	char __user			*dst = (char __user *)buf;
	int				insz, outsz;

	/*
	 * User "struct ibv_qp_dest" is 40 bytes, passes in 136 bytes.
	 * Kernel "struct ib_uverbs_qp_dest" is 32 bytes, expects 120.
	 * Last 8 bytes of user "struct ibv_qp_dest" not used by kernel.
	 * Kernel expects this layout:
	 * 	struct ib_uverbs_cmd_hdr (8)
	 *	struct ib_uverbs_qp_dest (32 <- 40)
	 *	struct ib_uverbs_qp_dest (32 <- 40)
	 *	Rest of qp_mod inputs	 (48)
	 */

	/* Reject, rather than crash on, a request that cannot be staged */
	if (count > sizeof(tmpbuf))
		return -EINVAL;

	/* A short request must not leak stale stack bytes back to the user */
	memset(tmpbuf, 0, sizeof(tmpbuf));

	if (copy_from_user(tmpbuf, buf, count))
		return -EFAULT;
	insz = p->in_words * 4;
	outsz = p->out_words * 4;

	/* Drop the unused trailing 8 bytes of each user qp_dest while packing */
	if (copy_to_user(dst, &tmpbuf[1], sizeof(struct ib_uverbs_qp_dest)))
		return -EFAULT;
	dst += sizeof(struct ib_uverbs_qp_dest);
	if (copy_to_user(dst, &tmpbuf[6], sizeof(struct ib_uverbs_qp_dest)))
		return -EFAULT;
	dst += sizeof(struct ib_uverbs_qp_dest);
	if (copy_to_user(dst, &tmpbuf[11], 48))
		return -EFAULT;

	return ib_uverbs_modify_qp(file, buf, insz, outsz);
}

/*
 * Handle the old-ABI extended query-device command (command number 56).
 *
 * Request structure is:
 * ib_uverbs_cmd_hdr :: (almost) ib_uverbs_ex_cmd_hdr_compat.
 * Response structure is:
 * 8B comp_mask :: ib_uverbs_query_device_resp :: 8B timestamp_mask ::
 * 8B hca_core_clock
 *
 * The response pointer is taken from bytes 24..31 of the request. The first
 * 8 bytes of the response (comp_mask) are zeroed, the pointer is advanced past
 * them, and the adjusted pointer is written to the start of 'buf' as the
 * struct ib_uverbs_query_device that ib_uverbs_query_device() expects.
 *
 * Returns whatever ib_uverbs_query_device() returns, -EINVAL if 'count' is
 * too small to contain the response pointer or too large for the staging
 * buffer, or -EFAULT if user memory cannot be accessed.
 */
static ssize_t compat_query(struct ib_uverbs_file *file, size_t count,
    const char __user *buf)
{
	unsigned long			tmpbuf[17], tval = 0;
	struct ib_uverbs_cmd_hdr	*p = (struct ib_uverbs_cmd_hdr *)tmpbuf;
	char __user			*dst = (char __user *)buf;
	int				insz, outsz;

	/*
	 * The request must fit the staging buffer and must be long enough to
	 * contain tmpbuf[3], which is used as a user pointer below.
	 */
	if (count > sizeof(tmpbuf) || count < 4 * sizeof(tmpbuf[0]))
		return -EINVAL;

	memset(tmpbuf, 0, sizeof(tmpbuf));

	if (copy_from_user(tmpbuf, buf, count))
		return -EFAULT;
	insz = p->in_words * 4;
	outsz = p->out_words * 4;

	/* Zero out expected comp_mask field in response */
	if (copy_to_user((void *)tmpbuf[3], &tval, 8))
		return -EFAULT;
	/* Kernel writes out after expected comp_mask field */
	tmpbuf[3] += 8;
	/* Move "response" upwards to "buf" */
	if (copy_to_user(dst, &tmpbuf[3], sizeof(struct ib_uverbs_query_device)))
		return -EFAULT;

	return ib_uverbs_query_device(file, buf, insz, outsz);
}

/*
 * Compat hack for the applications/libraries we care about: retrofit Linux
 * 3.12 style APIs.
 *
 * Reads the command header from 'buf' and, when the command is one whose
 * layout differs in the older library, hands it to the matching translation
 * routine and returns that routine's result. Returns 0 when the command needs
 * no translation and normal processing should continue, or -EFAULT when the
 * header cannot be read. An old-style QUERY_QP is not supported and panics.
 */
ssize_t check_old_abi(struct file *filp, const char __user *buf, size_t count)
{
	struct ib_uverbs_file		*uvfile = filp->private_data;
	struct ib_uverbs_cmd_hdr	cmd_hdr;
	int				cmd;

	if (copy_from_user(&cmd_hdr, buf, sizeof cmd_hdr))
		return -EFAULT;

	cmd = cmd_hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;

	switch (cmd) {
	case 52:
	case 53:
		/* Old numbering of the extended flow commands */
		return compat_ex(uvfile, count, buf);
	case IB_USER_VERBS_CMD_MODIFY_QP:
		return compat(uvfile, count, buf);
	case 56:
		return compat_query(uvfile, count, buf);
	case IB_USER_VERBS_CMD_QUERY_QP:
		panic("query_qp API difference not handled\n");
		break;
	default:
		break;
	}

	/* Continue with processing this command */
	return 0;
}

/* END: Support older version of libibverbs */
