/*
 * Copyright 2011 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/version.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#if defined(TILEPCI_HOST)
#include "tilepci.h"
#include "drv_pcie_channel_intf.h"
#elif defined(TILEPCI_ENDP)
#include <asm/tilepci.h>
#include <hv/drv_pcie_channel_intf.h>
#else
#error You must define either TILEPCI_HOST or TILEPCI_ENDP
#endif
#include "tilepci_shared_code.h"


/****************************************************************/
/*		      Zero-Copy Helper Functions                */
/****************************************************************/

/*
 * Allocate a page-backed kernel buffer able to hold 'size' bytes.
 *
 * The request is rounded up by get_order(); anything needing more than
 * an order-9 allocation is refused.  The first 'size' bytes of a
 * successful allocation are filled with the byte 'T'.
 *
 * Returns the kernel virtual address of the buffer, or NULL on failure.
 */
static void *tlr_alloc_mem(size_t size)
{
	unsigned long	 addr;
	int		 page_order = get_order(size);

	/* Refuse requests above the order-9 cap. */
	if (page_order > 9)
		return NULL;

	addr = __get_free_pages(GFP_KERNEL, page_order);
	if ((void *)addr == NULL)
		return NULL;

	memset((void *)addr, 'T', size);

	return (void *)addr;
}

/*
 * Release a buffer obtained from tlr_alloc_mem().
 *
 * The page order is recomputed from 'size', so 'size' has to map to the
 * same order that was used for the allocation.  Sizes that would need
 * more than an order-9 allocation are ignored, mirroring the cap applied
 * when allocating.
 *
 * Always returns 0.
 */
static int tlr_free_mem(void *kern_addr, size_t size)
{
	int page_order = get_order(size);

	if (page_order <= 9)
		free_pages((unsigned long)kern_addr, page_order);

	return 0;
}


/*
 * Check that the user buffer described by a transfer request lies in
 * address space the caller is allowed to write.
 *
 * 'q' is not consulted.  Returns non-zero when access_ok() accepts the
 * range [addr, addr + len), zero otherwise.
 */
static int tlr_xfer_validate(struct tlr_zc_cmd_q *q,
			     tilepci_xfer_req_t	*from_xfer)
{
	void __user *usr_buf = (void __force __user *)(from_xfer->addr);

	return access_ok(VERIFY_WRITE, usr_buf, from_xfer->len);
}

/*
 * Convert a user transfer request into a hardware command.
 *
 * Derives the EOP bits and the "writable" mapping mode from the
 * command's DMA direction, maps the user buffer with tlr_map_cmd(), and
 * then fills in both the driver bookkeeping fields of 'cmd' and the
 * embedded hardware descriptor 'cmd->cmd'.
 *
 * 'q' is not referenced directly; the queue is reached via cmd->cmd_q.
 *
 * Returns 0 on success, or the non-zero value returned by tlr_map_cmd()
 * (in which case only the EOP bits of 'cmd' have been touched here).
 */
static int tlr_xfer_cvt(struct tlr_zc_cmd_q *q,
			tilepci_xfer_req_t *xfer,
			struct tlr_zc_cmd	*cmd)
{
	int		 ret;
	int		 writable;

	ret = 0;

	/* Always clear the eop bits for a new command. */
	cmd->cmd.must_eop = 0;
	cmd->cmd.may_eop = 0;

	/*
	 * The DMA direction decides which user flags are honoured and
	 * whether the user buffer has to be mapped writable.
	 */
	switch (cmd->dma_dir) {
	case DMA_TO_DEVICE:
		/* Buffer is only read from; map it read-only. */
		writable = FALSE;
		if (xfer->flags & TILEPCI_SEND_EOP)
			cmd->cmd.must_eop = 1;
		break;
	case DMA_FROM_DEVICE:
		/* Buffer is filled in; it must be mapped writable. */
		writable = TRUE;
		if (xfer->flags & TILEPCI_RCV_MUST_EOP)
			cmd->cmd.must_eop = 1;
		if (xfer->flags & TILEPCI_RCV_MAY_EOP)
			cmd->cmd.may_eop = 1;
		break;
	default:
		/* Any other direction: map writable, leave EOP bits clear. */
		writable = TRUE;
		break;
	}

	ret = tlr_map_cmd(cmd, (unsigned long)xfer->addr, xfer->len, writable);
	if (ret != 0)
		goto exit;

	/* dma_addr and nmapped_pages are set, set the other fields. */
	cmd->post_len = xfer->len;
	cmd->flags = xfer->flags;
	cmd->usr_addr = xfer->addr;
	cmd->cookie = xfer->cookie;

	/* tlr_flush_buffer(cmd->tlr, cmd->dma_addr, cmd->post_len); */

	HID_CMD_TRACE("cmd %p "
		      "chan %d "
		      "size %d "
		      "dma_addr %llx "
		      "usr_addr %p "
		      "\n",
		      cmd,
		      cmd->cmd_q->chan,
		      cmd->post_len,
		      cmd->dma_addr,
		      cmd->usr_addr
		);

	/*
	 * Split the DMA address across the descriptor's low/high fields.
	 * The endpoint descriptor names its high field 'hi16' and also
	 * carries the completion queue id.
	 */
#if defined(TILEPCI_HOST)
	cmd->cmd.buffer_addr_lo32 = DMA_ADDR_LO32(cmd->dma_addr);
	cmd->cmd.buffer_addr_hi32 = DMA_ADDR_HI32(cmd->dma_addr);
#else
	cmd->cmd.buffer_addr_lo32 = DMA_ADDR_LO32(cmd->dma_addr);
	cmd->cmd.buffer_addr_hi16 = DMA_ADDR_HI32(cmd->dma_addr);
	cmd->cmd.completion_queue_id = cmd->cmd_q->cpl_queue_id;
#endif
	cmd->cmd.size = cmd->post_len;
	cmd->cmd.channel = cmd->cmd_q->chan;
	/* The user's cookie travels in the descriptor tag. */
	cmd->cmd.tag = cmd->cookie;

	/* Always clear the soc bit. Will be set in post_cmds() if needed. */
	cmd->cmd.soc = 0;

exit:
	return ret;
}


/*
 * Resize a command queue so that it owns 'ncmd_wanted' commands.
 *
 * Commands are moved between the queue's own free list and the
 * device-wide free list (tlr->zc_state.cmd_q_free_list):
 *  - growing takes commands from the device-wide list and fails with
 *    -ENOBUFS if that list holds fewer than are needed;
 *  - shrinking returns commands from the queue's free list; if that list
 *    runs dry the loop stops early and q->ncmd_have stays above the
 *    requested value.
 *
 * On success __tlr_zc_cmd_q_alloc_xfers() is called afterwards.
 *
 * Callers in this file hold both q->rd_xfer_mutex and q->wr_xfer_mutex.
 *
 * Returns 0 on success, -ENXIO if the queue is poisoned by a chip reset,
 * -EINVAL if a c2c channel asks for more than TILEPCI_MAX_C2C_NCMD,
 * -ENOBUFS if the queue cannot grow, or the result of
 * __tlr_zc_cmd_q_alloc_xfers().
 */
static inline int __tlr_zc_cmd_q_ncmd_set(struct tlr_zc_cmd_q *q,
					      u32 ncmd_wanted)
{
	int			 ret;
	unsigned long		 tlr_lock_flags;
	unsigned long		 q_lock_flags;
	struct tlr_pcie_dev		*tlr;
	int			 ncmd_diff;
	u32		 i;
	struct tlr_zc_cmd	*cmd;
	struct tlr_list		*from_list;
	struct tlr_list		*to_list;
	u32		 tlr_free_list_len;
	u32		 ncmd_wanted_prev;
	int			 ncmd_have_inc;

	tlr = q->tlr;
	tlr_free_list_len = 0;

	/* Abandon this operation if the queue is poisoned due to reset. */
	if (q->chip_reset_poison) {
		HID_ERR_TRACE("Exit  ENXIO\n");
		return -ENXIO;
	}

	/* Limit the number of commands that can be posted to c2c channels. */
	if (q->chan >= PCIE_FIRST_C2C_SEND_CHANNEL &&
	    ncmd_wanted > TILEPCI_MAX_C2C_NCMD) {
		HID_ERR_TRACE("Exit EINVAL\n");
		return -EINVAL;
	}

	/*
	 * Remember the previous target so it can be restored if the
	 * request turns out to be unsatisfiable.
	 */
	ncmd_wanted_prev = q->ncmd_wanted;
	q->ncmd_wanted = ncmd_wanted;
	ncmd_diff = ncmd_wanted - q->ncmd_have;
	if (ncmd_diff == 0) {
		HID_EX_TRACE("Exit OK\n");
		return 0;
	}

	/* Lock order: queue lock first, then the device-wide list lock. */
	spin_lock_irqsave(&q->lock, q_lock_flags);
	spin_lock_irqsave(&tlr->zc_state.cmd_q_lock, tlr_lock_flags);

	ret = 0;
	if (ncmd_diff < 0) {
		/* Shrinking: hand commands back to the device-wide list. */
		from_list = &q->free_q;
		to_list = &tlr->zc_state.cmd_q_free_list;
		ncmd_diff = -ncmd_diff;
		ncmd_have_inc = -1;
	} else {
		/* Growing: take commands from the device-wide list. */
		from_list = &tlr->zc_state.cmd_q_free_list;
		to_list = &q->free_q;
		tlr_free_list_len =
			__tlr_list_len(&tlr->zc_state.cmd_q_free_list);
		if (tlr_free_list_len < ncmd_diff) {
			/*
			 * The request is rejected outright, so do not
			 * leave a target that was never granted in
			 * q->ncmd_wanted: it is reported by the GET_NCMD
			 * ioctl and used as the per-write() request bound.
			 */
			q->ncmd_wanted = ncmd_wanted_prev;
			ret = -ENOBUFS;
			HID_ERR_TRACE("Exit ENOBUFS\n");
			goto exit;
		}
		ncmd_have_inc = 1;
	}

	for (i = 0; i < ncmd_diff; i++) {
		cmd = __tlr_list_rem_zc_cmd(from_list);
		if (cmd == NULL)
			break;
		tlr_zc_cmd_init(cmd, q, tlr, q->type, 0,
				    to_list, NULL);
		q->ncmd_have += ncmd_have_inc;
	}

 exit:
	spin_unlock_irqrestore(&tlr->zc_state.cmd_q_lock,
			       tlr_lock_flags);
	spin_unlock_irqrestore(&q->lock, q_lock_flags);

	if (ret == 0)
		ret = __tlr_zc_cmd_q_alloc_xfers(q);
	return ret;
}

/*
 * Locked wrapper around __tlr_zc_cmd_q_ncmd_set().
 *
 * Takes the read semaphore and then the write semaphore so the resize is
 * ordered against concurrent readers and writers, performs the resize,
 * and drops the semaphores in reverse order.
 *
 * Returns -ERESTARTSYS if either wait is interrupted, otherwise the
 * result of __tlr_zc_cmd_q_ncmd_set().
 */
static int tlr_zc_cmd_q_ncmd_set(struct tlr_zc_cmd_q *q,
					    u32 ncmd)
{
	int status;

	if (down_interruptible(&q->rd_xfer_mutex) != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		return -ERESTARTSYS;
	}

	if (down_interruptible(&q->wr_xfer_mutex) != 0) {
		status = -ERESTARTSYS;
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		goto drop_rd;
	}

	status = __tlr_zc_cmd_q_ncmd_set(q, ncmd);

	up(&q->wr_xfer_mutex);
 drop_rd:
	up(&q->rd_xfer_mutex);

	return status;
}

/*
 * Put a particular command queue into reset.  If the command queue is
 * already in reset, this function has no effect.
 */
static int tlr_zc_cmd_q_start_reset(struct tlr_zc_cmd_q *q)
{
	unsigned long irq_flags;
	struct tlr_pcie_dev *tlr = q->tlr;
	int res = 0;

	spin_lock_irqsave(&q->lock, irq_flags);

	/* We can only start reset if we're currently connected. */
	if (q->state != CMD_Q_ST_CONNECTED) {
		res = -EINVAL;
		goto exit;
	}

	/* Issue the channel reset command. */
#if defined(TILEPCI_HOST)
	writel(q->chan, &tlr->regs->channel_reset);
#else
	{
		res = hv_dev_pwrite(tlr->hv_channel_ctl_fd, 0,
				    (HV_VirtAddr)&q->chan,
				    sizeof(q->chan),
				    PCIE_CHANNEL_CTL_CHANNEL_RESET_OFF);
		if (res != sizeof(q->chan))
			ERR("Tile channel reset failed, %d\n", q->chan);
	}
#endif

	q->state = CMD_Q_ST_RESET_STARTED;

 exit:
	spin_unlock_irqrestore(&q->lock, irq_flags);
	return res;
}


/*
 * Switch a queue whose reset has been started into "discard" mode, in
 * which the completion handler is expected to drop incoming
 * completions.
 *
 * Returns 0 if the queue is (now or already) in the discard state, and
 * -EINVAL if no reset has been started on it.
 */
static int tlr_zc_cmd_q_enable_reset_discard(struct tlr_zc_cmd_q *q)
{
	unsigned long flags;
	int status;

	spin_lock_irqsave(&q->lock, flags);

	if (q->state == CMD_Q_ST_RESET_DISCARD) {
		/*
		 * Another caller already made the switch; latecomers are
		 * not treated as errors.
		 */
		status = 0;
	} else if (q->state != CMD_Q_ST_RESET_STARTED) {
		/* Discarding only makes sense during an unfinished reset. */
		status = -EINVAL;
	} else {
		q->state = CMD_Q_ST_RESET_DISCARD;
		status = 0;

		/*
		 * Release the command lists that are not in flight.  The
		 * posted list is deliberately left alone: those commands
		 * have not come back from the HV yet (some may still be
		 * doing DMA, some will return complete-with-reset).
		 */
		__tlr_zc_cmd_q_free_list(q, &q->pend_q);
		__tlr_zc_cmd_q_free_list(q, &q->comp_q);
	}

	spin_unlock_irqrestore(&q->lock, flags);
	return status;
}


/*
 * Wait until every posted request on the queue has completed, then move
 * the queue to the "need start of connection" state.
 *
 * Returns 0 once the queue is in CMD_Q_ST_NEED_SOC (immediately if it
 * already is), -EIO if the queue is still connected or has only just
 * started reset, and -ERESTARTSYS if the wait is interrupted.
 */
static int tlr_zc_cmd_q_wait_for_need_soc(struct tlr_zc_cmd_q *q)
{
	unsigned long flags;
	int status = 0;

	spin_lock_irqsave(&q->lock, flags);

	if (q->state == CMD_Q_ST_CONNECTED ||
	    q->state == CMD_Q_ST_RESET_STARTED) {
		/* The queue never reached discard mode; something is off. */
		status = -EIO;
	} else if (q->state != CMD_Q_ST_NEED_SOC) {
		/*
		 * Drain: sleep (with the lock dropped) until the
		 * completion count catches up with the request count,
		 * re-checking under the lock after every wakeup.
		 */
		for (;;) {
			int interrupted;

			if (q->ncomp_tot == q->nreq_tot) {
				q->state = CMD_Q_ST_NEED_SOC;
				break;
			}

			spin_unlock_irqrestore(&q->lock, flags);
			interrupted = wait_event_interruptible(
					q->reset_drain_queue,
					(q->ncomp_tot == q->nreq_tot));
			spin_lock_irqsave(&q->lock, flags);

			if (interrupted != 0) {
				status = -ERESTARTSYS;
				break;
			}
		}
	}

	spin_unlock_irqrestore(&q->lock, flags);
	return status;
}


/****************************************************************/
/*		       Zero-Copy mmap() Region                  */
/****************************************************************/

static int tlr_zc_mmap_check_num_frags(struct tlr_mmap_state *map, int frag_idx)
{
	int min_frags = frag_idx + 1;
	int new_num_frags;
	struct tlr_buf_fragment *frags;

	if (min_frags <= map->num_frags)
		return 0;

	/* Always grow to some power of two. */
	new_num_frags = 1;
	while (new_num_frags < min_frags)
		new_num_frags *= 2;

	frags = kmalloc(sizeof(*frags) * new_num_frags, GFP_KERNEL);
	if (frags == NULL)
		return -ENOMEM;

	memset(frags, 0, sizeof(*frags) * new_num_frags);
	if (map->num_frags > 0) {
		memcpy(frags, map->frags, sizeof(*frags) * map->num_frags);
		kfree(map->frags);
	}

	map->frags = frags;
	map->num_frags = new_num_frags;

	return 0;
}

/*
 * Allocate and zero the pages backing one mmap fragment.
 *
 * A fragment is a single allocation of get_order(TILEPCI_MMAP_GRANULARITY)
 * pages; the first page is stored in frag->page.
 *
 * Returns 0 on success or -ENOMEM if the pages cannot be allocated.
 */
static int tlr_zc_mmap_alloc_frag(struct tlr_buf_fragment *frag)
{
	const int order = get_order(TILEPCI_MMAP_GRANULARITY);
	int npages;
	int idx;

	/*
	 * __GFP_ZERO is not available on older kernels, so the pages are
	 * cleared by hand below.
	 */
	frag->page = alloc_pages(GFP_HIGHUSER, order);
	HID_INT_TRACE("alloced page = %p\n", frag->page);
	if (frag->page == NULL)
		return -ENOMEM;

	BUG_ON(in_interrupt());

	/* The head page already carries a reference from alloc_pages(). */
	clear_highpage(frag->page);

	/* Every following page gets its own reference and is cleared. */
	npages = 1 << order;
	for (idx = 1; idx < npages; idx++) {
		struct page *tail = frag->page + idx;

		get_page(tail);
		clear_highpage(tail);
	}

	return 0;
}

/*
 * Drop the references on every page of every allocated fragment and
 * free the fragment array, leaving the map empty.
 *
 * Does nothing (beyond tracing) when the map has no fragment array.
 */
static void tlr_zc_mmap_release(struct tlr_mmap_state *map)
{
	int pages_per_frag;
	int f;

	HID_INT_TRACE("release num_frags = %d\n", map->num_frags);

	if (!map->num_frags)
		return;

	pages_per_frag = 1 << get_order(TILEPCI_MMAP_GRANULARITY);

	for (f = 0; f < map->num_frags; f++) {
		struct page *first = map->frags[f].page;
		int p;

		/* Slots that were never faulted in hold no pages. */
		if (first == NULL)
			continue;

		for (p = 0; p < pages_per_frag; p++)
			put_page(first + p);
		map->frags[f].page = NULL;
	}

	kfree(map->frags);
	map->frags = NULL;
	map->num_frags = 0;
}

/*
 * VMA open callback: count one more mapping of the queue's mmap state.
 * The counter is protected by the map's semaphore.
 */
static void tlr_zc_vma_open(struct vm_area_struct *vma)
{
	struct tlr_mmap_state *state = vma->vm_private_data;

	down(&state->mutex);
	state->ref_cnt += 1;
	up(&state->mutex);
}

/*
 * VMA close callback: drop one mapping reference and, when the last one
 * goes away, release every fragment held by the mmap state.
 */
static void tlr_zc_vma_close(struct vm_area_struct *vma)
{
	struct tlr_mmap_state *state = vma->vm_private_data;

	down(&state->mutex);
	if (--state->ref_cnt == 0)
		tlr_zc_mmap_release(state);
	up(&state->mutex);
}


#ifdef USE_VM_FAULT
static int tlr_zc_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#else
static struct page *tlr_zc_vma_nopage(struct vm_area_struct *vma,
				      unsigned long address, int *type)
#endif
{
#ifdef USE_VM_FAULT
	unsigned long address = (unsigned long)vmf->virtual_address;
	int ret;
#else
	struct page *ret;
#endif
	unsigned long addr_off;
	unsigned long mmap_off;
	unsigned long offset;
	unsigned long frag_idx;
	unsigned long frag_off;
	struct page *page;
	int err;
	int have_mutex;
	struct tlr_mmap_state *map;

	ret = RETURN_SIGBUS;

	have_mutex = FALSE;
	map = vma->vm_private_data;

	/* Figure out the fragment index and offset within that fragment. */
	mmap_off = vma->vm_pgoff << PAGE_SHIFT;
	addr_off = (address - vma->vm_start);
	offset = mmap_off + addr_off;
	frag_idx = offset / TILEPCI_MMAP_GRANULARITY;
	frag_off = offset % TILEPCI_MMAP_GRANULARITY;

	HID_INT_TRACE("map = %p, frag_idx = %lx, frag_off = %lx\n",
		      map, frag_idx, frag_off);


	err = down_interruptible(&map->mutex);
	if (err != 0) {
		HID_ERR_TRACE("Interrupted\n");
		goto exit;
	}
	have_mutex = TRUE;

	/* Lookup fragment info or allocate a new fragment. */
	if (tlr_zc_mmap_check_num_frags(map, frag_idx) == -ENOMEM) {
		HID_ERR_TRACE("Couldn't resize frag array.\n");
		ret = RETURN_OOM;
		goto exit;
	}
	page = map->frags[frag_idx].page;
	if (page == NULL) {
		if (tlr_zc_mmap_alloc_frag(&map->frags[frag_idx]) == -ENOMEM) {
			HID_ERR_TRACE("Couldn't alloc pages.\n");
			ret = RETURN_OOM;
			goto exit;
		}
		page = map->frags[frag_idx].page;
	}

	/* Return the appropriate page within that fragment. */
	page += (frag_off >> PAGE_SHIFT);
	HID_INT_TRACE("returning PA = %llx\n",
		      (unsigned long long) page_to_phys(page));
	get_page(page);

#ifdef USE_VM_FAULT
	ret = 0;
	vmf->page = page;
#else
	ret = page;
	if (type != NULL)
		*type = VM_FAULT_MINOR;
#endif

 exit:
	if (have_mutex) {
		up(&map->mutex);
		have_mutex = FALSE;
	}

	return ret;
}

/*
 * VMA callbacks for the zero-copy mmap() region.  open/close maintain
 * the mapping reference count; the fault (or, when USE_VM_FAULT is not
 * defined, nopage) handler supplies pages on demand.
 */
static struct vm_operations_struct tlr_zc_vm_ops = {
	.open	= tlr_zc_vma_open,
	.close	= tlr_zc_vma_close,
#ifdef USE_VM_FAULT
	.fault  = tlr_zc_vma_fault,
#else
	.nopage	= tlr_zc_vma_nopage,
#endif
};

/*
 * mmap() file operation for a zero-copy channel.
 *
 * Only shared mappings are accepted.  The VMA is attached to the queue's
 * mmap state, flagged VM_LOCKED | VM_RESERVED, and counted as an open
 * mapping; pages are supplied later by the fault handler.
 *
 * Returns 0 on success or -EINVAL for a non-shared mapping.
 */
static int tlr_zc_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct tlr_zc_cmd_q *q = filp->private_data;

	/* All mappings must be performed with MAP_SHARED. */
	if ((vma->vm_flags & VM_SHARED) == 0) {
		HID_ERR_TRACE("mmap flags must include VM_SHARED\n");
		return -EINVAL;
	}

	vma->vm_private_data = &q->mmap_state;
	vma->vm_flags |= VM_LOCKED | VM_RESERVED;
	vma->vm_ops = &tlr_zc_vm_ops;

	/* Account for this first mapping just as the VMA callback would. */
	tlr_zc_vma_open(vma);

	return 0;
}



/****************************************************************/
/*		      Zero-Copy Syscall Support                 */
/****************************************************************/


#ifdef HAVE_UNLOCKED_IOCTL
static long tlr_zc_ioctl(struct file *filp,
			 unsigned int cmd, unsigned long arg)
#else
static int tlr_zc_ioctl(struct inode *inode, struct file *filp,
			unsigned int cmd, unsigned long arg)
#endif
{
#ifdef HAVE_UNLOCKED_IOCTL
	long			 ret;
#else
	int			 ret;
#endif
	struct tlr_zc_cmd_q	*q;
	u32		 ncmd;
	u32		 ncmd_min_read;

	ret = 0;
	q = filp->private_data;

	switch (cmd) {
	case TILEPCI_IOC_GET_NCMD:
		ret = put_user(q->ncmd_wanted, (u32 __user *)arg);
		break;
	case TILEPCI_IOC_SET_NCMD:
		ncmd = (u32)arg;
		HID_WRK_TRACE("Setting ZC channel %d ncmds to %d\n",
			      q->chan, ncmd);
		ret = tlr_zc_cmd_q_ncmd_set(q, ncmd);
		break;
	case TILEPCI_IOC_GET_MIN_READ_COMPS:
		HID_WRK_TRACE("Getting ZC channel %d ncmd_min_read %d\n",
			      q->chan, q->ncmd_min_read);
		ret = put_user(q->ncmd_min_read, (u32 __user *)arg);
		break;
	case TILEPCI_IOC_SET_MIN_READ_COMPS:
		ncmd_min_read = (u32)arg;
		HID_WRK_TRACE("Setting ZC channel %d ncmds to %d\n",
			      q->chan, ncmd_min_read);
		ret = 0;
		q->ncmd_min_read = ncmd_min_read;
		if (tlr_list_len(&q->comp_q, &q->lock) >= q->ncmd_min_read) {
			wmb();  /* ensure visibility before waking */
			wake_up_interruptible(&q->comp_queue);
		}
		break;
	case TILEPCI_IOC_START_RESET:
		ret = tlr_zc_cmd_q_start_reset(q);
		break;
	case TILEPCI_IOC_FINISH_RESET:
		ret = tlr_zc_cmd_q_enable_reset_discard(q);
		if (ret != 0)
			break;
		ret = tlr_zc_cmd_q_wait_for_need_soc(q);
		break;
	default:
		ret = -ENOTTY;
		HID_ERR_TRACE("Exit ENOTTY\n");
		goto exit;
		break;
	}

 exit:
	return ret;
}


/*
 * read() file operation for a zero-copy channel: harvest completions.
 *
 * 'count' must be a whole number of tilepci_xfer_comp_t records.  Each
 * available completion is converted into one record in the queue's
 * staging array (q->rd_xfer_comps), its command is returned to the free
 * list, and the records are then copied to 'buf'.
 *
 * For a blocking file the call waits for completions until
 * q->ncmd_min_read records have been gathered; after that, and always
 * for O_NONBLOCK files, it only takes what is immediately available.
 *
 * At most q->ncmd_wanted records are returned per call, so a short read
 * is possible.
 *
 * Returns the number of bytes stored in 'buf', or a negative errno:
 * -EFAULT (bad buffer), -EINVAL (count not a multiple of the record
 * size), -ERESTARTSYS (interrupted), -ENXIO (queue poisoned by a chip
 * reset), -ENOBUFS (queue currently sized to zero commands), or the
 * error from tlr_zc_cmd_q_get_comp() when not even one completion could
 * be obtained.
 */
static ssize_t tlr_zc_read(struct file *filp, char __user *buf, size_t count,
			   loff_t *f_pos)
{
	ssize_t			 ret;
	struct tlr_zc_cmd_q	*q;
	int			 err;
	int			 i;
	int			 have_mutex;
	u32		 nxfer;
	tilepci_xfer_comp_t	*xfer;
	struct tlr_zc_cmd	*cmd;
	int			 wait;

	ret = 0;
	q = filp->private_data;
	have_mutex = FALSE;
	wait = (filp->f_flags & O_NONBLOCK) == 0;

	HID_FOP_TRACE("Entered ZC channel %d]: wait %d\n",
		      q->chan, wait);

	HID_FOP_TRACE("q[%p]: type %d state %d chan %d\n",
		      q, q->type, q->state, q->chan);
	if (count == 0) {
		ret = 0;
		HID_EX_TRACE("Exit\n");
		goto exit;
	}

	if (!access_ok(VERIFY_WRITE, buf, count)) {
		ret = -EFAULT;
		HID_ERR_TRACE("Exit EFAULT\n");
		goto exit;
	}

	nxfer = count / sizeof(tilepci_xfer_comp_t);

	if ((nxfer * sizeof(tilepci_xfer_comp_t)) != count) {
		ret = -EINVAL;
		HID_ERR_TRACE("Exit EINVAL\n");
		goto exit;
	}

	/* Grab the command queue read lock. */
	err = down_interruptible(&q->rd_xfer_mutex);
	if (err != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		ret = -ERESTARTSYS;
		goto exit;
	}
	have_mutex = TRUE;

	/* Abandon this operation if the queue is poisoned due to reset. */
	if (q->chip_reset_poison) {
		ret = -ENXIO;
		HID_ERR_TRACE("Exit ENXIO\n");
		goto exit;
	}

	/*
	 * Bound the number of records gathered per call.  The loop index
	 * below is used directly to index q->rd_xfer_comps[], and without
	 * a bound it is limited only by the user-supplied 'count': with a
	 * concurrent writer re-posting freed commands, completions can
	 * keep arriving for as long as the loop runs.  write() applies
	 * the same q->ncmd_wanted bound to its staging array.
	 * NOTE(review): assumes rd_xfer_comps holds at least ncmd_wanted
	 * entries - confirm against __tlr_zc_cmd_q_alloc_xfers().
	 * The check is made under rd_xfer_mutex because the SET_NCMD path
	 * holds that mutex while changing ncmd_wanted.
	 */
	if (q->ncmd_wanted == 0) {
		ret = -ENOBUFS;
		HID_ERR_TRACE("Exit ENOBUFS\n");
		goto exit;
	}
	if (nxfer > q->ncmd_wanted)
		nxfer = q->ncmd_wanted;

	for (i = 0; i < nxfer; i++) {
		xfer = &q->rd_xfer_comps[i];

		/* Once the minimum batch is in hand, stop blocking. */
		if (i >= q->ncmd_min_read)
			wait = FALSE;

		err = tlr_zc_cmd_q_get_comp(q, wait, nxfer - i, &cmd);
		if (err != 0) {
			if (i == 0) {
				/* Nothing gathered: report the error. */
				ret = err;
				HID_ERR_TRACE("Exit %d\n", (int)ret);
				goto exit;
			} else {
				/* Return what has been gathered so far. */
				ret = 0;
				break;
			}
		}

		/* tlr_inv_buffer(tlr, cmd->dma_addr, cmd->comp_len); */

		xfer->addr = cmd->usr_addr;
		xfer->len = cmd->comp_len;
		xfer->cookie = cmd->cookie;
		xfer->flags = cmd->flags;
		tlr_zc_cmd_q_put_free(q, cmd);
	}

	ret = i * sizeof(*xfer);

	err = copy_to_user(buf, q->rd_xfer_comps, ret);
	if (err != 0) {
		HID_ERR_TRACE("Exit EFAULT\n");
		ret = -EFAULT;
		goto exit;

	}

	HID_EX_TRACE("Exit %d\n", (int)ret);

 exit:
	if (have_mutex) {
		up(&q->rd_xfer_mutex);
		have_mutex = FALSE;
	}

	return ret;
}


/*
 * write() file operation for a zero-copy channel: post transfer
 * requests.
 *
 * 'buf' holds an array of tilepci_xfer_req_t records and 'count' must be
 * a whole number of them, no more than q->ncmd_wanted.  The records are
 * copied into the queue's staging array, each user buffer is checked
 * with tlr_xfer_validate(), and each request is then bound to a free
 * command and converted with tlr_xfer_cvt().  The prepared commands are
 * moved to the pending list and posted.
 *
 * If free commands run out with -EAGAIN after at least one request has
 * been prepared, the prepared requests are still posted and a short
 * count is returned.
 *
 * Returns the number of bytes consumed from 'buf', or a negative errno:
 * -EINVAL (count not a multiple of the record size), -ENOBUFS (more
 * requests than the queue is sized for), -ERESTARTSYS (interrupted),
 * -ENXIO (queue poisoned by a chip reset), -EFAULT (bad request array or
 * bad user buffer), or the error from tlr_zc_cmd_q_get_free() /
 * tlr_xfer_cvt().
 */
static ssize_t tlr_zc_write(struct file *filp, const char __user *buf,
			    size_t count, loff_t *f_pos)
{
	ssize_t			 ret;
	int			 err;
	u32		 i;
	int			 have_mutex;
	struct tlr_zc_cmd_q	*q;
	u32		 nxfer;
	int			 valid;
	tilepci_xfer_req_t	*xfer;
	struct tlr_zc_cmd	*cmd;
	int			 wait;
	struct tlr_list		 tmp_cmds;

	ret = 0;
	q = filp->private_data;
	have_mutex = FALSE;
	wait = (filp->f_flags & O_NONBLOCK) == 0;
	tlr_list_init(&tmp_cmds);

	HID_FOP_TRACE("Entered ZC channel %d: wait %d\n",
		      q->chan, wait);

	if (count == 0) {
		ret = 0;
		HID_EX_TRACE("Exit\n");
		goto exit;
	}

	nxfer = count / sizeof(tilepci_xfer_req_t);

	if ((nxfer * sizeof(tilepci_xfer_req_t)) != count) {
		ret = -EINVAL;
		HID_ERR_TRACE("Exit EINVAL\n");
		goto exit;
	}

	/* Grab the command queue write lock. */
	err = down_interruptible(&q->wr_xfer_mutex);
	if (err != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		ret = -ERESTARTSYS;
		goto exit;
	}
	have_mutex = TRUE;

	/*
	 * Check the request count against the queue size only now that
	 * the write mutex is held: the SET_NCMD path changes
	 * q->ncmd_wanted while holding this mutex, so an earlier check
	 * could be stale by the time the staging array is filled.
	 */
	if (nxfer > q->ncmd_wanted) {
		ret = -ENOBUFS;
		HID_ERR_TRACE("Exit ENOBUF\n");
		goto exit;
	}

	/* Abandon this operation if the queue is poisoned due to reset. */
	if (q->chip_reset_poison) {
		ret = -ENXIO;
		HID_ERR_TRACE("Exit ENXIO\n");
		goto exit;
	}

	/*
	 * copy_from_user() returns the number of bytes it could NOT copy,
	 * never a negative value, so any non-zero result is a fault.
	 */
	err = copy_from_user(q->wr_xfer_reqs, buf, count);
	if (err != 0) {
		ret = -EFAULT;
		HID_ERR_TRACE("Exit EFAULT\n");
		goto exit;
	}

	for (i = 0; i < nxfer; i++) {
		xfer = &q->wr_xfer_reqs[i];
		valid = tlr_xfer_validate(q, xfer);
		if (!valid) {
			ret = -EFAULT;
			HID_ERR_TRACE("Exit  EFAULT\n");
			goto exit;
		}
	}

	/*
	 * At this point, the xfers are validated, so everything should
	 * proceed without static problems. Dynamic problems may still occur
	 * i.e. the user could still interrupt, etc
	 */

	for (i = 0; i < nxfer; i++) {
		xfer = &q->wr_xfer_reqs[i];

		err = tlr_zc_cmd_q_get_free(q, wait, &cmd);
		if (err != 0) {
			if ((i != 0) && (err == -EAGAIN)) {
				/* Post what we have; report a short write. */
				break;
			} else {
				ret = err;
				goto exit;
			}
		}

		/*
		 * Park the command on the local list first so the exit
		 * path hands it back if the conversion fails.
		 */
		__tlr_list_add_zc_cmd(&tmp_cmds, cmd);

		err = tlr_xfer_cvt(q, xfer, cmd);
		if (err != 0) {
			ret = err;
			goto exit;
		}
	}

	tlr_zc_cmd_q_pend_list(q, &tmp_cmds);

	/* ...and then post the buffers, adding SoC if needed. */
	post_cmds(q);
	ret = i * sizeof(tilepci_xfer_req_t);

	HID_EX_TRACE("Exit  ret %d\n", (int)ret);

 exit:
	/* Hand back any commands still parked on the local list. */
	tlr_zc_cmd_q_free_list(q, &tmp_cmds);
	if (have_mutex) {
		up(&q->wr_xfer_mutex);
		have_mutex = FALSE;
	}

	return ret;
}

/*
 * poll() file operation for a zero-copy channel.
 *
 * Registers both of the queue's wait queues with the poll table and
 * reports:
 *   POLLIN  | POLLRDNORM  when the completion list is not empty,
 *   POLLOUT | POLLWRNORM  when the free-command list is not empty,
 *   POLLERR               when the queue is poisoned by a chip reset.
 *
 * The return value is an event mask, not an errno.  If taking either
 * semaphore is interrupted by a signal, an empty mask is returned rather
 * than a negative error code (which, as an unsigned mask, would set
 * nearly every event bit).
 */
static unsigned int tlr_zc_poll(struct file *filp, poll_table *table)
{
	unsigned int		 ret;
	int			 err;
	struct tlr_zc_cmd_q	*q;
	int			 have_rd_mutex;
	int			 have_wr_mutex;

	HID_FOP_TRACE("Entered\n");

	ret = 0;
	q = filp->private_data;
	have_rd_mutex = FALSE;
	have_wr_mutex = FALSE;

	/* Add wait queues to the poll table; we don't actually wait here. */
	poll_wait(filp, &q->free_queue, table);
	poll_wait(filp, &q->comp_queue, table);

	/*
	 * Grab both the read and write semaphores so that this operation is
	 * ordered with respect to any other processes that may be reading
	 * or writing.  If we are interrupted while waiting, report "no
	 * events" (ret is still 0).
	 */
	err = down_interruptible(&q->rd_xfer_mutex);
	if (err != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		goto exit;
	}
	have_rd_mutex = TRUE;

	err = down_interruptible(&q->wr_xfer_mutex);
	if (err != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		goto exit;
	}
	have_wr_mutex = TRUE;

	/* Abandon this operation if the queue is poisoned due to reset. */
	if (q->chip_reset_poison) {
		ret = POLLERR;
		HID_ERR_TRACE("Exit  ENXIO\n");
		goto exit;
	}

	if (!tlr_list_empty(&q->comp_q, &q->lock))
		ret |= (POLLIN | POLLRDNORM);

	if (!tlr_list_empty(&q->free_q, &q->lock))
		ret |= (POLLOUT | POLLWRNORM);


 exit:
	if (have_wr_mutex) {
		up(&q->wr_xfer_mutex);
		have_wr_mutex = FALSE;
	}

	if (have_rd_mutex) {
		up(&q->rd_xfer_mutex);
		have_rd_mutex = FALSE;
	}

	HID_EX_TRACE("Exit\n");
	return ret;
}

/*
 * release() file operation for a zero-copy channel.
 *
 * Drops one open reference.  When the last reference goes away the
 * channel is reset, completions are set to be discarded, the queue is
 * resized to TLR_ZC_CMD_Q_NCMD_INIT and marked not-ready so that the
 * next first open re-initializes it.
 *
 * Both transfer semaphores are held (uninterruptibly) for the duration.
 * Returns 0, or the result of the final resize on last close.
 */
static int tlr_zc_release(struct inode *inode, struct file *filp)
{
	struct tlr_zc_cmd_q	*q = filp->private_data;
	int			 status = 0;

	HID_FOP_TRACE("Entered ZC channel %d\n", q->chan);

	/* Grab the queue read and write locks. */
	down(&q->rd_xfer_mutex);
	down(&q->wr_xfer_mutex);

	q->open_count--;
	if (q->open_count == 0) {
		/*
		 * Reset the channel and begin discarding completions.
		 * Both helpers reject a queue that is in the wrong state,
		 * so their results are intentionally not checked here.
		 */
		tlr_zc_cmd_q_start_reset(q);
		tlr_zc_cmd_q_enable_reset_discard(q);

		/*
		 * The resize below refuses to run on a poisoned queue, so
		 * the poison flag has to be cleared first.
		 */
		q->chip_reset_poison = 0;

		status = __tlr_zc_cmd_q_ncmd_set(q, TLR_ZC_CMD_Q_NCMD_INIT);

		/*
		 * Mark the queue not-ready so the next initial open runs
		 * tlr_zc_cmd_q_init() again.
		 */
		q->is_ready = FALSE;

		wmb();  /* ensure memory visibility */
	}

	HID_EX_TRACE("Exit OK\n");

	up(&q->wr_xfer_mutex);
	up(&q->rd_xfer_mutex);

	return status;
}


#if defined(TILEPCI_ENDP)
/*
 * Ask the hypervisor whether the link behind channel 'chan' is down.
 *
 * Reads one byte from the channel-control device at
 * PCIE_CHANNEL_CTL_LINK_DOWN_OFF(chan) and returns it.  The byte starts
 * out as 1, so a read that fails without writing to it is reported as
 * "link down" after the error is logged.
 */
int link_is_down(struct tlr_pcie_dev *tlr, int chan)
{
	u8 link_down = 1;
	int nread;

	nread = hv_dev_pread(tlr->hv_channel_ctl_fd, 0,
			     (HV_VirtAddr)&link_down, sizeof(link_down),
			     PCIE_CHANNEL_CTL_LINK_DOWN_OFF(chan));

	if (nread != sizeof(link_down))
		ERR("Failed to read PCIE_CHANNEL_CTL_LINK_DOWN_OFF(%d)\n",
		    chan);

	return (int)link_down;
}
#endif


/*
 * File operations for an opened zero-copy channel.  tlr_zc_open()
 * installs this table on the file, so there is no .open entry here.
 * The ioctl entry point is registered as .unlocked_ioctl when
 * HAVE_UNLOCKED_IOCTL is defined and as the legacy .ioctl otherwise.
 */
static struct file_operations tlr_zc_ops = {
	.owner = THIS_MODULE,
	.read = tlr_zc_read,
	.write = tlr_zc_write,
	.poll = tlr_zc_poll,
	.release = tlr_zc_release,
#ifdef HAVE_UNLOCKED_IOCTL
	.unlocked_ioctl = tlr_zc_ioctl,
#else
	.ioctl = tlr_zc_ioctl,
#endif
	.mmap = tlr_zc_mmap,
};

int tlr_zc_open(struct tlr_zc_state *zc_state, struct file *filp, int chan)
{
	int			 ret;
	int			 err;
	struct tlr_pcie_dev	*tlr;
	struct tlr_zc_cmd_q	*q;
	int			 have_rd_mutex;
	int			 have_wr_mutex;

	/* Set the private data to point at our queue. */
	ret = 0;
	have_rd_mutex = FALSE;
	have_wr_mutex = FALSE;
	tlr = zc_state->tlr;
	q = zc_state->cmd_queues[chan];
	if (q == NULL) {
		ERR("NULL cmd_q during open: pci channel = %d\n", chan);
		ret = -ENXIO;
		HID_ERR_TRACE("Exit ENXIO\n");
		goto exit;
	}
	filp->private_data = q;

	/* Use the zero copy read, write, etc. */
	filp->f_op = &tlr_zc_ops;

	HID_FOP_TRACE("Enter: pci channel = %d\n", chan);

	if (!tlr_is_ready(tlr)) {
		ret = -ENXIO;
		HID_ERR_TRACE("Exit ENXIO - chip isn't up\n");
		goto exit;
	}

#if defined(TILEPCI_ENDP)
	if (chan >= PCIE_FIRST_C2C_SEND_CHANNEL) {
		if (link_is_down(tlr, chan)) {
			ret = -ENXIO;
			HID_ERR_TRACE("Exit ENXIO - link is down\n");
			goto exit;
		}
	}
#endif

	/* Grab the queue read lock. */
	err = down_interruptible(&q->rd_xfer_mutex);
	if (err != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		ret = -ERESTARTSYS;
		goto exit;
	}
	have_rd_mutex = TRUE;

	/* Grab the queue write lock. */
	err = down_interruptible(&q->wr_xfer_mutex);
	if (err != 0) {
		HID_ERR_TRACE("Exit ERESTARTSYS\n");
		ret = -ERESTARTSYS;
		goto exit;
	}
	have_wr_mutex = TRUE;

	/*
	 * If 'chip_reset_poison', some other file handle is
	 * still open on a dead queue; don't allow any more
	 * opens until that one goes away.
	 */
	if (q->chip_reset_poison) {
		ret = -ENXIO;
		HID_ERR_TRACE("Exit ENXIO - chip_reset_poison\n");
		goto exit;
	}

	if (!q->is_ready) {
		/*
		 * make sure the queue is actually initialized;
		 * we may have reset the chip and lost our connection.
		 */
		ret = tlr_zc_cmd_q_init(q);
		if (ret != 0) {
			HID_ERR_TRACE("Exit, init failed\n");
			goto exit;
		}
	}

	/*
	 * If this device is opened via the direct-to-HV interface,
	 * don't allow this interface to be opened.
	 */
	if (q->direct_hv_count > 0) {
		ret = -EBUSY;
		goto exit;
	}

	if (++q->open_count == 1) {
		/*
		 * Wait for the channel to reach the 'need start of
		 * connection' state.
		 */
		ret = tlr_zc_cmd_q_wait_for_need_soc(q);
		if (ret != 0) {
			HID_ERR_TRACE("Exit, need soc\n");
			goto exit;
		}

		ret = __tlr_zc_cmd_q_ncmd_set(q, TLR_ZC_CMD_Q_NCMD_OPEN);
		if (ret != 0) {
			HID_ERR_TRACE("Exit, ncmd_set()\n");
			goto exit;
		}

		q->nreq_cur = 0;
		q->ncomp_cur = 0;
		q->ncomp_bytes_cur = 0;
		q->nreq_bytes_cur = 0;
	}

	HID_EX_TRACE("Exit ret %d\n", ret);

 exit:
	if (have_wr_mutex) {
		up(&q->wr_xfer_mutex);
		have_wr_mutex = FALSE;
	}
	if (have_rd_mutex) {
		up(&q->rd_xfer_mutex);
		have_rd_mutex = FALSE;
	}

	return ret;
}


/*
 * Helper function for releasing some or all of the zero-copy semaphores.
 * 'count' specifies the number of command queues to be released; this will
 * unlock queues 0 through count - 1.
 */
void release_zc_semaphores(struct tlr_zc_state *zc_state, int count)
{
	int i;
	for (i = count - 1; i >= 0; i--) {
		struct tlr_zc_cmd_q *q = zc_state->cmd_queues[i];
		if (q) {
			up(&q->wr_xfer_mutex);
			up(&q->rd_xfer_mutex);
		}
	}
}


/* Helper function for grabbing all the zero-copy semaphores. */
int grab_all_zc_semaphores(struct tlr_zc_state *zc_state)
{
	int		 i;
	int		 ret;

	ret = 0;
	for (i = 0; i < PCIE_CHANNELS; i++) {
		struct tlr_zc_cmd_q *q = zc_state->cmd_queues[i];
		if (q) {
			if (down_interruptible(&q->rd_xfer_mutex)) {
				ret = -ERESTARTSYS;
				goto err_exit;
			}

			if (down_interruptible(&q->wr_xfer_mutex)) {
				up(&q->rd_xfer_mutex);
				ret = -ERESTARTSYS;
				goto err_exit;
			}
		}
	}
	return ret;

 err_exit:
	release_zc_semaphores(zc_state, i);
	return ret;
}


/*
 * One-time initialization of a device's zero-copy state.
 *
 * Allocates the pool of zero-copy commands
 * (PCIE_CMD_QUEUE_ENTRIES - TLR_PUBLIC_NCMD of them), initializes each
 * against the device-wide free list, and then allocates and starts a
 * command queue for every channel that falls in one of the zero-copy,
 * chip-to-chip or DMA channel ranges.  Channels outside those ranges
 * get no queue (NULL entry).
 *
 * Returns 0 on success or a negative errno; on failure the queue
 * structures and the command pool allocated here are freed again.
 * NOTE(review): queues that tlr_zc_cmd_q_startup() already set up are
 * only kfree()d on failure - confirm startup holds no other resources.
 */
int tlr_zc_init(struct tlr_pcie_dev *tlr)
{
	struct tlr_zc_state *zc_state = &tlr->zc_state;
	struct tlr_zc_cmd	*cmds;
	struct tlr_zc_cmd	*cmd;
	int		 ncmds;
	int		 cmds_sz;
	int err;
	int i;

	tlr_list_init(&zc_state->cmd_q_free_list);
	spin_lock_init(&zc_state->cmd_q_lock);
	zc_state->tlr = tlr;

	/*
	 * Start with an all-NULL queue table.  The failure path below
	 * walks every slot and frees it, so slots that were never reached
	 * must not hold stale values.
	 */
	for (i = 0; i < PCIE_CHANNELS; i++)
		zc_state->cmd_queues[i] = NULL;

	/* Allocate zero-copy commands. */
	ncmds = (PCIE_CMD_QUEUE_ENTRIES - TLR_PUBLIC_NCMD);
	cmds_sz = ncmds * sizeof(*cmds);
	cmds = tlr_alloc_mem(cmds_sz);
	if (cmds == NULL) {
		err = -ENOMEM;
		goto cmds_alloc_failed;
	}
	zc_state->ncmd = ncmds;
	zc_state->cmds_sz = cmds_sz;
	zc_state->cmds = cmds;
	memset(cmds, 0, cmds_sz);

	/* Initialize the new commands. */
	for (i = 0; i < ncmds; i++) {
		cmd = &cmds[i];

		tlr_zc_cmd_init(cmd, NULL, tlr,
				TLR_ZC_CMD_UNKNOWN, TLR_COOKIE_UNKNOWN,
				&zc_state->cmd_q_free_list,
				&zc_state->cmd_q_lock);
	}

	/* Initialize the command queues. */
	for (i = 0; i < PCIE_CHANNELS; i++) {
		struct tlr_zc_cmd_q *q;
		enum tlr_zc_cmd_type_e cmd_type;

		if ((i >= FIRST_ZC_H2T_CHAN) &&
		    (i <= LAST_ZC_H2T_CHAN)) {
			cmd_type = TLR_ZC_CMD_H2T;
		} else if ((i >= FIRST_ZC_T2H_CHAN) &&
			 (i <= LAST_ZC_T2H_CHAN)) {
			cmd_type = TLR_ZC_CMD_T2H;
		} else if ((i >= PCIE_FIRST_C2C_SEND_CHANNEL) &&
			 (i <= LAST_C2C_SEND_CHAN)) {
			cmd_type = TLR_ZC_CMD_C2C_SEND;
		} else if ((i >= PCIE_FIRST_C2C_RECV_CHANNEL) &&
			 (i <= LAST_C2C_RECV_CHAN)) {
			cmd_type = TLR_ZC_CMD_C2C_RECV;
		} else if ((i >= PCIE_FIRST_DMA_READ_CHANNEL) &&
			 (i <= LAST_DMA_READ_CHAN)) {
			cmd_type = TLR_DMA_READ;
		} else if ((i >= PCIE_FIRST_DMA_WRITE_CHANNEL) &&
			 (i <= LAST_DMA_WRITE_CHAN)) {
			cmd_type = TLR_DMA_WRITE;
		} else {
			/* Not a zero-copy channel; the slot stays NULL. */
			continue;
		}

		/* We want a zero-copy Q for this channel, alloc and init. */
		q = kmalloc(sizeof(*q), GFP_KERNEL);
		if (q == NULL) {
			err = -ENOMEM;
			goto q_alloc_failed;
		}
		zc_state->cmd_queues[i] = q;
		err = tlr_zc_cmd_q_startup(q, cmd_type, tlr, i);
		if (err)
			goto q_alloc_failed;
	}

	return 0;

 q_alloc_failed:
	/* kfree(NULL) is a no-op, so unused slots need no special case. */
	for (i = 0; i < PCIE_CHANNELS; i++) {
		kfree(zc_state->cmd_queues[i]);
		zc_state->cmd_queues[i] = NULL;
	}

	if (zc_state->cmds != NULL) {
		tlr_free_mem(zc_state->cmds, zc_state->cmds_sz);
		zc_state->cmds = NULL;
	}

	/*
	 * The command pool is gone; empty the free list so it does not
	 * keep referring to it.
	 */
	tlr_list_init(&zc_state->cmd_q_free_list);
 cmds_alloc_failed:
	return err;
}


/*
 * Reset all write (and read?) streams so that they contain no data.
 * Also, poison any open file handles so that the holders must close
 * and open a new session before getting more data.
 *
 * This method must be called while the following are true:
 *  - the caller holds all read and write mutexes
 *  - the worker thread has been descheduled
 *  - is_ready = 0 so that no interrupts will be processed
 *  - the chip has been reset but not booted
 */
void tlr_zc_chip_reset(struct tlr_zc_state *zc_state)
{
	int i;
	for (i = 0; i < PCIE_CHANNELS; i++) {
		struct tlr_zc_cmd_q *q = zc_state->cmd_queues[i];
		if (q) {
			if (q->open_count > 0)
				q->chip_reset_poison = 1;
			tlr_zc_cmd_q_free(q);
		}

	}
}

/*
 * Release the memory owned by a zero-copy state object: the 'cmds'
 * allocation and every per-channel command queue structure.
 *
 * Each pointer is cleared once its memory has been released, matching
 * the allocation error path, so that a stale zc_state never exposes a
 * dangling pointer and a repeated call does not free anything twice.
 */
void tlr_zc_free(struct tlr_zc_state *zc_state)
{
	int i;

	if (zc_state->cmds != NULL) {
		tlr_free_mem(zc_state->cmds, zc_state->cmds_sz);
		zc_state->cmds = NULL;
	}

	for (i = 0; i < PCIE_CHANNELS; i++) {
		/* kfree(NULL) is a no-op, so unused channels need no check. */
		kfree(zc_state->cmd_queues[i]);
		zc_state->cmd_queues[i] = NULL;
	}
}


/***********************************************************************
 *                              Debug Support                          *
 ***********************************************************************/

/*
 * seq_file 'next' callback: advance *pos to the next channel that has
 * a zero-copy command queue and return that queue.  Returns NULL once
 * *pos has moved past the last channel.
 */
static void *tlr_zc_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct tlr_zc_state *zc_state = (struct tlr_zc_state *)s->private;
	struct tlr_zc_cmd_q *candidate;

	/* Step past the current entry, then skip channels with no queue. */
	for ((*pos)++; (*pos) < PCIE_CHANNELS; (*pos)++) {
		candidate = zc_state->cmd_queues[*pos];
		if (candidate != NULL)
			return candidate;
	}

	/* We've reached the end of the array, so we're done. */
	return NULL;
}

/*
 * seq_file 'start' callback: return the queue at channel *pos, or the
 * first queue after it if that channel has none.  Returns NULL when
 * *pos is already beyond the last channel or no further queue exists.
 */
static void *tlr_zc_seq_start(struct seq_file *s, loff_t *pos)
{
	struct tlr_zc_state *zc_state = (struct tlr_zc_state *)s->private;
	struct tlr_zc_cmd_q *first;

	if (*pos >= PCIE_CHANNELS)
		return NULL;

	first = zc_state->cmd_queues[*pos];
	if (first != NULL)
		return first;

	/* No queue on this channel: 'fast-forward' to a non-NULL queue. */
	return tlr_zc_seq_next(s, NULL, pos);
}

/*
 * seq_file 'stop' callback.  Intentionally empty: tlr_zc_seq_start()
 * and tlr_zc_seq_next() take no locks and allocate nothing, so there
 * is nothing to undo when an iteration pass ends.
 */
static void tlr_zc_seq_stop(struct seq_file *s, void *v)
{
}

static int tlr_zc_seq_show(struct seq_file *s, void *v)
{
	struct tlr_zc_cmd_q *q = (struct tlr_zc_cmd_q *)v;

	u32		 free_q_len;
	u32		 pend_q_len;
	u32		 post_q_len;
	u32		 comp_q_len;
	u32		 free_q_waiting;
	u32		 comp_q_waiting;
	u32		 reset_drain_q_waiting;
	char prefix[64];
	char *type_prefix;
	int type_index;

	free_q_len = tlr_list_len(&q->free_q, &q->lock);
	pend_q_len = tlr_list_len(&q->pend_q, &q->lock);
	post_q_len = tlr_list_len(&q->post_q, &q->lock);
	comp_q_len = tlr_list_len(&q->comp_q, &q->lock);

	free_q_waiting = !list_empty(&q->free_queue.task_list);
	comp_q_waiting = !list_empty(&q->comp_queue.task_list);
	reset_drain_q_waiting = !list_empty(&q->reset_drain_queue.task_list);

	/* Create a prefix string for this queue. */
	if (q->type == TLR_ZC_CMD_H2T) {
		type_prefix = "h2t";
		type_index = q->chan - FIRST_ZC_H2T_CHAN;
	} else if (q->type == TLR_ZC_CMD_T2H) {
		type_prefix = "t2h";
		type_index = q->chan - FIRST_ZC_T2H_CHAN;
	} else if (q->type == TLR_ZC_CMD_C2C_SEND) {
		type_prefix = "c2c_send";
		type_index = q->chan - PCIE_FIRST_C2C_SEND_CHANNEL;
	} else {
		type_prefix = "c2c_recv";
		type_index = q->chan - PCIE_FIRST_C2C_RECV_CHANNEL;
	}

	sprintf(prefix, "%s_%d", type_prefix, type_index);

	seq_printf(s, "%s_open_count: %d\n", prefix, q->open_count);
	seq_printf(s, "%s_is_ready: %d\n", prefix, q->is_ready);
	seq_printf(s, "%s_chip_reset_poison: %d\n", prefix,
		   q->chip_reset_poison);

	seq_printf(s, "%s_free_q_len: %d\n", prefix, free_q_len);
	seq_printf(s, "%s_pend_q_len: %d\n", prefix, pend_q_len);
	seq_printf(s, "%s_post_q_len: %d\n", prefix, post_q_len);
	seq_printf(s, "%s_comp_q_len: %d\n", prefix, comp_q_len);

	seq_printf(s, "%s_have_free_q_waiters: %d\n", prefix, free_q_waiting);
	seq_printf(s, "%s_have_comp_q_waiters: %d\n", prefix, comp_q_waiting);
	seq_printf(s, "%s_have_reset_drain_q_waiters: %d\n", prefix,
		   reset_drain_q_waiting);

	seq_printf(s, "%s_ncmd_wanted: %d\n", prefix, q->ncmd_wanted);
	seq_printf(s, "%s_ncmd_have: %d\n", prefix, q->ncmd_have);
	seq_printf(s, "%s_ncmd_min_read: %d\n", prefix, q->ncmd_min_read);
	seq_printf(s, "%s_ncmd_comp_wakeup: %d\n", prefix, q->ncmd_comp_wakeup);

	seq_printf(s, "%s_nreq_tot: %d\n", prefix, q->nreq_tot);
	seq_printf(s, "%s_ncomp_tot: %d\n", prefix, q->ncomp_tot);
	seq_printf(s, "%s_ncomp_mismatch: %d\n", prefix, q->ncomp_mismatch);

	return 0;
}

/*
 * seq_file iterator callbacks for the zero-copy debug file.  The
 * iteration walks the per-channel command queue array and emits one
 * record per queue that exists.
 */
static struct seq_operations tlr_zc_seq_ops = {
	.start = tlr_zc_seq_start,
	.next = tlr_zc_seq_next,
	.stop = tlr_zc_seq_stop,
	.show = tlr_zc_seq_show,
};

/*
 * File operations for the zero-copy debug file; installed on the
 * struct file by tlr_zc_debug_open().  Reading, seeking and release
 * are all delegated to the generic seq_file helpers.
 */
static struct file_operations tlr_zc_debug_ops = {
	.owner = THIS_MODULE,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

/*
 * Turn 'file' into a seq_file-backed debug view of 'zc_state'.
 *
 * The file's operations are switched to tlr_zc_debug_ops, a seq_file
 * is opened on it, and the zero-copy state is stored as the seq_file's
 * private pointer for use by the iterator callbacks.
 *
 * Returns 0 on success, or the error code from seq_open().
 */
int tlr_zc_debug_open(struct tlr_zc_state *zc_state, struct file *file)
{
	struct seq_file *seq;
	int err;

	file->f_op = &tlr_zc_debug_ops;

	err = seq_open(file, &tlr_zc_seq_ops);
	if (err != 0)
		return err;

	/* file->private_data was initialised by seq_open */
	seq = (struct seq_file *)file->private_data;
	seq->private = zc_state;

	return 0;
}
