/*
 * Copyright 2011 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 *
 * Tilera TILE64 target-side driver
 *
 * This source code is derived from code provided in "Linux Device
 * Drivers" by Alessandro Rubini and Jonathan Corbet, published by
 * O'Reilly & Associates.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/poll.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/io.h>
#include <linux/pci.h>
#include <asm/hv_driver.h>
#include <asm/tilepci.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "tilepci_endp.h"


/*
 * Per-CPU state, with one slot per PCIe link (indexed by link index):
 *
 *  - init_read:     holds this CPU's completion queue pointer and
 *                   completion queue id for the link.
 *  - last_complete: running count of completions that this CPU's
 *                   interrupt handler has already processed for the link.
 */
static DEFINE_PER_CPU(
	struct pcie_tile_local_init_read[MAX_PCIE_LINKS_PER_CHIP],
	init_read
	);
static DEFINE_PER_CPU(u32[MAX_PCIE_LINKS_PER_CHIP], last_complete);

static const char driver_name[] = DRIVER_NAME_STRING;
/*
 * NOTE(review): presumably one device structure per PCIe link; the users
 * of this table are outside the portion of the file reviewed here.
 */
static struct tlr_pcie_dev *the_pcie_devs[MAX_PCIE_LINKS_PER_CHIP];

/*
 * Negative sentinel values for IRQ lookup results.
 * NOTE(review): the code that returns these is not visible here - confirm.
 */
#define TLR_IRQ_NOT_FOUND (-1)
#define TLR_IRQ_MISMATCH (-2)

/**********************************************************************/
/*                   Interrupt Handler and Worker Thread              */
/**********************************************************************/

/*
 * Process a single completion queue entry.
 *
 * 'last' is the running completion count for this queue; the entry
 * examined is completion_array[last % PCIE_CMD_QUEUE_ENTRIES].  The
 * completion's channel number selects what gets updated:
 *   - public tile-to-host channel: a write finished, so bump the
 *     stream's writes_completed and wake writers;
 *   - public host-to-tile channel: a read finished, so record its size,
 *     bump reads_completed and wake readers;
 *   - anything else is handed to the zero-copy completion code.
 */
static void tlr_handle_completion(struct tlr_pcie_dev *dev,
				  struct pcie_tile_completion_queue *queue,
				  u32 last)
{
	u32 slot = last % PCIE_CMD_QUEUE_ENTRIES;
	pcie_tile_completion_t *cmp;
	struct tlr_stream *stream;
	int channel;
	int size;
	u64 cpa;

	INT_TRACE("index = %d\n", slot);

	cmp = &queue->completion_array[slot];

	/* Reassemble the buffer address from its two halves (trace only). */
	cpa = (u64) cmp->buffer_addr_lo32;
	cpa |= ((u64) cmp->buffer_addr_hi16) << 32;

	INT_TRACE("CPA = %#llx, channel = %d, size = %d\n",
		  cpa, cmp->channel, cmp->size);

	channel = cmp->channel;
	size = cmp->size;

	if (channel >= FIRST_PUBLIC_T2H_CHAN &&
	    channel <= LAST_PUBLIC_T2H_CHAN) {
		/* A stream write finished: count it and wake any writers. */
		stream = tlr_get_stream(dev,
					channel - PCIE_HOST_TO_TILE_CHANNELS);
		stream->writes_completed++;
		wake_up_interruptible(&stream->write_queue);
		return;
	}

	if (channel >= FIRST_PUBLIC_H2T_CHAN &&
	    channel <= LAST_PUBLIC_H2T_CHAN) {
		/* A stream read finished: record size, count it, wake. */
		u32 done;

		stream = tlr_get_stream(dev, channel);
		done = stream->reads_completed;
		stream->read_sizes[done % BUFFERS_PER_STREAM] = size;
		wmb();  /* size must be visible before the new count */
		stream->reads_completed = done + 1;
		wake_up_interruptible(&stream->read_queue);
		return;
	}

	/* Not a public stream channel: let the zero-copy code handle it. */
	tlr_zc_cmd_q_comp(&dev->zc_state, channel, cmp, size);
}


/*
 * tlr_hv_intr - the interrupt handler
 *
 * @irq:    interrupt number (unused here).
 * @dev_id: the struct tlr_pcie_dev registered for this interrupt.
 *
 * Wakes any tasks waiting for CSR write notifications, then drains this
 * CPU's completion queue for the device's link, handing each entry to
 * tlr_handle_completion().  The per-CPU 'last_complete' counter is
 * updated to reflect the completions processed.
 *
 * Returns IRQ_HANDLED if at least one completion was processed or a CSR
 * notification was pending, IRQ_NONE otherwise.
 */
static irqreturn_t tlr_hv_intr(int irq, void *dev_id)
{
	struct tlr_pcie_dev *dev = (struct tlr_pcie_dev *) dev_id;
	int link_index = dev->link_index;

	/* Get this tile's completion queue. */
	struct pcie_tile_completion_queue *queue =
		__get_cpu_var(init_read)[link_index].completion_queue;
	u32 last   = __get_cpu_var(last_complete)[link_index];
	u32 delta = 0;
	int csr_pending = 0;

	/*
	 * Wake up tasks waiting for CSR notifications.  The posted and
	 * consumed counters are free-running and may wrap, so test for
	 * inequality (as the CSR read and poll paths do) rather than
	 * using an ordered comparison.
	 */
	if (dev->csr_write_queue->writes_posted !=
	    dev->csr_write_queue->writes_consumed) {
		wake_up_interruptible(&dev->csr_wait_queue);
		csr_pending = 1;
	}

	/*
	 * Now, handle all the ZC completions.  Interrupts have
	 * already been reenabled - this Linux generic IRQ
	 * infrastructure doesn't let us reenable during the irq
	 * handler.
	 */
	while (queue->completion_posted_count - last != 0) {
		tlr_handle_completion(dev, queue, last);
		last++;
		delta++;
	}

	INT_TRACE("Received interrupt: %u completions %d delta.\n",
		  queue->completion_posted_count, delta);

	__get_cpu_var(last_complete)[link_index] = last;

	/*
	 * Servicing a CSR notification counts as handling the interrupt,
	 * even when no completions were posted.
	 */
	if (delta || csr_pending)
		return IRQ_HANDLED;

	return IRQ_NONE;
}


/*
 * Hand read buffers [start, stop) of a stream to the iBound.
 *
 * 'start' and 'stop' are running per-stream counts; each is reduced
 * modulo BUFFERS_PER_STREAM to find the buffer.  One command per buffer
 * is appended to the shared command ring under cmd_queue_lock, and the
 * ring's posted count is published once after all entries are written.
 */
static void post_read_buffers(struct tlr_stream *stream, u32 start, u32 stop)
{
	struct tlr_pcie_dev *tlr = stream->dev;
	pcie_tile_buffer_cmd_t cmd = { 0 };
	unsigned long irq_flags;
	u32 ring_count;
	u32 n;

	/* Fields common to every command posted by this call. */
	cmd.completion_queue_id = stream->cpl_queue_id;
	cmd.channel = stream->index;
	cmd.size = BUFFER_SIZE;
	cmd.must_eop = 1;
	cmd.may_eop = 1;

	/* The command ring is shared by all tiles; serialize access. */
	spin_lock_irqsave(&tlr->cmd_queue_lock, irq_flags);
	ring_count = tlr->shm_state->buffer_cmd_posted_count;

	/*
	 * Two rings are involved: the per-stream buffer ring (indexed by
	 * n % BUFFERS_PER_STREAM) and the global command ring (indexed by
	 * ring_count % PCIE_CMD_QUEUE_ENTRIES).
	 */
	for (n = start; n != stop; n++) {
		u32 buf_slot = n % BUFFERS_PER_STREAM;
		u32 ring_slot = ring_count % PCIE_CMD_QUEUE_ENTRIES;
		HV_PhysAddr pa = __pa(stream->read_buffers[buf_slot]);

		cmd.buffer_addr_lo32 = (u32) pa;
		cmd.buffer_addr_hi16 = (u16) (pa >> 32);

		/* Only the first command after open/reset carries SoC. */
		cmd.soc = 0;
		if (stream->need_read_soc) {
			cmd.soc = 1;
			stream->need_read_soc = 0;
		}

		CMD_TRACE("READ Q= %d, channel= %d, size= %d, CPA= %#llx\n",
			  cmd.completion_queue_id,
			  cmd.channel, cmd.size, pa);

		tlr->shm_state->buffer_cmd_array[ring_slot] = cmd;
		ring_count++;
	}

	/* Commands must be visible before the new posted count is. */
	wmb();
	tlr->shm_state->buffer_cmd_posted_count = ring_count;

	spin_unlock_irqrestore(&tlr->cmd_queue_lock, irq_flags);
}


/*
 * Hand write buffers [start, stop) of a stream to the iBound.
 *
 * 'start' and 'stop' are running per-stream counts; each is reduced
 * modulo BUFFERS_PER_STREAM to find the buffer and its recorded size.
 * Commands go out on channel (stream->index + PCIE_HOST_TO_TILE_CHANNELS)
 * and are appended to the shared command ring under cmd_queue_lock.
 */
static void post_write_buffers(struct tlr_stream *stream, u32 start, u32 stop)
{
	struct tlr_pcie_dev *tlr = stream->dev;
	pcie_tile_buffer_cmd_t cmd = { 0 };
	unsigned long irq_flags;
	u32 ring_count;
	u32 n;

	/* Fields common to every command posted by this call. */
	cmd.completion_queue_id = stream->cpl_queue_id;
	cmd.channel = stream->index + PCIE_HOST_TO_TILE_CHANNELS;
	cmd.must_eop = 1;

	/* The command ring is shared by all tiles; serialize access. */
	spin_lock_irqsave(&tlr->cmd_queue_lock, irq_flags);
	ring_count = tlr->shm_state->buffer_cmd_posted_count;

	/*
	 * Two rings are involved: the per-stream buffer ring (indexed by
	 * n % BUFFERS_PER_STREAM) and the global command ring (indexed by
	 * ring_count % PCIE_CMD_QUEUE_ENTRIES).
	 */
	for (n = start; n != stop; n++) {
		u32 buf_slot = n % BUFFERS_PER_STREAM;
		u32 ring_slot = ring_count % PCIE_CMD_QUEUE_ENTRIES;
		HV_PhysAddr pa = __pa(stream->write_buffers[buf_slot]);

		cmd.buffer_addr_lo32 = (u32) pa;
		cmd.buffer_addr_hi16 = (u16) (pa >> 32);
		cmd.size = stream->write_sizes[buf_slot];

		/* Only the first command after open/reset carries SoC. */
		cmd.soc = 0;
		if (stream->need_write_soc) {
			cmd.soc = 1;
			stream->need_write_soc = 0;
		}

		CMD_TRACE("WRITE Q= %d, channel= %d, size= %d, CPA= %#llx\n",
			  cmd.completion_queue_id,
			  cmd.channel, cmd.size, pa);

		tlr->shm_state->buffer_cmd_array[ring_slot] = cmd;
		ring_count++;
	}

	/* Commands must be visible before the new posted count is. */
	wmb();
	tlr->shm_state->buffer_cmd_posted_count = ring_count;

	spin_unlock_irqrestore(&tlr->cmd_queue_lock, irq_flags);
}

/*
 * Move every pending command of a zero-copy command queue into the
 * shared command ring.
 *
 * Each command is removed from q->pend_q, accounted in the queue's
 * request statistics, copied into the global ring and then placed on
 * q->post_q.  If the queue is in state CMD_Q_ST_NEED_SOC, the first
 * pending command gets its SoC flag set and the queue becomes
 * CMD_Q_ST_CONNECTED.  Nothing happens if the pending list is empty.
 *
 * Locking: takes tlr->cmd_queue_lock, then q->lock, and releases them
 * in the reverse order.
 */
void post_cmds(struct tlr_zc_cmd_q *q)
{
	struct tlr_pcie_dev	*tlr = q->tlr;
	struct tlr_zc_cmd	*cmd;
	unsigned long		 ring_flags;
	unsigned long		 queue_flags;
	tlr_dma_addr_t		 dma_addr;
	u32			 ring_count;
	u32			 ring_slot;

	/* The command ring is shared by all tiles; serialize access. */
	spin_lock_irqsave(&tlr->cmd_queue_lock, ring_flags);
	spin_lock_irqsave(&q->lock, queue_flags);

	if (!__tlr_list_empty(&q->pend_q)) {
		/* A channel needing SoC flags it on its first command. */
		if (q->state == CMD_Q_ST_NEED_SOC) {
			struct tlr_zc_cmd *first =
				list_entry(q->pend_q.q.next,
					   struct tlr_zc_cmd, list);

			first->cmd.soc = 1;
			q->state = CMD_Q_ST_CONNECTED;
		}

		ring_count = tlr->shm_state->buffer_cmd_posted_count;

		/*
		 * Drain the pending list into the global command ring;
		 * the ring slot is (posted count % ring entries).
		 */
		do {
			cmd = __tlr_list_rem_zc_cmd(&q->pend_q);

			q->nreq_cur++;
			q->nreq_tot++;
			q->nreq_bytes_cur += cmd->post_len;
			q->nreq_bytes_tot += cmd->post_len;

			dma_addr = DMA_ADDR_GEN(cmd->cmd.buffer_addr_hi16,
						cmd->cmd.buffer_addr_lo32);
			HID_CMD_TRACE("CMD: Q= %d, "
				      "channel= %d, "
				      "size= %d, "
				      "CPA= %#llx\n",
				      cmd->cmd.completion_queue_id,
				      cmd->cmd.channel,
				      cmd->cmd.size,
				      dma_addr);

			ring_slot = ring_count % PCIE_CMD_QUEUE_ENTRIES;
			tlr->shm_state->buffer_cmd_array[ring_slot] =
				cmd->cmd;
			ring_count++;

			__tlr_list_add_zc_cmd(&q->post_q, cmd);
		} while (!__tlr_list_empty(&q->pend_q));

		/* Commands must be visible before the new posted count. */
		wmb();
		tlr->shm_state->buffer_cmd_posted_count = ring_count;
	}

	spin_unlock_irqrestore(&q->lock, queue_flags);
	spin_unlock_irqrestore(&tlr->cmd_queue_lock, ring_flags);
}

/*
 * Return the completion queue id that the calling CPU uses for the
 * device's link, as recorded in the per-CPU init_read data.
 */
int tlr_get_cpl_queue_id(struct tlr_pcie_dev *tlr)
{
	int queue_id;

	/* get_cpu_var()/put_cpu_var() bracket the per-CPU access. */
	queue_id = get_cpu_var(init_read)[tlr->link_index].completion_queue_id;
	put_cpu_var(init_read);

	return queue_id;
}


/*
 * Initialize a newly opened stream.
 *
 * Records the calling CPU's completion queue id for the link, allocates
 * one page to back all of the stream's read and write buffers, fills in
 * the buffer rings, marks both directions as needing SoC, and posts all
 * read buffers so that incoming data can be received.
 *
 * Returns 0 on success or -ENOMEM if the buffers do not fit in a single
 * page or the page cannot be allocated.
 */
static int init_stream(struct tlr_stream *stream, int link_index)
{
	int i;
	struct page *page;
	char *buffer;

	/* All completions for this stream will go to this tile's queue. */
	stream->cpl_queue_id =
		get_cpu_var(init_read)[link_index].completion_queue_id;
	put_cpu_var(init_read);

	/*
	 * FIXME: deal with parameters that require > one page.  We
	 * can fit all of our data onto a single 64k page, but the
	 * system can be configured with other page sizes.
	 */
	if (BUFFERS_PER_STREAM * BUFFER_SIZE * 2 > PAGE_SIZE) {
		ERR("Multiple pages per stream is unimplemented.\n");
		return -ENOMEM;
	}

	/*
	 * Allocate memory for data buffers and fill in the rings.  We
	 * modify the PTEs so that the memory has
	 * 'user-managed-coherence' - i.e. we must explicitly flush
	 * and/or inval to achieve coherence.
	 */
	page = homecache_alloc_page(GFP_KERNEL, PAGE_HOME_INCOHERENT);
	if (!page)
		return -ENOMEM;	/* errno values are returned negated */
	stream->buffer_page = (unsigned long) page_address(page);

	/* First half of the page holds read buffers, second half write. */
	buffer = (char *)stream->buffer_page;
	for (i = 0; i < BUFFERS_PER_STREAM; i++) {
		stream->read_buffers[i] = buffer;
		buffer += BUFFER_SIZE;
	}
	for (i = 0; i < BUFFERS_PER_STREAM; i++) {
		stream->write_buffers[i] = buffer;
		buffer += BUFFER_SIZE;
	}

	stream->need_write_soc = 1;
	stream->need_read_soc = 1;

	/* Post buffers for incoming read data. */
	post_read_buffers(stream, 0, BUFFERS_PER_STREAM);

	return 0;
}


/***********************************************************************
 *                        Character Device Routines                    *
 ***********************************************************************/

/*
 * Read handler for a character stream.
 *
 * Blocks (unless O_NONBLOCK is set) until at least one completed read
 * buffer is available, then copies as much data as fits in the caller's
 * buffer.  A buffer that is only partly consumed is remembered through
 * stream->partial_read_bytes and finished by a later call; buffers that
 * are fully consumed are re-posted for new incoming data.
 *
 * Returns the number of bytes copied, 0 if count is 0, -EAGAIN if no
 * data is ready in non-blocking mode, -ERESTARTSYS if interrupted while
 * waiting, or -EFAULT if the very first copy to user space fails.
 */
static ssize_t tlr_cdev_read(struct file *filp, char __user *buf, size_t count,
			     loff_t *f_pos)
{
	struct tlr_stream *stream = filp->private_data;
	size_t already_read;
	u32 reads_consumed;
	size_t bytes_read;

	FOP_TRACE("Entered tlr_cdev_read\n");

	if (count == 0)
		return 0;

	/* Grab the stream read lock. */
	if (down_interruptible(&stream->read_mutex)) {
		EX_TRACE("Exit tlr_cdev_read -ERESTARTSYS\n");
		return -ERESTARTSYS;
	}

	/* Wait for data to appear in the read FIFO. */
	while (stream->reads_completed == stream->reads_consumed) {
		/* Never sleep while holding the read lock. */
		up(&stream->read_mutex);
		if (filp->f_flags & O_NONBLOCK) {
			EX_TRACE("Exit tlr_cdev_read -EAGAIN\n");
			return -EAGAIN;
		}

		/* Wait for the worker loop to put some data into the FIFO. */
		FOP_TRACE("Waiting on read_queue\n");
		if (wait_event_interruptible(stream->read_queue,
					     (stream->reads_completed !=
					      stream->reads_consumed))) {
			EX_TRACE("Exit tlr_cdev_read -ERESTARTSYS\n");
			return -ERESTARTSYS;
		}
		FOP_TRACE("Woke from read_queue\n");

		/*
		 * Get the read lock again; the loop condition is then
		 * re-checked because another reader may have consumed
		 * the data in the meantime.
		 */
		if (down_interruptible(&stream->read_mutex)) {
			EX_TRACE("Exit tlr_cdev_read -ERESTARTSYS\n");
			return -ERESTARTSYS;
		}
	}

	/*
	 * At this point we hold the read mutex and we know that there
	 * is at least one buffer of data available.  Copy as many
	 * buffers as possible to userspace.
	 */
	already_read = stream->partial_read_bytes;
	bytes_read = 0;
	for (reads_consumed = stream->reads_consumed;
	     reads_consumed != stream->reads_completed;
	     reads_consumed++) {
		u32 index = reads_consumed % BUFFERS_PER_STREAM;
		size_t buf_remaining =
			stream->read_sizes[index] - already_read;
		size_t to_copy = min(count, buf_remaining);
		int err;

		err = copy_to_user(buf + bytes_read,
				   stream->read_buffers[index] + already_read,
				   to_copy);
		/* The range is invalidated whether or not the copy worked. */
		inv_buffer(stream->read_buffers[index] + already_read,
			   to_copy);
		if (err) {
			/*
			 * Report the fault only if nothing was delivered;
			 * otherwise return the bytes copied so far.
			 */
			if (bytes_read > 0)
				break;
			else {
				up(&stream->read_mutex);
				return -EFAULT;
			}
		}
		bytes_read += to_copy;
		count -= to_copy;

		if (to_copy == buf_remaining) {
			/*
			 * We've completely drained that buffer (it was
			 * invalidated above); move on to the next one.
			 */
			already_read = 0;
		} else {
			/* User only asked for part of the buffer. */
			already_read += to_copy;
			break;
		}
	}
	stream->partial_read_bytes = already_read;

	/* Re-post any buffers that we completely consumed. */
	post_read_buffers(stream, stream->reads_consumed, reads_consumed);
	stream->reads_consumed = reads_consumed;

	up(&stream->read_mutex);
	EX_TRACE("Exit tlr_cdev_read %d\n", (int) bytes_read);
	return bytes_read;
}


/*
 * Write handler for a character stream.
 *
 * Blocks (unless O_NONBLOCK is set) until fewer than BUFFERS_PER_STREAM
 * writes are outstanding, then copies the caller's data into as many
 * free write buffers as are available (at most BUFFER_SIZE bytes each)
 * and posts them.
 *
 * Returns the number of bytes accepted, 0 if count is 0, -EAGAIN if no
 * buffer is free in non-blocking mode, -ERESTARTSYS if interrupted while
 * waiting, or -EFAULT if the very first copy from user space fails.
 */
static ssize_t tlr_cdev_write(struct file *filp, const char __user *buf,
			      size_t count, loff_t *f_pos)
{
	struct tlr_stream *stream = filp->private_data;
	size_t written;
	u32 writes_posted;

	FOP_TRACE("Entered tlr_cdev_write\n");

	if (count == 0) {
		EX_TRACE("Exit tlr_cdev_write\n");
		return 0;
	}

	/* Grab the stream write lock. */
	if (down_interruptible(&stream->write_mutex)) {
		EX_TRACE("Exit tlr_cdev_write\n");
		return -ERESTARTSYS;
	}

	/*
	 * Wait until at least one write buffer is free, i.e. fewer than
	 * BUFFERS_PER_STREAM posted writes are still awaiting completion.
	 */
	while ((stream->writes_posted - stream->writes_completed)
	       >= BUFFERS_PER_STREAM) {
		/* Never sleep while holding the write lock. */
		up(&stream->write_mutex);
		if (filp->f_flags & O_NONBLOCK) {
			EX_TRACE("Exit tlr_cdev_write -EAGAIN\n");
			return -EAGAIN;
		}

		/*
		 * Wait for the worker loop to indicate that we're ready
		 * for a new buffer.
		 */
		FOP_TRACE("Waiting on write_queue\n");
		if (wait_event_interruptible(stream->write_queue,
					     ((stream->writes_posted -
					       stream->writes_completed) <
					      BUFFERS_PER_STREAM))) {
			EX_TRACE("Exit tlr_cdev_write -ERESTARTSYS\n");
			return -ERESTARTSYS;
		}
		FOP_TRACE("Woke from write_queue\n");

		/* Get the write lock again. */
		if (down_interruptible(&stream->write_mutex)) {
			EX_TRACE("Exit tlr_cdev_write -ERESTARTSYS\n");
			return -ERESTARTSYS;
		}
	}

	/*
	 * At this point we hold the write mutex and we know that
	 * there is at least one write buffer available.  Copy as much
	 * data as possible into buffers...
	 */
	written = 0;
	for (writes_posted = stream->writes_posted;
	     writes_posted - stream->writes_completed < BUFFERS_PER_STREAM;
	     writes_posted++) {
		u32 index = writes_posted % BUFFERS_PER_STREAM;
		size_t size = min(count, (size_t)BUFFER_SIZE);

		/* All of the caller's data has been consumed. */
		if (size == 0)
			break;

		stream->write_sizes[index] = size;
		if (copy_from_user(stream->write_buffers[index],
				   buf + written, size)) {
			/*
			 * Report the fault only if nothing was accepted;
			 * otherwise post what was copied so far.
			 */
			if (written > 0)
				break;
			else {
				up(&stream->write_mutex);
				return -EFAULT;
			}
		}
		flush_buffer(stream->write_buffers[index], size);
		written += size;
		count -= size;

	}

	/* ...and then post the buffers. */
	post_write_buffers(stream, stream->writes_posted, writes_posted);
	stream->writes_posted = writes_posted;

	up(&stream->write_mutex);
	EX_TRACE("Exit tlr_cdev_write %d\n", (int) written);
	return written;
}


/*
 * ioctl handler for a character stream.
 *
 * The only supported command is TILEPCI_IOC_CHANNEL_RESET, which resets
 * both channels belonging to the stream (stream->index and
 * stream->index + PCIE_HOST_TO_TILE_CHANNELS) through the hypervisor
 * channel-control device, then marks both directions as needing SoC.
 *
 * Returns 0 on success, -EIO if a reset request is not fully written,
 * or -EINVAL for any other command.
 */
static long tlr_cdev_ioctl(struct file *filp,
			   unsigned int cmd, unsigned long arg)
{
	struct tlr_stream *stream = filp->private_data;
	struct tlr_pcie_dev *tlr = stream->dev;
	const u32 chan_offset[2] = { 0, PCIE_HOST_TO_TILE_CHANNELS };
	u32 channel;
	int nwritten;
	int i;

	if (cmd != TILEPCI_IOC_CHANNEL_RESET)
		return -EINVAL;

	/* Reset the two channels associated with this stream, in order. */
	for (i = 0; i < 2; i++) {
		channel = stream->index + chan_offset[i];
		nwritten = hv_dev_pwrite(tlr->hv_channel_ctl_fd, 0,
					 (HV_VirtAddr)&channel,
					 sizeof(channel),
					 PCIE_CHANNEL_CTL_CHANNEL_RESET_OFF);
		if (nwritten != sizeof(channel)) {
			ERR("Tile channel reset failed, %d\n", channel);
			return -EIO;
		}
	}

	/* The next command posted in each direction must carry SoC. */
	stream->need_write_soc = 1;
	stream->need_read_soc = 1;

	return 0;
}

#ifdef CONFIG_COMPAT
/*
 * compat_ioctl entry point for character streams: forwards to
 * tlr_cdev_ioctl() with the argument truncated to int and then widened
 * again, so that it is sign-extended.
 */
static long tlr_cdev_compat_ioctl(struct file *filp,
				 unsigned int a, unsigned long b)
{
	/* Sign-extend the argument so it can be used as a pointer. */
	return tlr_cdev_ioctl(filp, a, (int)(long)b);
}
#endif

/*
 * poll handler for a character stream.
 *
 * Registers the stream's read and write wait queues with the poll table
 * and reports the stream as readable when completed reads are waiting
 * to be consumed, and writable when fewer than BUFFERS_PER_STREAM
 * writes are outstanding.
 *
 * Returns a POLL* event mask.  The return value is a bit mask, never an
 * errno: if acquiring a semaphore is interrupted by a signal, an empty
 * mask is returned rather than -ERESTARTSYS, which would otherwise be
 * misread by the caller as a mask with nearly every event bit set.
 */
static unsigned int tlr_cdev_poll(struct file *filp, poll_table *table)
{
	struct tlr_stream *stream = filp->private_data;
	unsigned int mask = 0;

	FOP_TRACE("Entered tlr_cdev_poll\n");

	/* Add wait queues to the poll table; we don't actually wait here. */
	poll_wait(filp, &stream->read_queue, table);
	poll_wait(filp, &stream->write_queue, table);

	/*
	 * Grab both the read and write semaphores so that this operation is
	 * ordered with respect to any other processes that may be reading
	 * or writing.  If we are interrupted, report "no events".
	 */
	if (down_interruptible(&stream->read_mutex)) {
		EX_TRACE("Exit tlr_cdev_poll\n");
		return 0;
	}
	if (down_interruptible(&stream->write_mutex)) {
		up(&stream->read_mutex);
		EX_TRACE("Exit tlr_cdev_poll\n");
		return 0;
	}

	if (stream->reads_consumed != stream->reads_completed)
		mask |= (POLLIN | POLLRDNORM); /* readable */
	if ((stream->writes_posted - stream->writes_completed) <
	    BUFFERS_PER_STREAM)
		mask |= (POLLOUT | POLLWRNORM); /* writable */

	up(&stream->write_mutex);
	up(&stream->read_mutex);

	EX_TRACE("Exit tlr_cdev_poll\n");
	return mask;
}

/*
 * Forward declaration: the operations table below refers to
 * tlr_cdev_open(), which in turn installs the table and is therefore
 * defined after it.
 */
static int tlr_cdev_open(struct inode *inode, struct file *filp);

/* File operations for the character stream devices. */
static const struct file_operations tlr_cdev_ops = {
	.owner = THIS_MODULE,
	.open = tlr_cdev_open,
	.read = tlr_cdev_read,
	.write = tlr_cdev_write,
	.unlocked_ioctl = tlr_cdev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = tlr_cdev_compat_ioctl,
#endif
	.poll = tlr_cdev_poll
};

/*
 * Open handler for a character stream device.
 *
 * Fails with -ENXIO if the device is not ready.  Otherwise the stream
 * selected by the inode's minor number becomes the file's private data,
 * the stream file operations are installed, and the stream is
 * initialized if it has no buffer page yet.
 *
 * Returns 0 on success or the error reported by init_stream().
 */
static int tlr_cdev_open(struct inode *inode, struct file *filp)
{
	struct tlr_pcie_dev *tlr =
		container_of(inode->i_cdev, struct tlr_pcie_dev, cdev);
	struct tlr_stream *stream;
	int result;

	/* Nothing to attach to unless the device is ready. */
	if (!tlr_is_ready(tlr))
		return -ENXIO;

	/* The minor number doubles as the stream index. */
	stream = tlr_get_stream(tlr, MINOR(inode->i_rdev));
	filp->private_data = stream;

	/* Route subsequent read, write, etc. to the stream handlers. */
	filp->f_op = &tlr_cdev_ops;

	FOP_TRACE("Enter tlr_cdev_open\n");

	/* A zero buffer_page means the stream was never initialized. */
	if (stream->buffer_page == 0)
		result = init_stream(stream, tlr->link_index);
	else
		result = 0;

	EX_TRACE("Exit tlr_cdev_open\n");
	return result;
}


/**********************************************************************/
/*                        CSR Device Routines                     */
/**********************************************************************/

/*
 * Read handler for the CSR device: copies CSR write notifications from
 * the notification ring to user space.
 *
 * 'count' must be a multiple of sizeof(pcie_csr_write_notify_t).  The
 * call keeps going until the requested number of notifications has been
 * delivered, blocking while the ring is empty unless O_NONBLOCK is set.
 *
 * Returns the number of bytes delivered if any notification was copied.
 * Otherwise returns 0 for a zero-length request, -EINVAL for a
 * misaligned count, -EAGAIN if the ring is empty in non-blocking mode,
 * -ERESTARTSYS if interrupted, or -EFAULT if the copy to user space
 * fails.
 */
static ssize_t tlr_csr_read(struct file *filp, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct tlr_pcie_dev *tlr = filp->private_data;
	u32 consumed_index;
	u32 posted_index;
	u32 notify_count;
	u32 notify_read = 0;
	/*
	 * Signed and as wide as the return type, so that a negative
	 * errno is not zero-extended into a large positive byte count.
	 */
	ssize_t err = 0;
	ssize_t res;

	if (count % sizeof(pcie_csr_write_notify_t))
		return -EINVAL;

	notify_count = count / sizeof(pcie_csr_write_notify_t);

	while (notify_count) {

		if (down_interruptible(&tlr->csr_notify_mutex)) {
			EX_TRACE("Exit csr_notify_mutex -ERESTARTSYS\n");
			err = -ERESTARTSYS;
			goto out;
		}

		/* Wait (without the mutex) until the ring is non-empty. */
		while (tlr->csr_write_queue->writes_posted ==
			tlr->csr_write_queue->writes_consumed) {

			up(&tlr->csr_notify_mutex);

			if (filp->f_flags & O_NONBLOCK) {
				EX_TRACE("Exit tlr_csr_read -EAGAIN\n");
				err = -EAGAIN;
				goto out;
			}

			/* Wait for the CSR write notification. */
			FOP_TRACE("Waiting on csr_wait_queue\n");
			if (wait_event_interruptible(tlr->csr_wait_queue,
				(tlr->csr_write_queue->writes_posted !=
				tlr->csr_write_queue->writes_consumed))) {

				EX_TRACE("Exit csr_wait_queue -ERESTARTSYS\n");
				err = -ERESTARTSYS;
				goto out;
			}
			FOP_TRACE("Woke from csr_wait_queue\n");

			if (down_interruptible(&tlr->csr_notify_mutex)) {
				EX_TRACE("Exit csr_notify_mutex"
					 " -ERESTARTSYS\n");
				err = -ERESTARTSYS;
				goto out;
			}
		}

		consumed_index = tlr->csr_write_queue->writes_consumed %
			PCIE_CSR_WRITE_QUEUE_ENTRIES;
		posted_index = tlr->csr_write_queue->writes_posted %
			PCIE_CSR_WRITE_QUEUE_ENTRIES;

		if (posted_index > consumed_index) {
			/* Pending entries form one contiguous run. */
			int copy_cnt = min(notify_count,
					   posted_index - consumed_index);
			res = copy_to_user((void __user *)buf,
				   (tlr->csr_write_queue->notify_array +
				    consumed_index),
				   copy_cnt * sizeof(pcie_csr_write_notify_t));
			if (res) {
				up(&tlr->csr_notify_mutex);
				EX_TRACE("Exit GET_CSR_NOTIFY -EFAULT\n");
				err = -EFAULT;
				goto out;
			}
			tlr->csr_write_queue->writes_consumed += copy_cnt;
			notify_read += copy_cnt;
			notify_count -= copy_cnt;
			buf += copy_cnt * sizeof(pcie_csr_write_notify_t);
		} else {
			/*
			 * The write queue wraps around: copy the tail of
			 * the array first, then the entries at its start.
			 */
			int total;
			int copy_cnt = min(notify_count,
					PCIE_CSR_WRITE_QUEUE_ENTRIES -
					consumed_index);
			res = copy_to_user((void __user *)buf,
				   (tlr->csr_write_queue->notify_array +
				    consumed_index),
				   copy_cnt * sizeof(pcie_csr_write_notify_t));
			if (res) {
				up(&tlr->csr_notify_mutex);
				EX_TRACE("Exit GET_CSR_NOTIFY -EFAULT\n");
				err = -EFAULT;
				goto out;
			}
			total = copy_cnt;
			notify_count -= copy_cnt;
			buf += copy_cnt * sizeof(pcie_csr_write_notify_t);

			copy_cnt = min(notify_count, posted_index);
			res = copy_to_user((void __user *)buf,
				   tlr->csr_write_queue->notify_array,
				   copy_cnt * sizeof(pcie_csr_write_notify_t));
			if (res) {
				up(&tlr->csr_notify_mutex);
				EX_TRACE("Exit GET_CSR_NOTIFY -EFAULT\n");
				err = -EFAULT;
				goto out;
			}
			total += copy_cnt;
			notify_count -= copy_cnt;
			buf += copy_cnt * sizeof(pcie_csr_write_notify_t);
			tlr->csr_write_queue->writes_consumed += total;
			notify_read += total;
		}
		up(&tlr->csr_notify_mutex);
	}

out:
	/* Data already delivered takes precedence over a later error. */
	if (notify_read)
		return (ssize_t)(notify_read *
				 sizeof(pcie_csr_write_notify_t));

	return err;
}

/*
 * poll handler for the CSR device.
 *
 * Registers the CSR wait queue with the poll table and reports the
 * device as readable when CSR write notifications are pending.
 *
 * Returns a POLL* event mask.  The return value is a bit mask, never an
 * errno: if acquiring the semaphore is interrupted by a signal, an
 * empty mask is returned rather than -ERESTARTSYS, which would
 * otherwise be misread by the caller as a mask with nearly every event
 * bit set.
 */
static unsigned int tlr_csr_poll(struct file *filp, poll_table *table)
{
	struct tlr_pcie_dev *tlr = filp->private_data;
	unsigned int mask = 0;

	FOP_TRACE("Entered tlr_csr_poll\n");

	/* Add wait queue to the poll table; we don't actually wait here. */
	poll_wait(filp, &tlr->csr_wait_queue, table);

	if (down_interruptible(&tlr->csr_notify_mutex)) {
		EX_TRACE("Exit tlr_csr_poll\n");
		return 0;
	}

	if (tlr->csr_write_queue->writes_posted !=
			tlr->csr_write_queue->writes_consumed)
		mask |= (POLLIN | POLLRDNORM); /* readable */

	up(&tlr->csr_notify_mutex);

	EX_TRACE("Exit tlr_csr_poll\n");

	return mask;
}

/*
 * ioctl handler for the CSR device.
 *
 * The only supported command is TILEPCI_IOC_TILE_TO_HOST_INTR: 'arg'
 * points to a u32 of interrupt bits in user space, which is written to
 * the hypervisor channel-control device at
 * PCIE_CTL_ASSERT_HOST_INTR_OFF to generate a host interrupt.
 *
 * Returns 0 on success, -EFAULT if the bits cannot be read from user
 * space, -EIO if the hypervisor write is short, or -EINVAL for any
 * other command.
 */
static long tlr_csr_ioctl(struct file *filp,
			  unsigned int cmd, unsigned long arg)
{
	struct tlr_pcie_dev *tlr = filp->private_data;
	u32 intr_bits;
	int nwritten;

	if (cmd != TILEPCI_IOC_TILE_TO_HOST_INTR)
		return -EINVAL;

	/* Fetch the interrupt bits supplied by the caller. */
	if (copy_from_user(&intr_bits, (void __user *)arg,
			   sizeof(u32))) {
		EX_TRACE("Host intr copy failed\n");
		return -EFAULT;
	}

	/* Ask the hypervisor to assert the host interrupt. */
	nwritten = hv_dev_pwrite(tlr->hv_channel_ctl_fd, 0,
				 (HV_VirtAddr)&intr_bits, sizeof(intr_bits),
				 PCIE_CTL_ASSERT_HOST_INTR_OFF);
	if (nwritten != sizeof(intr_bits)) {
		ERR("Host intr pwrite failed\n");
		return -EIO;
	}

	return 0;
}

#ifdef CONFIG_COMPAT
/*
 * compat_ioctl entry point for the CSR device: forwards to
 * tlr_csr_ioctl() with the argument truncated to int and then widened
 * again, so that it is sign-extended.
 */
static long tlr_csr_compat_ioctl(struct file *filp,
				 unsigned int a, unsigned long b)
{
	/* Sign-extend the argument so it can be used as a pointer. */
	return tlr_csr_ioctl(filp, a, (int)(long)b);
}
#endif

/*
 * mmap handler for the CSR device: maps (part of) the CSR memory region
 * into the caller's address space.
 *
 * The requested window, given by the VMA's page offset and length, must
 * lie entirely within TILE_CSR_MEMORY_PAGE_SIZE bytes, and the mapping
 * must be shared.
 *
 * Returns 0 on success, -EINVAL if the window is out of range or the
 * mapping is not shared, or -EAGAIN if the pages cannot be remapped.
 */
static int tlr_csr_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct tlr_pcie_dev *tlr = file->private_data;
	size_t size = vma->vm_end - vma->vm_start;
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long pfn;

	/*
	 * Bounds check written so that it cannot be defeated by
	 * arithmetic overflow: reject a page offset that does not
	 * survive the shift to bytes, and compare without forming
	 * (size + offset), which could wrap around.
	 */
	if ((offset >> PAGE_SHIFT) != vma->vm_pgoff ||
	    size > TILE_CSR_MEMORY_PAGE_SIZE ||
	    offset > TILE_CSR_MEMORY_PAGE_SIZE - size) {
		WARNING("CSR mmap (size+offset) exceeding 0x%x bytes.\n",
			TILE_CSR_MEMORY_PAGE_SIZE);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_SHARED)) {
		WARNING("CSR mmap flags must include VM_SHARED\n");
		return -EINVAL;
	}

	vma->vm_flags |= VM_LOCKED | VM_RESERVED;

	/* First page frame of the requested window in CSR memory. */
	pfn = kaddr_to_pfn(tlr->csr_memory) + vma->vm_pgoff;
	if (remap_pfn_range(vma,
			    vma->vm_start,
			    pfn,
			    size,
			    PAGE_SHARED))
		return -EAGAIN;

	return 0;
}

/*
 * File operations for the CSR device.  There is no .open entry here;
 * tlr_csr_open() installs this table on the file itself.
 */
static const struct file_operations tlr_csr_ops = {
	.owner = THIS_MODULE,
	.read = tlr_csr_read,
	.poll = tlr_csr_poll,
	.unlocked_ioctl = tlr_csr_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = tlr_csr_compat_ioctl,
#endif
	.mmap = tlr_csr_mmap,
};

/*
 * Open handler for the CSR device.
 *
 * Attaches the device structure and the CSR file operations to the
 * file, then reports -ENXIO if the device is not ready, 0 otherwise.
 */
static int tlr_csr_open(struct inode *inode, struct file *filp)
{
	struct tlr_pcie_dev *tlr;

	tlr = container_of(inode->i_cdev, struct tlr_pcie_dev, cdev);

	/* The file is set up before the readiness check, as before. */
	filp->private_data = tlr;
	filp->f_op = &tlr_csr_ops;

	return tlr_is_ready(tlr) ? 0 : -ENXIO;
}


/***********************************************************************
 *                              Debug Support                          *
 ***********************************************************************/
/*
 * seq_file start callback for the character stream debug file.
 *
 * Position N selects stream N; positions at or beyond NUM_CHAR_STREAMS
 * end the sequence.  At position 0 the per-CPU last completion counts
 * for the device's link are printed first, one line per online CPU.
 */
static void *tlr_seq_start(struct seq_file *s, loff_t *pos)
{
	struct tlr_pcie_dev *tlr = (struct tlr_pcie_dev *)s->private;
	int cpu;

	if (*pos >= NUM_CHAR_STREAMS)
		return NULL;

	/* Emit the per-CPU header only at the very start of the file. */
	if (*pos == 0) {
		for_each_online_cpu(cpu) {
			seq_printf(s, "cpu_%d_last_cpl_count = %u\n", cpu,
				   per_cpu(last_complete,
					   cpu)[tlr->link_index]);
		}
	}

	return tlr_get_stream(tlr, *pos);
}

/*
 * seq_file next callback: advance the position and return the stream
 * at the new position, or NULL once all NUM_CHAR_STREAMS streams have
 * been visited.
 */
static void *tlr_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct tlr_pcie_dev *tlr = (struct tlr_pcie_dev *)s->private;

	*pos += 1;
	if (*pos < NUM_CHAR_STREAMS)
		return tlr_get_stream(tlr, *pos);

	return NULL;
}

/*
 * seq_file stop callback.  Intentionally empty: tlr_seq_start() and
 * tlr_seq_next() take no locks and allocate nothing that would need
 * releasing here.
 */
static void tlr_seq_stop(struct seq_file *s, void *v)
{
}

/*
 * Print the debug state of one public character stream: whether any
 * writers or readers are waiting, the write posted/completed counters,
 * the read completed/consumed counters, and the number of bytes already
 * consumed from a partially read buffer.
 *
 * Always returns 0.
 */
static int tlr_seq_show_public(struct seq_file *s, struct tlr_stream *stream)
{
	/* Writers sleep on write_queue, so that is the list to inspect. */
	seq_printf(s, "stream%d_writers_waiting: %d\n", stream->index,
		   !list_empty(&stream->write_queue.task_list));
	seq_printf(s, "stream%d_writes_posted: %d\n", stream->index,
		   stream->writes_posted);
	seq_printf(s, "stream%d_writes_completed: %d\n", stream->index,
		   stream->writes_completed);

	seq_printf(s, "stream%d_readers_waiting: %d\n", stream->index,
		   !list_empty(&stream->read_queue.task_list));
	seq_printf(s, "stream%d_reads_completed: %d\n", stream->index,
		   stream->reads_completed);
	seq_printf(s, "stream%d_reads_consumed: %d\n", stream->index,
		   stream->reads_consumed);
	seq_printf(s, "stream%d_partial_read_bytes: %d\n", stream->index,
		   (int)stream->partial_read_bytes);

	return 0;
}

/*
 * seq_file show callback: 'v' is the stream returned by the start/next
 * callbacks; print its state.
 */
static int tlr_seq_show(struct seq_file *s, void *v)
{
	return tlr_seq_show_public(s, (struct tlr_stream *)v);
}

/* Iterator callbacks for the character stream debug file. */
static const struct seq_operations tlr_cdev_seq_ops = {
	.start = tlr_seq_start,
	.next = tlr_seq_next,
	.stop = tlr_seq_stop,
	.show = tlr_seq_show,
};

/*
 * File operations for the character stream debug file.  There is no
 * .open entry; tlr_cdev_debug_open() installs this table and calls
 * seq_open() itself.
 */
static const struct file_operations tlr_cdev_debug_ops = {
	.owner = THIS_MODULE,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

/*
 * Open the character stream debug file: install the debug file
 * operations, start a seq_file session and, on success, record the
 * device as the session's private data.
 *
 * Returns the result of seq_open().
 */
static int tlr_cdev_debug_open(struct tlr_pcie_dev *tlr, struct file *file)
{
	struct seq_file *seq;
	int rc;

	file->f_op = &tlr_cdev_debug_ops;

	rc = seq_open(file, &tlr_cdev_seq_ops);
	if (rc != 0)
		return rc;

	/* seq_open() stored its seq_file in file->private_data. */
	seq = (struct seq_file *)file->private_data;
	seq->private = tlr;

	return rc;
}


/***********************************************************************
 *                              Info File                              *
 ***********************************************************************/

/*
 * Show callback for the info file: prints the host link index, maximum
 * payload and read sizes, link width, the BAR1 size and address when a
 * BAR1 size is set, and the prebooter version (or "none").
 *
 * Note that a prebooter info word of 0x839af4c9 is rewritten in place
 * in the device structure as version 0.1.
 *
 * Always returns 0.
 */
static int tlr_info_seq_show(struct seq_file *s, void *v)
{
	struct tlr_pcie_dev *tlr = s->private;

	seq_printf(s, "Host_Link_Index %d\n", tlr->host_link_index);
	seq_printf(s, "Max_Payload_Size %zd\n", tlr->max_payload_size);
	seq_printf(s, "Max_Read_Size %zd\n", tlr->max_read_size);
	seq_printf(s, "Link_Width %d\n", tlr->link_width);

	if (tlr->link_bar1_size > 0) {
		seq_printf(s, "LINK_BAR1_SIZE %llu\n", tlr->link_bar1_size);
		seq_printf(s, "LINK_BAR1_ADDRESS %#llx\n",
			   tlr->link_bar1_address);
	}

	/* No valid prebooter info: say so and finish. */
	if (!tlr->prebooter_info.bits.valid) {
		seq_printf(s, "PREBOOTER_VERSION none\n");
		return 0;
	}

	/* Just call pre-releases "0.1". */
	if (tlr->prebooter_info.word == 0x839af4c9) {
		tlr->prebooter_info.bits.major = 0;
		tlr->prebooter_info.bits.minor = 1;
	}

	seq_printf(s, "PREBOOTER_VERSION %d.%d\n",
		   tlr->prebooter_info.bits.major,
		   tlr->prebooter_info.bits.minor);

	return 0;
}

/*
 * File operations for the info file.  There is no .open entry;
 * tlr_info_open() installs this table and calls single_open() itself.
 */
static const struct file_operations tlr_info_ops = {
	.owner = THIS_MODULE,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/*
 * Open the info file: install the info file operations and start a
 * single-record seq_file session whose private data is the device.
 *
 * Returns the result of single_open().
 */
static int tlr_info_open(struct tlr_pcie_dev *tlr, struct file *file)
{
	int rc;

	file->f_op = &tlr_info_ops;
	rc = single_open(file, tlr_info_seq_show, tlr);

	return rc;
}


/***********************************************************************
 *                   Module Loading and Device Probe                   *
 ***********************************************************************/


static int tlr_generic_open(struct inode *inode, struct file *filp)
{
	unsigned int	 minor = MINOR(inode->i_rdev);
	int		 result;
	int channel;
	struct tlr_pcie_dev *tlr =
		container_of(inode->i_cdev, struct tlr_pcie_dev, cdev);

	FOP_TRACE("Enter minor %d\n", minor);


	if ((minor >= FIRST_PUBLIC_MINOR) &&
	    (minor <= LAST_PUBLIC_MINOR)) {
		result = tlr_cdev_open(inode, filp);
	} else if ((minor >= FIRST_ZC_H2T_MINOR) &&
		 (minor <= LAST_ZC_H2T_MINOR)) {
		channel = (minor - FIRST_ZC_H2T_MINOR + FIRST_ZC_H2T_CHAN);
		result = tlr_zc_open(&tlr->zc_state, filp, channel);
	} else if ((minor >= FIRST_ZC_T2H_MINOR) &&
		 (minor <= LAST_ZC_T2H_MINOR)) {
		channel = (minor - FIRST_ZC_T2H_MINOR + FIRST_ZC_T2H_CHAN);
		result = tlr_zc_open(&tlr->zc_state, filp, channel);
	} else if ((minor >= TILEPCI_FIRST_C2C_SEND_MINOR) &&
		 (minor <= LAST_C2C_SEND_MINOR)) {
		channel = ((minor - TILEPCI_FIRST_C2C_SEND_MINOR) +
			   PCIE_FIRST_C2C_SEND_CHANNEL);
		result = tlr_zc_open(&tlr->zc_state, filp, channel);
	} else if ((minor >= TILEPCI_FIRST_C2C_RECV_MINOR) &&
		 (minor <= LAST_C2C_RECV_MINOR)) {
		channel = ((minor - TILEPCI_FIRST_C2C_RECV_MINOR) +
			   PCIE_FIRST_C2C_RECV_CHANNEL);
		result = tlr_zc_open(&tlr->zc_state, filp, channel);
	} else if (minor == TILEPCI_CSR_MINOR) {
		result = tlr_csr_open(inode, filp);
	} else if (minor == TILEPCI_CHAR_STREAM_DEBUG_MINOR) {
		result = tlr_cdev_debug_open(tlr, filp);
	} else if (minor == TILEPCI_ZERO_COPY_DEBUG_MINOR) {
		result = tlr_zc_debug_open(&tlr->zc_state, filp);
	} else if (minor == TILEPCI_INFO_MINOR) {
		result = tlr_info_open(tlr, filp);
	} else if (minor == TILEPCI_HV_DIRECT_ZC_MINOR) {
		int cpu = get_cpu();
		result = tlr_hv_direct_zc_open(
			inode, filp, cpu,
			&(__get_cpu_var(init_read)[tlr->link_index]));
		put_cpu();
	} else if (minor == TILEPCI_BARMEM_MINOR) {
		result = tlr_barmem_open(tlr, filp);
	} else {
		result = -ENODEV;
	}

	EX_TRACE("Exit result %d\n", result);

	return result;
}

/*
 * File operations registered with the character device.  Only open()
 * is provided here; tlr_generic_open() dispatches on the minor number.
 */
static const struct file_operations tlr_generic_ops = {
	.open = tlr_generic_open,
	.owner = THIS_MODULE,
};

/*
 * Reserve a block of NUM_MINOR_DEVICES character device numbers and
 * register the device's cdev on it with tlr_generic_ops.
 *
 * On success the first device number is stored in tlr->first_dev and
 * 0 is returned.  On failure the error from alloc_chrdev_region() or
 * cdev_add() is returned; in the latter case the reserved region is
 * released again.
 */
static int tlr_cdev_setup(struct tlr_pcie_dev *tlr)
{
	dev_t base;
	int rc;

	/* Allocate some major/minor numbers. */
	rc = alloc_chrdev_region(&base, 0, NUM_MINOR_DEVICES, driver_name);
	if (rc)
		return rc;

	/* Register the device. */
	cdev_init(&tlr->cdev, &tlr_generic_ops);
	tlr->cdev.owner = THIS_MODULE;

	rc = cdev_add(&tlr->cdev, base, NUM_MINOR_DEVICES);
	if (rc) {
		unregister_chrdev_region(base, NUM_MINOR_DEVICES);
		return rc;
	}

	tlr->first_dev = base;
	return 0;
}


/*
 * A function to be invoked on every cpu in order to perform local
 * initialization.  This is invoked after global initialization is
 * complete.
 *
 * @dev is the struct tlr_pcie_dev being initialized.
 *
 * The per-cpu init data is read from the hypervisor driver, the per-cpu
 * completion index is seeded from the completion queue, and the irq
 * number derived from the queue's irq mask is recorded in tlr->irq.
 *
 * Nothing is returned.  Any failure on this cpu (the read failing, an
 * empty irq mask, or an irq that differs from the one seen on another
 * cpu) leaves tlr->irq set to TLR_IRQ_MISMATCH, which
 * global_and_local_init() checks for after all cpus have run.
 */
static void local_init_callback(void *dev)
{
	int cpu = smp_processor_id();
	int irq, intr_id_bit_mask;
	struct tlr_pcie_dev *tlr = dev;
	struct pcie_tile_local_init_read *pdata =
		&(__get_cpu_var(init_read)[tlr->link_index]);

	/* Read from the driver to get per cpu completion Q */
	if (hv_dev_pread(tlr->hv_channel_ctl_fd, 0,
			 (HV_VirtAddr) pdata, sizeof(*pdata),
			 PCIE_CHANNEL_CTL_LOCAL_INIT_OFF) != sizeof(*pdata)) {
		/*
		 * pdata was not filled in, so its completion_queue
		 * pointer must not be dereferenced.
		 */
		ERR("Local PCIE init failed on cpu %d\n", cpu);
		goto failed;
	}

	TRACE("Local init of CPU %d: queue_va = %p; "
	      "id = %d, irq_mask = %#x\n",
	      cpu, pdata->completion_queue,
	      pdata->completion_queue_id,
	      pdata->completion_queue->irq_mask);

	/* Set starting per cpu complete index. */
	__get_cpu_var(last_complete)[tlr->link_index] =
		pdata->completion_queue->completion_posted_count;

	/*
	 * Get the index into the interrupt handler table from the
	 * interrupt bit mask which should have only one bit set.
	 * __ffs() is undefined for 0, so reject an empty mask.
	 */
	intr_id_bit_mask = pdata->completion_queue->irq_mask;
	if (intr_id_bit_mask == 0) {
		ERR("Empty completion irq mask on cpu %d\n", cpu);
		goto failed;
	}
	irq = __ffs(intr_id_bit_mask);

	/* Make sure irq number is the same on all cores. */
	spin_lock(&tlr->irq_lock);
	if (tlr->irq == TLR_IRQ_NOT_FOUND)
		tlr->irq = irq;
	else if (tlr->irq != irq &&
		 tlr->irq != TLR_IRQ_MISMATCH) {
		ERR("Mismatched irqs: %d vs. %d\n", tlr->irq, irq);
		tlr->irq = TLR_IRQ_MISMATCH;
	}
	spin_unlock(&tlr->irq_lock);
	return;

failed:
	/* Make the failure sticky so that the overall init is aborted. */
	spin_lock(&tlr->irq_lock);
	tlr->irq = TLR_IRQ_MISMATCH;
	spin_unlock(&tlr->irq_lock);
}

/*
 * Performs both global and local initialization for the PCIE
 * subsystem.  This should only be called when tlr->hv_channel_ctl_fd
 * is valid.  Returns 1 if the device is ready, 0 otherwise.
 */
static int global_and_local_init(struct tlr_pcie_dev *tlr)
{
	int res, cpu;
	unsigned int width, x, y;
	HV_LOTAR shm_state_lotar, csr_memory_lotar, csr_write_queue_lotar;

	/* Figure out where the page is homed. */
	cpu = page_home(virt_to_page(tlr->shm_state));
	if (cpu < 0) {
		ERR("shm_state page is globally cached.\n");
		return 0;
	}
	width = smp_width;
	x = cpu % width;
	y = cpu / width;
	shm_state_lotar = HV_XY_TO_LOTAR(x, y);

	/* Initialize the shm_state object. */
	tlr->shm_state->buffer_cmd_posted_count = 0xffff0000;

	cpu = page_home(virt_to_page(tlr->csr_memory));
	if (cpu < 0) {
		ERR("csr_memory page is globally cached.\n");
		return 0;
	}
	x = cpu % width;
	y = cpu / width;
	csr_memory_lotar = HV_XY_TO_LOTAR(x, y);

	cpu = page_home(virt_to_page(tlr->csr_write_queue));
	if (cpu < 0) {
		ERR("csr_write_queue page is globally cached.\n");
		return 0;
	}
	x = cpu % width;
	y = cpu / width;
	csr_write_queue_lotar = HV_XY_TO_LOTAR(x, y);

	/* Perform global initialization. */
	{
		struct pcie_tile_global_init_write init_write = {
			.shm_state_cpa = __pa(tlr->shm_state),
			.shm_state_home = shm_state_lotar,
			.csr_memory_cpa = __pa(tlr->csr_memory),
			.csr_memory_page_size = TILE_CSR_MEMORY_PAGE_SIZE,
			.csr_memory_home = csr_memory_lotar,
			.csr_write_queue_cpa = __pa(tlr->csr_write_queue),
			.csr_write_queue_home = csr_write_queue_lotar,
		};
		res = hv_dev_pwrite(tlr->hv_channel_ctl_fd, 0,
				    (HV_VirtAddr)&init_write,
				    sizeof(init_write),
				    PCIE_CHANNEL_CTL_GLOBAL_INIT_OFF);
		if (res != sizeof(init_write))
			return 0;
	}

	/* Perform local initialization via an SMP IPI call. */
	tlr->irq = TLR_IRQ_NOT_FOUND;
	on_each_cpu(local_init_callback, tlr, 1);

	if (tlr->irq == TLR_IRQ_MISMATCH)
		return 0;

	tile_irq_activate(tlr->irq, TILE_IRQ_PERCPU);
	if (request_irq(tlr->irq, tlr_hv_intr, 0, driver_name, (void *)tlr)) {
		ERR("Failed to register handler for IRQ %d\n", tlr->irq);
		return 0;
	}

	return 1;
}


/*
 * Open the hypervisor device file "pcie/<link_index>/<filename>".
 *
 * Returns whatever hv_dev_open() returns for that path.  The path is
 * formatted with a bounded write so that an over-long @filename can
 * never overrun the on-stack buffer; such a name is truncated.
 */
static int tlr_open_hv_dev(int link_index, char *filename)
{
	char file[128];

	snprintf(file, sizeof(file), "pcie/%d/%s", link_index, filename);
	return hv_dev_open((HV_VirtAddr)file, 0);
}


/*
 * Report whether the hypervisor side of this link is initialized,
 * performing the initialization first if it has not been done yet.
 *
 * Returns 1 when the link is ready and 0 otherwise (device not ready,
 * a hypervisor device file could not be opened, or
 * global_and_local_init() failed).  After a successful first-time init
 * the BIOS profile, prebooter info and host link index are read and
 * cached in @tlr; a short read of any of those is silently ignored.
 */
int tlr_is_ready(struct tlr_pcie_dev *tlr)
{
	union tlr_prebooter_info prebooter_info;
	struct tlr_bios_info bios_info;
	int bios_fd, nread, host_link;
	int result = 0;

	/*
	 * Use a mutex to make sure we don't double-init; it's okay if
	 * this is a bit heavyweight since we only do this during
	 * open().  We're uninterruptible because the init process
	 * should always finish in a timely fashion.
	 */
	down(&tlr->hv_ready_mutex);

	if (tlr->is_hv_ready) {
		result = 1;
		goto exit;
	}

	bios_fd = tlr_open_hv_dev(tlr->link_index, "bios");
	if (bios_fd == HV_ENOTREADY) {
		TRACE("PCIE endpoint device is not ready.\n");
		goto exit;
	}
	if (bios_fd < 0) {
		ERR("Unexpected failure to open bios file: %d\n", bios_fd);
		goto exit;
	}

	tlr->hv_channel_ctl_fd =
		tlr_open_hv_dev(tlr->link_index, "channel_ctl");
	if (tlr->hv_channel_ctl_fd < 0) {
		ERR("Unexpected failure to open channel_ctl file: %d\n",
		    tlr->hv_channel_ctl_fd);
		goto exit;
	}

	TRACE("Opened HV device handles on PCIE link %d\n", tlr->link_index);

	tlr->is_hv_ready = global_and_local_init(tlr);
	if (!tlr->is_hv_ready)
		goto exit;
	result = 1;

	/* We're ready; go ahead and grab the BIOS profile. */
	nread = hv_dev_pread(bios_fd, 0, (HV_VirtAddr)&bios_info,
			     sizeof(bios_info), PCIE_BIOS_OFF);
	if (nread == sizeof(bios_info)) {
		INFO("Link %d max_payload = %zd, "
		     "max_read = %zd, lanes = %d\n",
		     tlr->link_index,
		     bios_info.max_payload_size,
		     bios_info.max_read_size,
		     bios_info.link_width);

		tlr->max_payload_size = bios_info.max_payload_size;
		tlr->max_read_size = bios_info.max_read_size;
		tlr->link_width = bios_info.link_width;
		tlr->link_bar1_size = bios_info.bar1_size;
		tlr->link_bar1_address = bios_info.bar1_address;
	}

	/* Cache the prebooter information, if it could be read. */
	nread = hv_dev_pread(bios_fd, 0, (HV_VirtAddr)&prebooter_info,
			     sizeof(prebooter_info), PCIE_PREBOOTER_OFF);
	if (nread == sizeof(prebooter_info))
		tlr->prebooter_info = prebooter_info;

	/* Obtain the global host link index for this link. */
	nread = hv_dev_pread(tlr->hv_channel_ctl_fd, 0,
			     (HV_VirtAddr)&host_link, sizeof(host_link),
			     PCIE_CHANNEL_CTL_HOST_LINK_INDEX_OFF);
	/* Can it fail? */
	if (nread == sizeof(host_link))
		tlr->host_link_index = host_link;

exit:
	up(&tlr->hv_ready_mutex);
	return result;
}


/*
 * read_proc handler that prints a mapping of link numbers to major
 * numbers, one "<link> <major>" line per link that has a device.
 *
 * The text is built in a local buffer, at most @count bytes of it are
 * copied to @buf, *@eof is set, and the number of bytes copied is
 * returned.  @start, @offset and @data are not used.
 */
static int tlr_read_link_to_major(char *buf, char **start, off_t offset,
				  int count, int *eof, void *data)
{
	char scratch[MAX_PCIE_LINKS_PER_CHIP * 12]; /* "XXX XXX\n" each */
	int used = 0;
	int link;

	for (link = 0; link < MAX_PCIE_LINKS_PER_CHIP; link++) {
		struct tlr_pcie_dev *tlr = the_pcie_devs[link];

		if (!tlr)
			continue;

		used += sprintf(scratch + used, "%d %d\n", link,
				MAJOR(tlr->first_dev));
	}

	/* Never hand back more than the caller asked for. */
	if (used > count)
		used = count;

	memcpy(buf, scratch, used);
	*eof = 1;
	return used;
}



/*
 * Attempt to create a device for the specified PCIE link.  We will
 * create the device as long as an HV driver is present.  If the
 * device is present but not ready yet, the tlr_is_ready() function
 * will handle the rest of init.
 *
 * Returns 0 on success, -ENODEV if the link's "bios" hypervisor file
 * cannot be opened for a reason other than HV_ENOTREADY, -ENOMEM if an
 * allocation fails, or the error from tlr_zc_init()/tlr_cdev_setup().
 * On failure everything allocated here is released again.
 */
static int __init tlr_pcie_dev_probe(int link_index)
{
	struct tlr_pcie_dev *tlr;
	struct tlr_stream *stream;
	int fd;
	int err;
	int i;
	unsigned long order = get_order(TILE_CSR_MEMORY_PAGE_SIZE);
	struct page *page;
	int here = smp_processor_id();

	fd = tlr_open_hv_dev(link_index, "bios");

	/*
	 * If the driver exists but isn't ready, or we successfully
	 * opened, go ahead and create the dev structure.
	 */
	if ((fd < 0) &&
	    (fd != HV_ENOTREADY)) {
		TRACE("Link %d is not present.\n", link_index);
		return -ENODEV;
	}
	TRACE("Found link %d\n", link_index);

	/* Get some zeroed memory for this device's driver state. */
	tlr = kzalloc(sizeof(*tlr), GFP_KERNEL);
	if (tlr == NULL)
		return -ENOMEM;

	init_MUTEX(&tlr->hv_ready_mutex);
	spin_lock_init(&tlr->cmd_queue_lock);
	tlr->link_index = link_index;
	spin_lock_init(&tlr->irq_lock);
	tlr->irq = TLR_IRQ_NOT_FOUND;
	spin_lock_init(&tlr->open_cpus_lock);
	init_MUTEX(&tlr->bar1_pages_mutex);

	/*
	 * Allocate pages for state.  Note that we force them to be
	 * cached on the local cpu; this is the normal default, but on
	 * tilepro when the hash_default boot option is used, the memory
	 * will otherwise be in hash-for-home mode, and the hypervisor
	 * driver requires the pages to be cached on a single tile.
	 */

	/* Allocate a page for the dedicated tile shared memory state. */
	page = homecache_alloc_pages(GFP_KERNEL | __GFP_ZERO, 0, here);
	if (!page) {
		err = -ENOMEM;
		goto shm_state_failed;
	}
	tlr->shm_state = page_address(page);

	/* Allocate a page for CSR backing memory. */
	page = homecache_alloc_pages(GFP_KERNEL | __GFP_ZERO, order, here);
	if (!page) {
		err = -ENOMEM;
		goto csr_memory_failed;
	}
	tlr->csr_memory = page_address(page);

	/* Allocate a page for the CSR write queue. */
	page = homecache_alloc_pages(GFP_KERNEL | __GFP_ZERO, 0, here);
	if (!page) {
		err = -ENOMEM;
		goto csr_write_queue_failed;
	}
	tlr->csr_write_queue = page_address(page);
	init_waitqueue_head(&tlr->csr_wait_queue);
	init_MUTEX(&tlr->csr_notify_mutex);

	/* Allocate and initialize the character streams. */
	for (i = 0; i < NUM_CHAR_STREAMS; i++) {
		stream = kzalloc(sizeof(*stream), GFP_KERNEL);
		if (stream == NULL) {
			err = -ENOMEM;
			goto streams_alloc_failed;
		}
		tlr_set_stream(tlr, i, stream);

		/* Char stream init. */
		stream->index = i;
		stream->dev = tlr;

		init_MUTEX(&stream->write_mutex);
		init_waitqueue_head(&stream->write_queue);

		init_MUTEX(&stream->read_mutex);
		init_waitqueue_head(&stream->read_queue);
	}

	/* Allocate and initialize the zero-copy command queues. */
	err = tlr_zc_init(tlr);
	if (err != 0)
		goto zc_init_failed;

	/* Create our character device (which has NUM_MINOR_DEVICES nodes). */
	err = tlr_cdev_setup(tlr);
	if (err != 0)
		goto cdev_failed;

	/*
	 * We're good to go at this point.  Add this link to the
	 * global list, and go ahead and poke to see if we can do all
	 * the channel initialization.
	 */
	the_pcie_devs[link_index] = tlr;
	(void)tlr_is_ready(tlr);

	return 0;

 cdev_failed:
	tlr_zc_free(&tlr->zc_state);
 zc_init_failed:
 streams_alloc_failed:
	/* Streams that were never allocated are skipped by the NULL test. */
	for (i = 0; i < NUM_CHAR_STREAMS; i++) {
		stream = tlr_get_stream(tlr, i);
		if (stream != NULL) {
			kfree(stream);
			tlr_set_stream(tlr, i, NULL);
		}
	}
	homecache_free_page((unsigned long)tlr->csr_write_queue);
 csr_write_queue_failed:
	homecache_free_pages((unsigned long)tlr->csr_memory, order);
 csr_memory_failed:
	homecache_free_page((unsigned long)tlr->shm_state);
 shm_state_failed:
	/* No need to hv_close() hypervisor devices. */
	kfree(tlr);
	return err;
}


/*
 * Module initialization: probe every possible PCIE link and publish
 * the link-to-major mapping under /proc.  The result of each probe is
 * intentionally not checked, so links without a device are simply
 * skipped.  Always returns 0.
 */
static int __init tlr_init(void)
{
	int link = 0;

	while (link < MAX_PCIE_LINKS_PER_CHIP) {
		tlr_pcie_dev_probe(link);
		link++;
	}

	create_proc_read_entry("driver/tilepci_link_to_major", 0, NULL,
			       tlr_read_link_to_major, NULL);

	return 0;
}


/*
 * Module exit hook.  Nothing is torn down here; it only logs that the
 * driver is being removed.
 */
static void __exit tlr_exit(void)
{
	/*
	 * We're statically compiled into the kernel, so this should never
	 * be called.
	 */
	ERR("Removing %s\n", driver_name);
}

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tilera");

/* Register tlr_init()/tlr_exit() as the module's init and exit hooks. */
module_init(tlr_init);
module_exit(tlr_exit);
