| /* |
| * perfmon_file.c: perfmon2 file input/output functions |
| * |
| * This file implements the perfmon2 interface which |
| * provides access to the hardware performance counters |
| * of the host processor. |
| * |
| * The initial version of perfmon.c was written by |
| * Ganesh Venkitachalam, IBM Corp. |
| * |
| * Then it was modified for perfmon-1.x by Stephane Eranian and |
| * David Mosberger, Hewlett Packard Co. |
| * |
| * Version Perfmon-2.x is a complete rewrite of perfmon-1.x |
| * by Stephane Eranian, Hewlett Packard Co. |
| * |
| * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. |
| * Contributed by Stephane Eranian <eranian@hpl.hp.com> |
| * David Mosberger-Tang <davidm@hpl.hp.com> |
| * |
| * More information about perfmon available at: |
| * http://perfmon2.sf.net |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of version 2 of the GNU General Public |
| * License as published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
| * 02111-1307 USA |
| */ |
| #include <linux/kernel.h> |
| #include <linux/module.h> |
| #include <linux/file.h> |
| #include <linux/poll.h> |
| #include <linux/vfs.h> |
| #include <linux/pagemap.h> |
| #include <linux/mount.h> |
| #include <linux/anon_inodes.h> |
| #include <linux/perfmon_kern.h> |
| #include "perfmon_priv.h" |
| |
| #define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */ |
| |
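| /* |
| * default controls: any group may create system-wide or per-thread |
| * sessions, vector arguments from user level are capped at one page, |
| * and the amount of lockable sampling-buffer memory is unlimited (~0). |
| */ |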
| struct pfm_controls pfm_controls = { |
| .sys_group = PFM_GROUP_PERM_ANY, |
| .task_group = PFM_GROUP_PERM_ANY, |
| .arg_mem_max = PAGE_SIZE, |
| .smpl_buffer_mem_max = ~0, |
| }; |
| EXPORT_SYMBOL(pfm_controls); |
| |
| static int __init enable_debug(char *str) |
| { |
| pfm_controls.debug = 1; |
| PFM_INFO("debug output enabled\n"); |
| return 1; |
| } |
| __setup("perfmon_debug", enable_debug); |
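| |
| /* |
| * usage note: booting with "perfmon_debug" on the kernel command line |
| * runs enable_debug() above and turns debug output on from boot. |
| */ |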
| |
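| /* |
| * fault handler for the sampling buffer mapping: the buffer lives in |
| * vmalloc()'ed kernel memory, so the faulting page is looked up with |
| * vmalloc_to_page() and its reference count is raised before it is |
| * handed back to the generic fault code. |
| */ |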
| int pfm_buf_map_pagefault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| { |
| void *kaddr; |
| unsigned long address; |
| struct pfm_context *ctx; |
| size_t size; |
| |
| address = (unsigned long)vmf->virtual_address; |
| |
| ctx = vma->vm_private_data; |
| if (ctx == NULL) { |
| PFM_DBG("no ctx"); |
| return VM_FAULT_SIGBUS; |
| } |
| /* |
| * size available to user (may be different from real_smpl_size) |
| */ |
| size = ctx->smpl_size; |
| |
| if ((address < vma->vm_start) || |
| (address >= (vma->vm_start + size))) |
| return VM_FAULT_SIGBUS; |
| |
| kaddr = ctx->smpl_addr + (address - vma->vm_start); |
| |
| vmf->page = vmalloc_to_page(kaddr); |
| get_page(vmf->page); |
| |
| PFM_DBG("[%d] start=%p ref_count=%d", |
| current->pid, |
| kaddr, page_count(vmf->page)); |
| |
| return 0; |
| } |
| |
| /* |
| * we need to determine whether or not we are closing the last reference |
| * to the file and thus are going to end up in pfm_close() which eventually |
| * calls pfm_release_buf_space(). In that function, we update the accounting |
| * for locked_vm given that we are actually freeing the sampling buffer. The |
| * issue is that there are multiple paths leading to pfm_release_buf_space(), |
| * from exit(), munmap(), close(). The path coming from munmap() is problematic |
| * because do_munmap() grabs mmap_sem in write-mode, which is also what |
| * pfm_release_buf_space() does. To avoid deadlock, we need to determine where |
| * we are calling from and skip the locking. The vm_ops->close() callback |
| * is invoked for each remove_vma() independently of the number of references |
| * left on the file descriptor, therefore a simple reference counter does not |
| * work. We need to determine if this is the last call, and then set a flag |
| * to skip the locking. |
| */ |
| static void pfm_buf_map_close(struct vm_area_struct *vma) |
| { |
| struct file *file; |
| struct pfm_context *ctx; |
| |
| file = vma->vm_file; |
| ctx = vma->vm_private_data; |
| |
| /* |
| * if file is going to close, then pfm_close() will |
| * be called, do not lock in pfm_release_buf |
| */ |
| if (atomic_long_read(&file->f_count) == 1) |
| ctx->flags.mmap_nlock = 1; |
| } |
| |
| /* |
| * the close() callback does not free the buffer: the locked |
| * memory accounting must be done when the actual buffer |
| * is freed. munmap() does not free the pages backing the vma |
| * because they may still be in use by the PMU interrupt handler. |
| */ |
| struct vm_operations_struct pfm_buf_map_vm_ops = { |
| .fault = pfm_buf_map_pagefault, |
| .close = pfm_buf_map_close |
| }; |
| |
| static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma, |
| size_t size) |
| { |
| if (ctx->smpl_addr == NULL) { |
| PFM_DBG("no sampling buffer to map"); |
| return -EINVAL; |
| } |
| |
| if (size > ctx->smpl_size) { |
| PFM_DBG("mmap size=%zu >= actual buf size=%zu", |
| size, |
| ctx->smpl_size); |
| return -EINVAL; |
| } |
| |
| vma->vm_ops = &pfm_buf_map_vm_ops; |
| vma->vm_private_data = ctx; |
| |
| return 0; |
| } |
| |
| static int pfm_mmap(struct file *file, struct vm_area_struct *vma) |
| { |
| size_t size; |
| struct pfm_context *ctx; |
| unsigned long flags; |
| int ret; |
| |
| PFM_DBG("pfm_file_ops"); |
| |
| ctx = file->private_data; |
| size = (vma->vm_end - vma->vm_start); |
| |
| if (ctx == NULL) |
| return -EINVAL; |
| |
| ret = -EINVAL; |
| |
| spin_lock_irqsave(&ctx->lock, flags); |
| |
| if (vma->vm_flags & VM_WRITE) { |
| PFM_DBG("cannot map buffer for writing"); |
| goto done; |
| } |
| |
| PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx", |
| vma->vm_pgoff, |
| size, |
| vma->vm_start); |
| |
| ret = pfm_mmap_buffer(ctx, vma, size); |
| if (ret == 0) |
| vma->vm_flags |= VM_RESERVED; |
| |
| PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu", |
| ret, |
| vma->vm_flags, |
| vma->vm_start, |
| vma->vm_end-vma->vm_start); |
| done: |
| spin_unlock_irqrestore(&ctx->lock, flags); |
| |
| return ret; |
| } |
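| |
| /* |
| * illustrative user-level sketch (an assumption, not part of this file): |
| * with pfm_fd, a descriptor returned by pfm_alloc_fd() below, the |
| * sampling buffer can only be mapped read-only: |
| * |
| *	buf = mmap(NULL, buf_size, PROT_READ, MAP_PRIVATE, pfm_fd, 0); |
| * |
| * asking for PROT_WRITE fails (EINVAL) because of the VM_WRITE |
| * check above. |
| */ |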
| |
| /* |
| * Extract one message from the queue. |
| * |
| * return: |
| * -EAGAIN: when non-blocking and nothing is in the queue. |
| * -ERESTARTSYS: when blocking and a signal is pending |
| * Otherwise returns the size of one message (sizeof(union pfarg_msg)) |
| */ |
| ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block) |
| { |
| ssize_t ret = 0; |
| unsigned long flags; |
| DECLARE_WAITQUEUE(wait, current); |
| |
| /* |
| * we must mask interrupts to avoid a race condition |
| * with the PMU interrupt handler. |
| */ |
| spin_lock_irqsave(&ctx->lock, flags); |
| |
| while (pfm_msgq_is_empty(ctx)) { |
| |
| /* |
| * handle non-blocking reads |
| * return -EAGAIN |
| */ |
| ret = -EAGAIN; |
| if (non_block) |
| break; |
| |
| add_wait_queue(&ctx->msgq_wait, &wait); |
| set_current_state(TASK_INTERRUPTIBLE); |
| |
| spin_unlock_irqrestore(&ctx->lock, flags); |
| |
| schedule(); |
| |
| /* |
| * during this window, another thread may call |
| * pfm_read() and steal our message |
| */ |
| |
| spin_lock_irqsave(&ctx->lock, flags); |
| |
| remove_wait_queue(&ctx->msgq_wait, &wait); |
| set_current_state(TASK_RUNNING); |
| |
| /* |
| * check for pending signals |
| * return -ERESTARTSYS |
| */ |
| ret = -ERESTARTSYS; |
| if (signal_pending(current)) |
| break; |
| |
| /* |
| * we may have a message |
| */ |
| ret = 0; |
| } |
| |
| /* |
| * extract message |
| */ |
| if (ret == 0) { |
| /* |
| * copy the oldest message into msg_buf. |
| * We cannot directly call copy_to_user() |
| * because interrupts are masked. The copy is |
| * done in the caller |
| */ |
| pfm_get_next_msg(ctx, msg_buf); |
| |
| ret = sizeof(*msg_buf); |
| |
| PFM_DBG("extracted type=%d", msg_buf->type); |
| } |
| |
| spin_unlock_irqrestore(&ctx->lock, flags); |
| |
| PFM_DBG("blocking=%d ret=%zd", non_block, ret); |
| |
| return ret; |
| } |
| |
| static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, |
| loff_t *ppos) |
| { |
| struct pfm_context *ctx; |
| union pfarg_msg msg_buf; |
| int non_block, ret; |
| |
| PFM_DBG_ovfl("buf=%p size=%zu", buf, size); |
| |
| ctx = filp->private_data; |
| if (ctx == NULL) { |
| PFM_ERR("no ctx for pfm_read"); |
| return -EINVAL; |
| } |
| |
| non_block = filp->f_flags & O_NONBLOCK; |
| |
| #ifdef CONFIG_IA64_PERFMON_COMPAT |
| /* |
| * detect IA-64 v2.0 context read (message size is different) |
| * a nop on all other architectures |
| */ |
| if (unlikely(ctx->flags.ia64_v20_compat)) |
| return pfm_arch_compat_read(ctx, buf, non_block, size); |
| #endif |
| /* |
| * cannot extract partial messages. |
| * the size is checked even when there is no message queued |
| * |
| * cannot extract more than one message per call. Bytes |
| * above sizeof(msg) are ignored. |
| */ |
| if (size < sizeof(msg_buf)) { |
| PFM_DBG("message is too small size=%zu must be >=%zu)", |
| size, |
| sizeof(msg_buf)); |
| return -EINVAL; |
| } |
| |
| ret = __pfm_read(ctx, &msg_buf, non_block); |
| if (ret > 0) { |
| if (copy_to_user(buf, &msg_buf, sizeof(msg_buf))) |
| ret = -EFAULT; |
| } |
| PFM_DBG_ovfl("ret=%d", ret); |
| return ret; |
| } |
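| |
| /* |
| * illustrative user-level sketch (an assumption, not part of this file): |
| * messages are consumed one whole message per call: |
| * |
| *	union pfarg_msg msg; |
| *	n = read(pfm_fd, &msg, sizeof(msg)); |
| * |
| * a size smaller than sizeof(union pfarg_msg) gets EINVAL, and bytes |
| * beyond one message are ignored. |
| */ |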
| |
| static ssize_t pfm_write(struct file *file, const char __user *ubuf, |
| size_t size, loff_t *ppos) |
| { |
| PFM_DBG("pfm_write called"); |
| return -EINVAL; |
| } |
| |
| static unsigned int pfm_poll(struct file *filp, poll_table *wait) |
| { |
| struct pfm_context *ctx; |
| unsigned long flags; |
| unsigned int mask = 0; |
| |
| PFM_DBG("pfm_file_ops"); |
| |
| if (filp->f_op != &pfm_file_ops) { |
| PFM_ERR("pfm_poll bad magic"); |
| return 0; |
| } |
| |
| ctx = filp->private_data; |
| if (ctx == NULL) { |
| PFM_ERR("pfm_poll no ctx"); |
| return 0; |
| } |
| |
| PFM_DBG("before poll_wait"); |
| |
| poll_wait(filp, &ctx->msgq_wait, wait); |
| |
| /* |
| * pfm_msgq_is_empty() is non-atomic |
| * |
| * filp is protected by fget() at the upper level, so the |
| * context cannot be closed by another thread. |
| * |
| * There may be a race with a PMU interrupt adding |
| * messages to the queue. But we are interested in |
| * queue not empty, so adding more messages should |
| * not really be a problem. |
| * |
| * There may be a race with another thread issuing |
| * a read() and stealing messages from the queue, in which |
| * case we may return the wrong answer. This could potentially |
| * lead to a blocking read, because nothing would be |
| * available in the queue |
| */ |
| spin_lock_irqsave(&ctx->lock, flags); |
| |
| if (!pfm_msgq_is_empty(ctx)) |
| mask = POLLIN | POLLRDNORM; |
| |
| spin_unlock_irqrestore(&ctx->lock, flags); |
| |
| PFM_DBG("after poll_wait mask=0x%x", mask); |
| |
| return mask; |
| } |
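| |
| /* |
| * illustrative user-level sketch (an assumption, not part of this file): |
| * waiting for a notification message with poll(): |
| * |
| *	struct pollfd pfd = { .fd = pfm_fd, .events = POLLIN }; |
| *	ret = poll(&pfd, 1, -1); |
| * |
| * POLLIN | POLLRDNORM is reported as soon as the queue is not empty. |
| */ |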
| |
| static long pfm_ioctl(struct file *file, unsigned int cmd, |
| unsigned long arg) |
| { |
| PFM_DBG("pfm_ioctl called"); |
| return -EINVAL; |
| } |
| |
| /* |
| * interrupts must not be masked when entering this function |
| */ |
| static inline int __pfm_fasync(int fd, struct file *filp, |
| struct pfm_context *ctx, int on) |
| { |
| int ret; |
| |
| PFM_DBG("in fd=%d on=%d async_q=%p", |
| fd, |
| on, |
| ctx->async_queue); |
| |
| ret = fasync_helper(fd, filp, on, &ctx->async_queue); |
| |
| PFM_DBG("out fd=%d on=%d async_q=%p ret=%d", |
| fd, |
| on, |
| ctx->async_queue, ret); |
| |
| return ret; |
| } |
| |
| static int pfm_fasync(int fd, struct file *filp, int on) |
| { |
| struct pfm_context *ctx; |
| int ret; |
| |
| PFM_DBG("pfm_file_ops"); |
| |
| ctx = filp->private_data; |
| if (ctx == NULL) { |
| PFM_ERR("pfm_fasync no ctx"); |
| return -EBADF; |
| } |
| |
| /* |
| * we cannot mask interrupts during this call because it |
| * may go to sleep if memory is not readily available. |
| * |
| * We are protected from the context disappearing by the |
| * get_fd()/put_fd() done in the caller. Serialization of this function |
| * is ensured by the caller. |
| */ |
| ret = __pfm_fasync(fd, filp, ctx, on); |
| |
| PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d", |
| fd, |
| on, |
| ctx->async_queue, ret); |
| |
| return ret; |
| } |
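| |
| /* |
| * illustrative user-level sketch (an assumption, not part of this file): |
| * requesting SIGIO delivery when a message is queued: |
| * |
| *	fcntl(pfm_fd, F_SETOWN, getpid()); |
| *	fcntl(pfm_fd, F_SETFL, fcntl(pfm_fd, F_GETFL) | O_ASYNC); |
| * |
| * setting O_ASYNC is what invokes the ->fasync() callback above. |
| */ |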
| |
| #ifdef CONFIG_SMP |
| static void __pfm_close_remote_cpu(void *info) |
| { |
| struct pfm_context *ctx = info; |
| int can_release; |
| |
| BUG_ON(ctx != __get_cpu_var(pmu_ctx)); |
| |
| /* |
| * we are in an IPI interrupt handler, which always has higher |
| * priority than the PMU interrupt, therefore we do not need to |
| * mask interrupts. context locking is not needed because we |
| * are in close(), i.e., there are no more user references. |
| * |
| * can_release is ignored, the release is done on the calling CPU |
| */ |
| __pfm_unload_context(ctx, &can_release); |
| |
| /* |
| * we cannot free context here because we are in_interrupt(). |
| * we free on the calling CPU |
| */ |
| } |
| |
| static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx) |
| { |
| BUG_ON(irqs_disabled()); |
| return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 1); |
| } |
| #endif /* CONFIG_SMP */ |
| |
| /* |
| * called either on explicit close() or from exit_files(). |
| * Only the LAST user of the file gets to this point, i.e., it is |
| * called only ONCE. |
| * |
| * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero |
| * (fput()), i.e., by the last task to access the file. Nobody else can access the |
| * file at this point. |
| * |
| * When called from exit_files(), the VMA has been freed because exit_mm() |
| * is executed before exit_files(). |
| * |
| * When called from exit_files(), the current task is not yet a ZOMBIE, but |
| * we still flush the PMU state to the context. |
| */ |
| int __pfm_close(struct pfm_context *ctx, struct file *filp) |
| { |
| unsigned long flags; |
| int state; |
| int can_free = 1, can_unload = 1; |
| int is_system, can_release = 0; |
| u32 cpu; |
| |
| /* |
| * no risk of ctx or filp disappearing, so we can operate outside |
| * of spin_lock(). fasync_helper() runs with interrupts masked, |
| * thus there is no risk with the PMU interrupt handler |
| * |
| * In case of zombie, we will not have the async struct anymore |
| * thus kill_fasync() will not do anything |
| * |
| * fd is not used when removing the entry so we pass -1 |
| */ |
| if (filp->f_flags & FASYNC) |
| __pfm_fasync(-1, filp, ctx, 0); |
| |
| spin_lock_irqsave(&ctx->lock, flags); |
| |
| state = ctx->state; |
| is_system = ctx->flags.system; |
| cpu = ctx->cpu; |
| |
| PFM_DBG("state=%d", state); |
| |
| /* |
| * check if unload is needed |
| */ |
| if (state == PFM_CTX_UNLOADED) |
| goto doit; |
| |
| #ifdef CONFIG_SMP |
| /* |
| * we need to release the resource on the ORIGINAL cpu. |
| * we need to release the context lock to avoid deadlocks |
| * on the original CPU, especially in the context switch |
| * routines. It is safe to unlock because we are in close(), |
| * in other words, there is no more access from user level. |
| * we can also unmask interrupts on this CPU because the |
| * context is running on the original CPU. Context will be |
| * unloaded and the session will be released on the original |
| * CPU. Upon return, the caller is guaranteed that the context |
| * is gone from original CPU. |
| */ |
| if (is_system && cpu != smp_processor_id()) { |
| spin_unlock_irqrestore(&ctx->lock, flags); |
| pfm_close_remote_cpu(cpu, ctx); |
| can_release = 1; |
| goto free_it; |
| } |
| |
| if (!is_system && ctx->task != current) { |
| /* |
| * switch context to zombie state |
| */ |
| ctx->state = PFM_CTX_ZOMBIE; |
| |
| PFM_DBG("zombie ctx for [%d]", ctx->task->pid); |
| /* |
| * must check if the other thread is using the blocking overflow |
| * notification mode. If so, make sure it will not block, |
| * because there will not be any pfm_restart() issued. |
| * When the thread notices the ZOMBIE state, it will clean |
| * up what is left of the context |
| */ |
| if (state == PFM_CTX_MASKED && ctx->flags.block) { |
| /* |
| * force task to wake up from MASKED state |
| */ |
| PFM_DBG("waking up [%d]", ctx->task->pid); |
| |
| complete(&ctx->restart_complete); |
| } |
| /* |
| * the PMU session will be released by the monitored task when it |
| * notices the ZOMBIE state as part of pfm_unload_context() |
| */ |
| can_unload = can_free = 0; |
| } |
| #endif |
| if (can_unload) |
| __pfm_unload_context(ctx, &can_release); |
| doit: |
| spin_unlock_irqrestore(&ctx->lock, flags); |
| |
| #ifdef CONFIG_SMP |
| free_it: |
| #endif |
| if (can_release) |
| pfm_session_release(is_system, cpu); |
| |
| if (can_free) |
| pfm_free_context(ctx); |
| |
| return 0; |
| } |
| |
| static int pfm_close(struct inode *inode, struct file *filp) |
| { |
| struct pfm_context *ctx; |
| |
| PFM_DBG("called filp=%p", filp); |
| |
| ctx = filp->private_data; |
| if (ctx == NULL) { |
| PFM_ERR("no ctx"); |
| return -EBADF; |
| } |
| return __pfm_close(ctx, filp); |
| } |
| |
| static int pfm_no_open(struct inode *irrelevant, struct file *dontcare) |
| { |
| PFM_DBG("pfm_file_ops"); |
| |
| return -ENXIO; |
| } |
| |
| |
| const struct file_operations pfm_file_ops = { |
| .llseek = no_llseek, |
| .read = pfm_read, |
| .write = pfm_write, |
| .poll = pfm_poll, |
| .unlocked_ioctl = pfm_ioctl, |
| .open = pfm_no_open, /* special open to disallow open via /proc */ |
| .fasync = pfm_fasync, |
| .release = pfm_close, |
| .mmap = pfm_mmap |
| }; |
| |
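| /* |
| * allocate a new file descriptor bound to the context: anon_inode_getfd() |
| * attaches pfm_file_ops to an anonymous inode, so no dedicated perfmon |
| * filesystem is needed. Returns the new fd or a negative error code. |
| */ |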
| int pfm_alloc_fd(struct pfm_context *ctx) |
| { |
| return anon_inode_getfd("[pfmfd]", &pfm_file_ops, ctx, O_RDONLY); |
| } |