/*
* perfmon_file.c: perfmon2 file input/output functions
*
* This file implements the perfmon2 interface which
* provides access to the hardware performance counters
* of the host processor.
*
* The initial version of perfmon.c was written by
* Ganesh Venkitachalam, IBM Corp.
*
* Then it was modified for perfmon-1.x by Stephane Eranian and
* David Mosberger, Hewlett Packard Co.
*
* Version Perfmon-2.x is a complete rewrite of perfmon-1.x
* by Stephane Eranian, Hewlett Packard Co.
*
* Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
* Contributed by Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*
* More information about perfmon available at:
* http://perfmon2.sf.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
* 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
#include <linux/anon_inodes.h>
#include <linux/perfmon_kern.h>
#include "perfmon_priv.h"
#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */
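/*
 * default values for the perfmon control knobs: group permissions
 * for system-wide and per-thread sessions, and the memory limits
 * for syscall arguments and sampling buffers
 */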
struct pfm_controls pfm_controls = {
.sys_group = PFM_GROUP_PERM_ANY,
.task_group = PFM_GROUP_PERM_ANY,
.arg_mem_max = PAGE_SIZE,
.smpl_buffer_mem_max = ~0,
};
EXPORT_SYMBOL(pfm_controls);
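/*
 * "perfmon_debug" boot command line option: enable debug output
 * as early as possible
 */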
static int __init enable_debug(char *str)
{
pfm_controls.debug = 1;
PFM_INFO("debug output enabled\n");
return 1;
}
__setup("perfmon_debug", enable_debug);
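/*
 * fault handler for the sampling buffer vma: translate the faulting
 * user address into the backing page of the vmalloc()'ed kernel
 * buffer and take a reference on that page
 */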
int pfm_buf_map_pagefault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
void *kaddr;
unsigned long address;
struct pfm_context *ctx;
size_t size;
address = (unsigned long)vmf->virtual_address;
ctx = vma->vm_private_data;
if (ctx == NULL) {
PFM_DBG("no ctx");
return VM_FAULT_SIGBUS;
}
/*
 * size available to user (may be different from real_smpl_size)
 */
size = ctx->smpl_size;
if ((address < vma->vm_start) ||
(address >= (vma->vm_start + size)))
return VM_FAULT_SIGBUS;
kaddr = ctx->smpl_addr + (address - vma->vm_start);
vmf->page = vmalloc_to_page(kaddr);
get_page(vmf->page);
PFM_DBG("[%d] start=%p ref_count=%d",
current->pid,
kaddr, page_count(vmf->page));
return 0;
}
/*
 * we need to determine whether or not we are closing the last reference
 * to the file and thus are going to end up in pfm_close(), which eventually
 * calls pfm_release_buf_space(). In that function, we update the accounting
 * for locked_vm given that we are actually freeing the sampling buffer. The
 * issue is that there are multiple paths leading to pfm_release_buf_space():
 * exit(), munmap(), close(). The path coming from munmap() is problematic
 * because do_munmap() grabs mmap_sem in write mode, which is also what
 * pfm_release_buf_space() does. To avoid the deadlock, we need to determine
 * where we are called from and skip the locking. The vm_ops->close() callback
 * is invoked for each remove_vma() independently of the number of references
 * left on the file descriptor, therefore a simple reference counter does not
 * work. We need to determine if this is the last call, and then set a flag
 * to skip the locking.
 */
static void pfm_buf_map_close(struct vm_area_struct *vma)
{
struct file *file;
struct pfm_context *ctx;
file = vma->vm_file;
ctx = vma->vm_private_data;
/*
 * if the file is about to close, then pfm_close() will
 * be called; do not take mmap_sem in pfm_release_buf_space()
 */
if (atomic_long_read(&file->f_count) == 1)
ctx->flags.mmap_nlock = 1;
}
/*
 * the locked memory accounting is not done in the close()
 * callback because it must be done when the actual buffer
 * is freed. munmap() does not free the pages backing the vma
 * because they may still be in use by the PMU interrupt handler.
 */
struct vm_operations_struct pfm_buf_map_vm_ops = {
.fault = pfm_buf_map_pagefault,
.close = pfm_buf_map_close
};
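/*
 * validate an mmap() request against the sampling buffer and connect
 * the vma to the buffer mapping operations. Called by pfm_mmap() with
 * the context lock held
 */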
static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma,
size_t size)
{
if (ctx->smpl_addr == NULL) {
PFM_DBG("no sampling buffer to map");
return -EINVAL;
}
if (size > ctx->smpl_size) {
PFM_DBG("mmap size=%zu > actual buf size=%zu",
size,
ctx->smpl_size);
return -EINVAL;
}
vma->vm_ops = &pfm_buf_map_vm_ops;
vma->vm_private_data = ctx;
return 0;
}
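/*
 * mmap file operation: map the sampling buffer read-only into user
 * space. VM_RESERVED marks the mapping so its pages are never swapped
 * out; they may still be accessed from the PMU interrupt handler
 */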
static int pfm_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t size;
struct pfm_context *ctx;
unsigned long flags;
int ret;
PFM_DBG("pfm_file_ops");
ctx = file->private_data;
size = (vma->vm_end - vma->vm_start);
if (ctx == NULL)
return -EINVAL;
ret = -EINVAL;
spin_lock_irqsave(&ctx->lock, flags);
if (vma->vm_flags & VM_WRITE) {
PFM_DBG("cannot map buffer for writing");
goto done;
}
PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx",
vma->vm_pgoff,
size,
vma->vm_start);
ret = pfm_mmap_buffer(ctx, vma, size);
if (ret == 0)
vma->vm_flags |= VM_RESERVED;
PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu",
ret,
vma->vm_flags,
vma->vm_start,
vma->vm_end-vma->vm_start);
done:
spin_unlock_irqrestore(&ctx->lock, flags);
return ret;
}
/*
 * Extract one message from the queue.
 *
 * return:
 * -EAGAIN: when non-blocking and nothing is in the queue.
 * -ERESTARTSYS: when blocking and a signal is pending
 * Otherwise returns the size of the message (sizeof(union pfarg_msg))
 */
ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block)
{
ssize_t ret = 0;
unsigned long flags;
DECLARE_WAITQUEUE(wait, current);
/*
 * we must mask interrupts to avoid a race condition
 * with the PMU interrupt handler.
 */
spin_lock_irqsave(&ctx->lock, flags);
while (pfm_msgq_is_empty(ctx)) {
/*
* handle non-blocking reads
* return -EAGAIN
*/
ret = -EAGAIN;
if (non_block)
break;
add_wait_queue(&ctx->msgq_wait, &wait);
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&ctx->lock, flags);
schedule();
/*
* during this window, another thread may call
* pfm_read() and steal our message
*/
spin_lock_irqsave(&ctx->lock, flags);
remove_wait_queue(&ctx->msgq_wait, &wait);
set_current_state(TASK_RUNNING);
/*
* check for pending signals
* return -ERESTARTSYS
*/
ret = -ERESTARTSYS;
if (signal_pending(current))
break;
/*
* we may have a message
*/
ret = 0;
}
/*
* extract message
*/
if (ret == 0) {
/*
 * copy the oldest message into msg_buf.
 * We cannot call copy_to_user() directly
 * because interrupts are masked; the copy to
 * user space is done by the caller
 */
pfm_get_next_msg(ctx, msg_buf);
ret = sizeof(*msg_buf);
PFM_DBG("extracted type=%d", msg_buf->type);
}
spin_unlock_irqrestore(&ctx->lock, flags);
PFM_DBG("blocking=%d ret=%zd", non_block, ret);
return ret;
}
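/*
 * read file operation: extract at most one notification message per
 * call. A minimal user-level consumer could look like the sketch
 * below (ctx_fd and handle_message() are hypothetical names; ctx_fd
 * stands for a context descriptor obtained from the perfmon2 context
 * creation call):
 *
 *	union pfarg_msg msg;
 *
 *	if (read(ctx_fd, &msg, sizeof(msg)) == sizeof(msg))
 *		handle_message(msg.type);
 */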
static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size,
loff_t *ppos)
{
struct pfm_context *ctx;
union pfarg_msg msg_buf;
int non_block, ret;
PFM_DBG_ovfl("buf=%p size=%zu", buf, size);
ctx = filp->private_data;
if (ctx == NULL) {
PFM_ERR("no ctx for pfm_read");
return -EINVAL;
}
non_block = filp->f_flags & O_NONBLOCK;
#ifdef CONFIG_IA64_PERFMON_COMPAT
/*
 * detect an IA-64 perfmon v2.0 context read (the message size is
 * different). This is a nop on all other architectures.
 */
if (unlikely(ctx->flags.ia64_v20_compat))
return pfm_arch_compat_read(ctx, buf, non_block, size);
#endif
/*
 * partial messages cannot be extracted;
 * the size is checked even when there is no message queued.
 *
 * at most one message is extracted per call. Bytes
 * beyond sizeof(msg_buf) are ignored.
 */
if (size < sizeof(msg_buf)) {
PFM_DBG("buffer too small: size=%zu must be >=%zu",
size,
sizeof(msg_buf));
return -EINVAL;
}
ret = __pfm_read(ctx, &msg_buf, non_block);
if (ret > 0) {
if (copy_to_user(buf, &msg_buf, sizeof(msg_buf)))
ret = -EFAULT;
}
PFM_DBG_ovfl("ret=%d", ret);
return ret;
}
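/*
 * write() is not supported on a perfmon context: the call always
 * fails with -EINVAL
 */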
static ssize_t pfm_write(struct file *file, const char __user *ubuf,
size_t size, loff_t *ppos)
{
PFM_DBG("pfm_write called");
return -EINVAL;
}
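/*
 * poll file operation: report the context fd as readable whenever
 * the notification message queue is not empty
 */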
static unsigned int pfm_poll(struct file *filp, poll_table *wait)
{
struct pfm_context *ctx;
unsigned long flags;
unsigned int mask = 0;
PFM_DBG("pfm_file_ops");
if (filp->f_op != &pfm_file_ops) {
PFM_ERR("pfm_poll bad magic");
return 0;
}
ctx = filp->private_data;
if (ctx == NULL) {
PFM_ERR("pfm_poll no ctx");
return 0;
}
PFM_DBG("before poll_wait");
poll_wait(filp, &ctx->msgq_wait, wait);
/*
 * pfm_msgq_is_empty() is non-atomic
 *
 * filp is protected by fget() at the upper level, so the
 * context cannot be closed by another thread.
 *
 * There may be a race with a PMU interrupt adding
 * messages to the queue. But we are interested in
 * "queue not empty", so adding more messages is
 * not really a problem.
 *
 * There may be a race with another thread issuing
 * a read() and stealing messages from the queue, in
 * which case we may return the wrong answer. This could
 * potentially lead to a blocking read, because nothing is
 * available in the queue anymore
 */
spin_lock_irqsave(&ctx->lock, flags);
if (!pfm_msgq_is_empty(ctx))
mask = POLLIN | POLLRDNORM;
spin_unlock_irqrestore(&ctx->lock, flags);
PFM_DBG("after poll_wait mask=0x%x", mask);
return mask;
}
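/*
 * no ioctl is supported on a perfmon context: the call always
 * fails with -EINVAL
 */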
static long pfm_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
PFM_DBG("pfm_ioctl called");
return -EINVAL;
}
/*
 * interrupts must not be masked when entering this function
 */
static inline int __pfm_fasync(int fd, struct file *filp,
struct pfm_context *ctx, int on)
{
int ret;
PFM_DBG("in fd=%d on=%d async_q=%p",
fd,
on,
ctx->async_queue);
ret = fasync_helper(fd, filp, on, &ctx->async_queue);
PFM_DBG("out fd=%d on=%d async_q=%p ret=%d",
fd,
on,
ctx->async_queue, ret);
return ret;
}
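/*
 * fasync file operation: (un)register the caller on the async queue
 * used by kill_fasync() for SIGIO delivery
 */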
static int pfm_fasync(int fd, struct file *filp, int on)
{
struct pfm_context *ctx;
int ret;
PFM_DBG("pfm_file_ops");
ctx = filp->private_data;
if (ctx == NULL) {
PFM_ERR("pfm_fasync no ctx");
return -EBADF;
}
/*
 * we cannot mask interrupts during this call because it
 * may go to sleep if memory is not readily available.
 *
 * We are protected from the context disappearing by the
 * get_fd()/put_fd() done in the caller. Serialization of this
 * function is ensured by the caller.
 */
ret = __pfm_fasync(fd, filp, ctx, on);
PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d",
fd,
on,
ctx->async_queue, ret);
return ret;
}
#ifdef CONFIG_SMP
static void __pfm_close_remote_cpu(void *info)
{
struct pfm_context *ctx = info;
int can_release;
BUG_ON(ctx != __get_cpu_var(pmu_ctx));
/*
 * we are in the IPI interrupt handler, which always has a higher
 * priority than the PMU interrupt, therefore we do not need to
 * mask interrupts. Context locking is not needed because we
 * are in close(): there are no more user references.
 *
 * can_release is ignored, the release is done on the calling CPU
 */
__pfm_unload_context(ctx, &can_release);
/*
* we cannot free context here because we are in_interrupt().
* we free on the calling CPU
*/
}
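/*
 * unload a system-wide context still bound to another CPU: run
 * __pfm_close_remote_cpu() on that CPU and wait for it to complete
 */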
static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx)
{
BUG_ON(irqs_disabled());
return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 1);
}
#endif /* CONFIG_SMP */
/*
 * called either on explicit close() or from exit_files().
 * Only the LAST user of the file gets to this point, i.e., it is
 * called only ONCE.
 *
 * IMPORTANT: we get called ONLY when the refcount on the file gets
 * to zero (fput()), i.e., for the last task to access the file.
 * Nobody else can access the file at this point.
 *
 * When called from exit_files(), the VMA has already been freed
 * because exit_mm() is executed before exit_files().
 *
 * When called from exit_files(), the current task is not yet a
 * ZOMBIE, but we flush the PMU state to the context anyway.
 */
int __pfm_close(struct pfm_context *ctx, struct file *filp)
{
unsigned long flags;
int state;
int can_free = 1, can_unload = 1;
int is_system, can_release = 0;
u32 cpu;
/*
 * no risk of ctx or filp disappearing, so we can operate outside
 * of the spin_lock(). fasync_helper() runs with interrupts masked,
 * thus there is no risk with the PMU interrupt handler
 *
 * In the zombie case, we no longer have the async struct,
 * thus kill_fasync() will not do anything
 *
 * fd is not used when removing the entry, so we pass -1
 */
if (filp->f_flags & FASYNC)
__pfm_fasync(-1, filp, ctx, 0);
spin_lock_irqsave(&ctx->lock, flags);
state = ctx->state;
is_system = ctx->flags.system;
cpu = ctx->cpu;
PFM_DBG("state=%d", state);
/*
* check if unload is needed
*/
if (state == PFM_CTX_UNLOADED)
goto doit;
#ifdef CONFIG_SMP
/*
 * we need to release the resource on the ORIGINAL CPU.
 * we need to release the context lock to avoid deadlocks
 * on the original CPU, especially in the context switch
 * routines. It is safe to unlock because we are in close(),
 * in other words, there is no more access from user level.
 * we can also unmask interrupts on this CPU because the
 * context is running on the original CPU. The context will be
 * unloaded and the session released on the original CPU.
 * Upon return, the caller is guaranteed that the context
 * is gone from the original CPU.
 */
if (is_system && cpu != smp_processor_id()) {
spin_unlock_irqrestore(&ctx->lock, flags);
pfm_close_remote_cpu(cpu, ctx);
can_release = 1;
goto free_it;
}
if (!is_system && ctx->task != current) {
/*
* switch context to zombie state
*/
ctx->state = PFM_CTX_ZOMBIE;
PFM_DBG("zombie ctx for [%d]", ctx->task->pid);
/*
 * must check if the other thread is using the blocking overflow
 * notification mode. If so, make sure it will not block,
 * because there will not be any pfm_restart() issued.
 * When the thread notices the ZOMBIE state, it will clean
 * up what is left of the context
 */
if (state == PFM_CTX_MASKED && ctx->flags.block) {
/*
* force task to wake up from MASKED state
*/
PFM_DBG("waking up [%d]", ctx->task->pid);
complete(&ctx->restart_complete);
}
/*
 * the PMU session will be released by the monitored task when it
 * notices the ZOMBIE state as part of pfm_unload_context()
 */
can_unload = can_free = 0;
}
#endif
if (can_unload)
__pfm_unload_context(ctx, &can_release);
doit:
spin_unlock_irqrestore(&ctx->lock, flags);
#ifdef CONFIG_SMP
free_it:
#endif
if (can_release)
pfm_session_release(is_system, cpu);
if (can_free)
pfm_free_context(ctx);
return 0;
}
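/*
 * release file operation: invoked by fput() when the last reference
 * to the file is dropped. The actual teardown is done in __pfm_close()
 */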
static int pfm_close(struct inode *inode, struct file *filp)
{
struct pfm_context *ctx;
PFM_DBG("called filp=%p", filp);
ctx = filp->private_data;
if (ctx == NULL) {
PFM_ERR("no ctx");
return -EBADF;
}
return __pfm_close(ctx, filp);
}
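/*
 * a perfmon context fd cannot be (re)opened, e.g., via /proc/pid/fd;
 * the only valid descriptor is the one created by pfm_alloc_fd()
 */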
static int pfm_no_open(struct inode *irrelevant, struct file *dontcare)
{
PFM_DBG("pfm_file_ops");
return -ENXIO;
}
const struct file_operations pfm_file_ops = {
.llseek = no_llseek,
.read = pfm_read,
.write = pfm_write,
.poll = pfm_poll,
.unlocked_ioctl = pfm_ioctl,
.open = pfm_no_open, /* special open to disallow open via /proc */
.fasync = pfm_fasync,
.release = pfm_close,
.mmap = pfm_mmap
};
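/*
 * allocate the read-only file descriptor backing a context, using an
 * anonymous inode. This descriptor is the handle user space uses for
 * all subsequent read()/poll()/mmap() operations on the context
 */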
int pfm_alloc_fd(struct pfm_context *ctx)
{
return anon_inode_getfd("[pfmfd]", &pfm_file_ops, ctx, O_RDONLY);
}