blob: 71d05133f556000530d930537c9dc14c543037c9 [file] [log] [blame]
/*
* This file implements the perfmon-2 subsystem which is used
* to program the IA-64 Performance Monitoring Unit (PMU).
*
* The initial version of perfmon.c was written by
* Ganesh Venkitachalam, IBM Corp.
*
* Then it was modified for perfmon-1.x by Stephane Eranian and
* David Mosberger, Hewlett Packard Co.
*
* Version Perfmon-2.x is a rewrite of perfmon-1.x
* by Stephane Eranian, Hewlett Packard Co.
*
* Copyright (C) 1999-2005 Hewlett Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*
* More information about perfmon available at:
* http://www.hpl.hp.com/research/linux/perfmon
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/vfs.h>
#include <linux/smp.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/rcupdate.h>
#include <linux/completion.h>
#include <asm/errno.h>
#include <asm/intrinsics.h>
#include <asm/page.h>
#include <asm/perfmon.h>
#include <asm/processor.h>
#include <asm/signal.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#ifdef CONFIG_PERFMON
/*
 * perfmon context state (value stored in pfm_context_t.ctx_state)
 */
#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */
#define PFM_CTX_LOADED 2 /* context is loaded onto a task */
#define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */
#define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */
/* value given to ctx_last_activation when a context is allocated (see pfm_context_alloc()) */
#define PFM_INVALID_ACTIVATION (~0UL)
/* these two constants size ctx_pmcs[]/th_pmcs[] and ctx_pmds[]/th_pmds[] in pfm_context_t */
#define PFM_NUM_PMC_REGS 64 /* PMC save area for ctxsw */
#define PFM_NUM_PMD_REGS 64 /* PMD save area for ctxsw */
/*
 * depth of message queue
 *
 * ctx_msgq[] is used as a circular buffer: pfm_get_new_msg() refuses to
 * advance the tail onto the head, so at most PFM_MAX_MSGS-1 messages are
 * queued at any one time.
 */
#define PFM_MAX_MSGS 32
/* true when the message queue of context g holds no message (head == tail) */
#define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail)
/*
 * type of a PMU register (bitmask).
 * bitmask structure:
 *	bit0   : register implemented
 *	bit1   : end marker
 *	bit2-3 : reserved
 *	bit4   : pmc has pmc.pm
 *	bit5   : pmc controls a counter (has pmc.oi), pmd is used as counter
 *	bit6-7 : register type
 *	bit8-31: reserved
 *
 * Every type other than NOTIMPL and END also carries PFM_REG_IMPL, and
 * PFM_REG_COUNTING carries PFM_REG_MONITOR, so a counting register also
 * satisfies the "monitor" and "implemented" tests below.
 */
#define PFM_REG_NOTIMPL		0x0 /* not implemented at all */
#define PFM_REG_IMPL		0x1 /* register implemented */
#define PFM_REG_END		0x2 /* end marker */
#define PFM_REG_MONITOR		(0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
#define PFM_REG_COUNTING	(0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */
#define PFM_REG_CONTROL		(0x4<<4|PFM_REG_IMPL) /* PMU control register */
#define PFM_REG_CONFIG		(0x8<<4|PFM_REG_IMPL) /* configuration register */
#define PFM_REG_BUFFER		(0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */

/* non-zero when descriptor i is the end marker of the pmc/pmd description table */
#define PMC_IS_LAST(i)	(pmu_conf->pmc_desc[i].type & PFM_REG_END)
#define PMD_IS_LAST(i)	(pmu_conf->pmd_desc[i].type & PFM_REG_END)

/* non-zero when PMD i was programmed with the PFM_REGFL_OVFL_NOTIFY flag */
#define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)

/*
 * i assumed unsigned.
 *
 * The range check comes first so the descriptor table is never indexed
 * with an out-of-range value. The argument is parenthesized in the
 * comparison so that expressions such as (x & 1) keep their meaning.
 * i is still evaluated twice: do not pass expressions with side effects.
 */
#define PMC_IS_IMPL(i)	  ((i) < PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL))
#define PMD_IS_IMPL(i)	  ((i) < PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL))

/* XXX: these assume that register i is implemented */
#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
#define PMC_IS_MONITOR(i)  ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR)  == PFM_REG_MONITOR)
#define PMC_IS_CONTROL(i)  ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL)  == PFM_REG_CONTROL)

/* direct accessors to fields of the register descriptors (usable as lvalues) */
#define PMC_DFL_VAL(i)     pmu_conf->pmc_desc[i].default_value
#define PMC_RSVD_MASK(i)   pmu_conf->pmc_desc[i].reserved_mask
#define PMD_PMD_DEP(i)	   pmu_conf->pmd_desc[i].dep_pmd[0]
#define PMC_PMD_DEP(i)	   pmu_conf->pmc_desc[i].dep_pmd[0]
/* number of instruction/data debug registers handled by perfmon */
#define PFM_NUM_IBRS IA64_NUM_DBG_REGS
#define PFM_NUM_DBRS IA64_NUM_DBG_REGS
/* true when the context was NOT created in blocking-notification mode */
#define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0)
/* non-zero when the context has the is_sampling flag set */
#define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling)
/* task the context is attached to */
#define PFM_CTX_TASK(h) (h)->ctx_task
#define PMU_PMC_OI 5 /* position of pmc.oi bit */
/* XXX: does not support more than 64 PMDs */
/*
 * The macros below only touch word 0 of the 4-word bitmasks. The
 * CTX_USED_* forms expand to a bare |= expression: use them as
 * statements only.
 */
#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask)
/*
 * NOTE(review): ctx_used_ibrs[]/ctx_used_dbrs[] are declared as
 * unsigned int[1] in pfm_context_t while the bit is computed as a
 * 64-bit value (1UL << (n % 64)); bits 32-63 would be truncated.
 * Presumably harmless because n < IA64_NUM_DBG_REGS -- confirm.
 */
#define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
#define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
/* true when the context has its using_dbreg flag set; ctx may be an untyped pointer */
#define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
#define PFM_CODE_RR 0 /* requesting code range restriction */
#define PFM_DATA_RR 1 /* requesting data range restriction */
/* clear/set/read bits of the per-CPU pfm_syst_info word of the executing CPU */
#define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v)
#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v)
#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info)
/* single-bit mask for register x, used to build register dependency bitmasks */
#define RDEP(x) (1UL<<(x))
/*
 * context protection macros
 * in SMP:
 *	- we need to protect against CPU concurrency (spin_lock)
 *	- we need to protect against PMU overflow interrupts (local_irq_disable)
 * in UP:
 *	- we need to protect against PMU overflow interrupts (local_irq_disable)
 *
 * spin_lock_irqsave()/spin_unlock_irqrestore():
 *	in SMP: local_irq_disable + spin_lock
 *	in UP : local_irq_disable
 *
 * spin_lock()/spin_unlock():
 *	in UP : removed automatically
 *	in SMP: protect against context accesses from other CPU. interrupts
 *	        are not masked. This is useful for the PMU interrupt handler
 *	        because we know we will not get PMU concurrency in that code.
 *
 * In all macros below, c is a pfm_context_t pointer and f is an
 * unsigned long lvalue that receives/provides the saved interrupt flags.
 */
/* lock the context with interrupts disabled; emits DPRINT traces before and after */
#define PROTECT_CTX(c, f) \
do { \
DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, task_pid_nr(current))); \
spin_lock_irqsave(&(c)->ctx_lock, f); \
DPRINT(("spinlocked ctx %p by [%d]\n", c, task_pid_nr(current))); \
} while(0)
/* counterpart of PROTECT_CTX() */
#define UNPROTECT_CTX(c, f) \
do { \
DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, task_pid_nr(current))); \
spin_unlock_irqrestore(&(c)->ctx_lock, f); \
} while(0)
/* same as PROTECT_CTX()/UNPROTECT_CTX() but without any debug output */
#define PROTECT_CTX_NOPRINT(c, f) \
do { \
spin_lock_irqsave(&(c)->ctx_lock, f); \
} while(0)
#define UNPROTECT_CTX_NOPRINT(c, f) \
do { \
spin_unlock_irqrestore(&(c)->ctx_lock, f); \
} while(0)
/* plain spin_lock()/spin_unlock() on the context: interrupt state is left untouched */
#define PROTECT_CTX_NOIRQ(c) \
do { \
spin_lock(&(c)->ctx_lock); \
} while(0)
#define UNPROTECT_CTX_NOIRQ(c) \
do { \
spin_unlock(&(c)->ctx_lock); \
} while(0)
/*
 * Activation number helpers.
 *
 * With CONFIG_SMP they read/increment the per-CPU pmu_activation_number
 * and record it in the context (ctx_last_activation). Without CONFIG_SMP
 * all three expand to an empty statement; in that configuration they
 * cannot be used where a value is expected.
 */
#ifdef CONFIG_SMP
#define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)
#define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++
#define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION()
#else /* !CONFIG_SMP */
#define SET_ACTIVATION(t) do {} while(0)
#define GET_ACTIVATION(t) do {} while(0)
#define INC_ACTIVATION(t) do {} while(0)
#endif /* CONFIG_SMP */
/* record on the executing CPU which task (t) and which context (c) own the PMU */
#define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0)
#define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner)
#define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx)
/* take/release the global session lock with interrupts disabled; g holds the saved flags */
#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g)
#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g)
/* replace the return-flag bits (PFM_REG_RETFL_MASK) of flags with val; flags must be an lvalue */
#define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)
/*
 * cmp0 must be the value of pmc0.
 *
 * Evaluates to a non-zero value when any bit other than bit 0 is set.
 * Bit 0 is the bit written by pfm_freeze_pmu()/pfm_unfreeze_pmu() and
 * is ignored here. The argument is parenthesized so that compound
 * expressions (e.g. a | b) are masked as a whole.
 */
#define PMC0_HAS_OVFL(cmp0)  ((cmp0) & ~0x1UL)
/* magic number handed to get_sb_pseudo() for the pfmfs pseudo filesystem (see pfmfs_get_sb()) */
#define PFMFS_MAGIC 0xa0b4d889
/*
 * debugging
 *
 * DPRINT()/DPRINT_ovfl() take a fully parenthesized printk() argument
 * list, e.g. DPRINT(("x=%d\n", x)). Output is prefixed with function,
 * line, CPU and current pid, and is only produced when pfm_sysctl.debug
 * (and, for DPRINT_ovfl, also pfm_sysctl.debug_ovfl) is greater than 0.
 * PFM_DEBUGGING is defined unconditionally here; no fallback definition
 * is provided in this part of the file for the case where it is not.
 */
#define PFM_DEBUGGING 1
#ifdef PFM_DEBUGGING
#define DPRINT(a) \
do { \
if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __func__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \
} while (0)
#define DPRINT_ovfl(a) \
do { \
if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __func__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \
} while (0)
#endif
/*
 * 64-bit software counter structure
 *
 * One instance exists per PMD in pfm_context_t.ctx_pmds[]. For counting
 * PMDs, val holds the bits above the hardware counter width
 * (pmu_conf->ovfl_val) while the low bits live in the hardware register
 * or in th_pmds[] (see pfm_copy_pmds(), pfm_read_soft_counter()).
 *
 * the next_reset_type is applied to the next call to pfm_reset_regs()
 */
typedef struct {
unsigned long val; /* virtual 64bit counter value */
unsigned long lval; /* last reset value */
unsigned long long_reset; /* reset value on sampling overflow */
unsigned long short_reset; /* reset value on overflow */
unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */
unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */
unsigned long seed; /* seed for random-number generator */
unsigned long mask; /* mask for random-number generator */
unsigned int flags; /* notify/do not notify */
unsigned long eventid; /* overflow event identifier */
} pfm_counter_t;
/*
 * context flags
 *
 * Packed into a single 32-bit word (the bit-field widths add up to 32).
 * Fields are accessed through the ctx_fl_* aliases defined after
 * pfm_context_t.
 */
typedef struct {
unsigned int block:1; /* when 1, task will blocked on user notifications */
unsigned int system:1; /* do system wide monitoring */
unsigned int using_dbreg:1; /* using range restrictions (debug registers) */
unsigned int is_sampling:1; /* true if using a custom format */
unsigned int excl_idle:1; /* exclude idle task in system wide session */
unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */
unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */
unsigned int no_msg:1; /* no message sent on overflow */
unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */
unsigned int reserved:22;
} pfm_context_flags_t;
/* values of the 2-bit trap_reason field */
#define PFM_TRAP_REASON_NONE 0x0 /* default value */
#define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */
#define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */
/*
 * perfmon context: encapsulates all the state of a monitoring session
 *
 * Allocated zero-filled and initialized by pfm_context_alloc(), released
 * by pfm_context_free(). ctx_lock protects the structure (see the
 * PROTECT_CTX*() macros).
 *
 * Two copies of the PMC/PMD values are kept: ctx_pmcs[]/ctx_pmds[] hold
 * the software view, th_pmcs[]/th_pmds[] hold the values propagated to
 * the thread save state (see pfm_copy_pmcs()/pfm_copy_pmds()).
 */
typedef struct pfm_context {
spinlock_t ctx_lock; /* context protection */
pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */
unsigned int ctx_state; /* state: active/inactive (no bitfield) */
struct task_struct *ctx_task; /* task to which context is attached */
unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */
struct completion ctx_restart_done; /* use for blocking notification mode */
unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */
unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */
unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */
unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */
unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */
unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */
unsigned long ctx_pmcs[PFM_NUM_PMC_REGS]; /* saved copies of PMC values */
/* NOTE(review): 32-bit words, but CTX_USED_IBR/CTX_USED_DBR compute a 64-bit bit position */
unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */
unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */
unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */
unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */
pfm_counter_t ctx_pmds[PFM_NUM_PMD_REGS]; /* software state for PMDS */
unsigned long th_pmcs[PFM_NUM_PMC_REGS]; /* PMC thread save state */
unsigned long th_pmds[PFM_NUM_PMD_REGS]; /* PMD thread save state */
u64 ctx_saved_psr_up; /* only contains psr.up value */
unsigned long ctx_last_activation; /* context last activation number for last_cpu */
unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */
unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */
int ctx_fd; /* file descriptor used by this context */
pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */
pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */
void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */
unsigned long ctx_smpl_size; /* size of sampling buffer */
void *ctx_smpl_vaddr; /* user level virtual address of smpl buffer */
wait_queue_head_t ctx_msgq_wait; /* wait queue associated with the message queue below */
pfm_msg_t ctx_msgq[PFM_MAX_MSGS]; /* circular message queue (see pfm_get_new_msg()/pfm_get_next_msg()) */
int ctx_msgq_head; /* index of the oldest queued message */
int ctx_msgq_tail; /* index of the next free slot */
struct fasync_struct *ctx_async_queue;
wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */
} pfm_context_t;
/*
 * PFM_IS_FILE(): a struct file is recognized as a perfmon file by
 * comparing its f_op pointer against pfm_file_ops (no magic number is
 * involved).
 * PFM_GET_CTX(): context pointer stored in the thread structure of
 * task t, cast to pfm_context_t *.
 */
#define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops)
#define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context)
/*
 * ctx_last_cpu accessors. Without CONFIG_SMP both expand to an empty
 * statement, so GET_LAST_CPU() yields no value in that configuration.
 */
#ifdef CONFIG_SMP
#define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v)
#define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu
#else
#define SET_LAST_CPU(ctx, v) do {} while(0)
#define GET_LAST_CPU(ctx) do {} while(0)
#endif
/*
 * shorthand member names: ctx->ctx_fl_xxx expands to ctx->ctx_flags.xxx
 * (see pfm_context_flags_t for the meaning of each flag)
 */
#define ctx_fl_block ctx_flags.block
#define ctx_fl_system ctx_flags.system
#define ctx_fl_using_dbreg ctx_flags.using_dbreg
#define ctx_fl_is_sampling ctx_flags.is_sampling
#define ctx_fl_excl_idle ctx_flags.excl_idle
#define ctx_fl_going_zombie ctx_flags.going_zombie
#define ctx_fl_trap_reason ctx_flags.trap_reason
#define ctx_fl_no_msg ctx_flags.no_msg
#define ctx_fl_can_restart ctx_flags.can_restart
/*
 * Set/read the thread.pfm_needs_checking field of task t.
 *
 * PFM_SET_WORK_PENDING() is a single statement: the caller supplies the
 * terminating semicolon, which keeps it safe inside unbraced if/else.
 * PFM_GET_WORK_PENDING() expands to the field itself.
 */
#define PFM_SET_WORK_PENDING(t, v)	do { (t)->thread.pfm_needs_checking = (v); } while(0)
#define PFM_GET_WORK_PENDING(t)		(t)->thread.pfm_needs_checking
/*
 * global information about all sessions
 * mostly used to synchronize between system wide and per-process
 *
 * A single instance (pfm_sessions) exists; pfs_lock is taken through
 * LOCK_PFS()/UNLOCK_PFS().
 */
typedef struct {
spinlock_t pfs_lock; /* lock the structure */
unsigned int pfs_task_sessions; /* number of per task sessions */
unsigned int pfs_sys_sessions; /* number of per system wide sessions */
unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */
unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */
struct task_struct *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
} pfm_session_t;
/*
 * pfm_reg_check_t: signature of the optional per-register check
 * callbacks (read_check/write_check below). cnum is the register
 * number and val points to the register value.
 * NOTE(review): return convention and whether *val may be modified are
 * not visible in this part of the file.
 */
typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
/*
 * information about a PMC or PMD.
 * dep_pmd[]: a bitmask of dependent PMD registers
 * dep_pmc[]: a bitmask of dependent PMC registers
 */
typedef struct {
unsigned int type; /* PFM_REG_* type bitmask */
int pm_pos; /* bit position of the pm field in the register value (see PMC_PM()) */
unsigned long default_value; /* power-on default value */
unsigned long reserved_mask; /* bitmask of reserved bits */
pfm_reg_check_t read_check;
pfm_reg_check_t write_check;
unsigned long dep_pmd[4];
unsigned long dep_pmc[4];
} pfm_reg_desc_t;
/* assume cnum is a valid monitor */
/* extracts the single pm bit of PMC cnum from the value val */
#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1)
/*
 * This structure is initialized at boot time and contains
 * a description of the PMU main characteristics.
 *
 * If the probe function is defined, detection is based
 * on its return value:
 * 	- 0 means recognized PMU
 * 	- anything else means not supported
 * When the probe function is not defined, then the pmu_family field
 * is used and it must match the host CPU family such that:
 * 	- cpu->family & config->pmu_family != 0
 *
 * The candidate descriptions are listed in pmu_confs[]; the selected
 * one is reachable through the pmu_conf pointer.
 */
typedef struct {
unsigned long ovfl_val; /* overflow value for counters */
pfm_reg_desc_t *pmc_desc; /* detailed PMC register dependencies descriptions */
pfm_reg_desc_t *pmd_desc; /* detailed PMD register dependencies descriptions */
unsigned int num_pmcs; /* number of PMCS: computed at init time */
unsigned int num_pmds; /* number of PMDS: computed at init time */
unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */
unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */
char *pmu_name; /* PMU family name */
unsigned int pmu_family; /* cpuid family pattern used to identify pmu */
unsigned int flags; /* pmu specific flags */
unsigned int num_ibrs; /* number of IBRS: computed at init time */
unsigned int num_dbrs; /* number of DBRS: computed at init time */
unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */
int (*probe)(void); /* customized probe routine */
unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */
} pmu_config_t;
/*
 * PMU specific flags (pmu_config_t.flags)
 */
#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */
/*
 * debug register related type definitions
 *
 * Both bit-field layouts cover exactly 64 bits and are overlaid on the
 * raw register value through the dbreg_t union.
 */
/* field view of an instruction debug register value */
typedef struct {
unsigned long ibr_mask:56;
unsigned long ibr_plm:4;
unsigned long ibr_ig:3;
unsigned long ibr_x:1;
} ibr_mask_reg_t;
/* field view of a data debug register value */
typedef struct {
unsigned long dbr_mask:56;
unsigned long dbr_plm:4;
unsigned long dbr_ig:2;
unsigned long dbr_w:1;
unsigned long dbr_r:1;
} dbr_mask_reg_t;
/* raw value / IBR view / DBR view of the same 64-bit word */
typedef union {
unsigned long val;
ibr_mask_reg_t ibr;
dbr_mask_reg_t dbr;
} dbreg_t;
/*
 * perfmon command descriptions
 *
 * One descriptor per command; the PFM_CMD_*() macros below index the
 * pfm_cmd_tab[] table (defined outside this part of the file) by
 * command number.
 */
typedef struct {
int (*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); /* command handler */
char *cmd_name; /* command name (see PFM_CMD_NAME()) */
int cmd_flags; /* bitmask of PFM_CMD_* flags below */
unsigned int cmd_narg; /* NOTE(review): presumably the expected argument count, or PFM_CMD_ARG_MANY -- confirm */
size_t cmd_argsize; /* NOTE(review): presumably the size of one argument -- confirm */
int (*cmd_getsize)(void *arg, size_t *sz);
} pfm_cmd_desc_t;
#define PFM_CMD_FD 0x01 /* command requires a file descriptor */
#define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */
#define PFM_CMD_ARG_RW 0x04 /* command must read/write argument(s) */
#define PFM_CMD_STOP 0x08 /* command does not work on zombie context */
/* accessors testing one property of command number cmd */
#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name
#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ)
#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW)
#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD)
#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP)
#define PFM_CMD_ARG_MANY -1 /* cannot be zero */
/*
 * overflow interrupt and sampling handler statistics; one entry per
 * CPU is kept in pfm_stats[]. The cacheline-aligned pad at the end
 * rounds each entry up to a cache-line boundary.
 */
typedef struct {
unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */
unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */
unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */
unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */
unsigned long pfm_smpl_handler_calls;
unsigned long pfm_smpl_handler_cycles;
char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
} pfm_stats_t;
/*
 * perfmon internal variables
 */
static pfm_stats_t pfm_stats[NR_CPUS]; /* per-CPU statistics */
static pfm_session_t pfm_sessions; /* global sessions information */
static DEFINE_SPINLOCK(pfm_alt_install_check);
static pfm_intr_handler_desc_t *pfm_alt_intr_handler;
static struct proc_dir_entry *perfmon_dir;
static pfm_uuid_t pfm_null_uuid = {0,}; /* all-zero uuid */
/* NOTE(review): not statically initialized here; presumably set up with spin_lock_init() at init time -- confirm */
static spinlock_t pfm_buffer_fmt_lock;
static LIST_HEAD(pfm_buffer_fmt_list);
static pmu_config_t *pmu_conf; /* description of the PMU in use (dereferenced by the PMC_/PMD_ macros) */
/* sysctl() controls */
/* exported: the debug fields are read by the DPRINT() macros */
pfm_sysctl_t pfm_sysctl;
EXPORT_SYMBOL(pfm_sysctl);
/*
 * sysctl hierarchy: kernel -> perfmon -> { debug, debug_ovfl,
 * fastctxsw, expert_mode }. Each leaf exposes one int field of
 * pfm_sysctl through proc_dointvec.
 *
 * NOTE(review): "debug" and "debug_ovfl" are created with mode 0666,
 * i.e. writable by any user, and they gate the printk() output of
 * DPRINT()/DPRINT_ovfl(). Consider whether 0644 is more appropriate.
 */
static ctl_table pfm_ctl_table[]={
{
.ctl_name = CTL_UNNUMBERED,
.procname = "debug",
.data = &pfm_sysctl.debug,
.maxlen = sizeof(int),
.mode = 0666,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "debug_ovfl",
.data = &pfm_sysctl.debug_ovfl,
.maxlen = sizeof(int),
.mode = 0666,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "fastctxsw",
.data = &pfm_sysctl.fastctxsw,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "expert_mode",
.data = &pfm_sysctl.expert_mode,
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = &proc_dointvec,
},
{}
};
/* "perfmon" directory holding the leaves above */
static ctl_table pfm_sysctl_dir[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "perfmon",
.mode = 0555,
.child = pfm_ctl_table,
},
{}
};
/* "kernel" root entry under which the perfmon directory is attached */
static ctl_table pfm_sysctl_root[] = {
{
.ctl_name = CTL_KERN,
.procname = "kernel",
.mode = 0555,
.child = pfm_sysctl_dir,
},
{}
};
/* NOTE(review): presumably the handle returned when the table is registered -- registration is not visible here */
static struct ctl_table_header *pfm_sysctl_header;
/* forward declaration: defined further down in this file */
static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
/* pfm_get_cpu_var(v): per-CPU variable v of the executing CPU */
#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v)
/* pfm_get_cpu_data(a,b): per-CPU variable a of CPU number b */
#define pfm_get_cpu_data(a,b) per_cpu(a, b)
/*
 * Drop a reference on @task through put_task_struct(), except when
 * @task is the calling task, in which case nothing is done.
 */
static inline void
pfm_put_task(struct task_struct *task)
{
	if (task == current)
		return;

	put_task_struct(task);
}
/*
 * Mark the page backing the vmalloc'ed address @a as reserved.
 */
static inline void
pfm_reserve_page(unsigned long a)
{
	void *vaddr = (void *)a;

	SetPageReserved(vmalloc_to_page(vaddr));
}
/*
 * Clear the reserved flag on the page backing the vmalloc'ed address
 * @a (counterpart of pfm_reserve_page()).
 */
static inline void
pfm_unreserve_page(unsigned long a)
{
	void *vaddr = (void *)a;

	ClearPageReserved(vmalloc_to_page(vaddr));
}
/*
 * Lock context @x for the context-switch path.
 *
 * Only the spinlock is taken; the interrupt state is not modified.
 * The returned "flags" value is always 0 and exists so that callers can
 * hand it back to pfm_unprotect_ctx_ctxsw().
 */
static inline unsigned long
pfm_protect_ctx_ctxsw(pfm_context_t *x)
{
	spin_lock(&x->ctx_lock);

	return 0UL;
}
/*
 * Unlock context @x after pfm_protect_ctx_ctxsw().
 *
 * @f is the value returned by pfm_protect_ctx_ctxsw(); it is not used.
 */
static inline void
pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
{
	spin_unlock(&x->ctx_lock);
}
/*
 * Thin wrapper around do_munmap() for the range [@addr, @addr + @len)
 * of @mm. @acct is accepted for interface compatibility and ignored.
 *
 * The do_munmap() result is returned converted to unsigned int.
 */
static inline unsigned int
pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
{
	unsigned int ret;

	ret = do_munmap(mm, addr, len);

	return ret;
}
/*
 * Thin wrapper around get_unmapped_area(); @exec is accepted for
 * interface compatibility and ignored.
 */
static inline unsigned long
pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
{
	unsigned long area;

	area = get_unmapped_area(file, addr, len, pgoff, flags);

	return area;
}
/*
 * get_sb callback of the pfmfs filesystem type: sets up a pseudo
 * superblock named "pfm:" with magic PFMFS_MAGIC and no specific
 * super operations. @flags, @dev_name and @data are not used.
 */
static int
pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data,
	     struct vfsmount *mnt)
{
	int ret;

	ret = get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);

	return ret;
}
/* pfmfs filesystem type: superblocks are created by pfmfs_get_sb() and torn down by kill_anon_super() */
static struct file_system_type pfm_fs_type = {
.name = "pfmfs",
.get_sb = pfmfs_get_sb,
.kill_sb = kill_anon_super,
};
/*
 * per-CPU state, accessed through pfm_get_cpu_var():
 *   pfm_syst_info         - PFM_CPUINFO_* bits (PFM_CPUINFO_SET/CLEAR/GET)
 *   pmu_owner, pmu_ctx    - task and context owning the PMU (SET_PMU_OWNER)
 *   pmu_activation_number - activation counter (GET/INC_ACTIVATION, SMP)
 */
DEFINE_PER_CPU(unsigned long, pfm_syst_info);
DEFINE_PER_CPU(struct task_struct *, pmu_owner);
DEFINE_PER_CPU(pfm_context_t *, pmu_ctx);
DEFINE_PER_CPU(unsigned long, pmu_activation_number);
EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info);
/* forward declaration */
/* needed by PFM_IS_FILE() before the definition is reached */
static const struct file_operations pfm_file_ops;
/*
 * forward declarations
 */
#ifndef CONFIG_SMP
static void pfm_lazy_save_regs (struct task_struct *ta);
#endif
void dump_pmu_state(const char *);
static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
/*
 * PMU model descriptions. They are included here, after the type and
 * macro definitions above, and provide the pmu_conf_* objects listed
 * in pmu_confs[].
 */
#include "perfmon_itanium.h"
#include "perfmon_mckinley.h"
#include "perfmon_montecito.h"
#include "perfmon_generic.h"
/* NULL-terminated list of known PMU descriptions; the generic one must stay last */
static pmu_config_t *pmu_confs[]={
&pmu_conf_mont,
&pmu_conf_mck,
&pmu_conf_ita,
&pmu_conf_gen, /* must be last */
NULL
};
static int pfm_end_notify_user(pfm_context_t *ctx);
/*
 * Clear psr.pp on the executing CPU, then serialize the instruction
 * stream so the change is in effect on return.
 */
static inline void
pfm_clear_psr_pp(void)
{
	ia64_rsm(IA64_PSR_PP);	/* reset the pp bit */
	ia64_srlz_i();
}
/*
 * Set psr.pp on the executing CPU, then serialize the instruction
 * stream so the change is in effect on return.
 */
static inline void
pfm_set_psr_pp(void)
{
	ia64_ssm(IA64_PSR_PP);	/* set the pp bit */
	ia64_srlz_i();
}
/*
 * Clear psr.up on the executing CPU, then serialize the instruction
 * stream so the change is in effect on return.
 */
static inline void
pfm_clear_psr_up(void)
{
	ia64_rsm(IA64_PSR_UP);	/* reset the up bit */
	ia64_srlz_i();
}
/*
 * Set psr.up on the executing CPU, then serialize the instruction
 * stream so the change is in effect on return.
 */
static inline void
pfm_set_psr_up(void)
{
	ia64_ssm(IA64_PSR_UP);	/* set the up bit */
	ia64_srlz_i();
}
/*
 * Read the processor status register of the executing CPU.
 *
 * An instruction serialization is issued after the read and before the
 * value is returned.
 */
static inline unsigned long
pfm_get_psr(void)
{
	unsigned long psr = ia64_getreg(_IA64_REG_PSR);

	ia64_srlz_i();

	return psr;
}
/*
 * Write @val to psr.l on the executing CPU, then serialize the
 * instruction stream.
 */
static inline void
pfm_set_psr_l(unsigned long val)
{
	ia64_setreg(_IA64_REG_PSR_L, val);
	ia64_srlz_i();
}
/*
 * Freeze the PMU: write 1 to pmc0 (only bit 0 set), followed by a data
 * serialization.
 */
static inline void
pfm_freeze_pmu(void)
{
	ia64_set_pmc(0, 1UL);
	ia64_srlz_d();
}
/*
 * Unfreeze the PMU: write 0 to pmc0, followed by a data serialization.
 */
static inline void
pfm_unfreeze_pmu(void)
{
	ia64_set_pmc(0, 0UL);
	ia64_srlz_d();
}
/*
 * Load the first @nibrs instruction debug registers from @ibrs.
 *
 * @ibrs:  array of at least @nibrs register values
 * @nibrs: number of registers to write
 *
 * Each write is followed by ia64_dv_serialize_instruction(); a single
 * instruction serialization is issued once all registers are written.
 * The index is unsigned to match the type of @nibrs.
 */
static inline void
pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs)
{
	unsigned int i;

	for (i = 0; i < nibrs; i++) {
		ia64_set_ibr(i, ibrs[i]);
		ia64_dv_serialize_instruction();
	}
	ia64_srlz_i();
}
/*
 * Load the first @ndbrs data debug registers from @dbrs.
 *
 * @dbrs:  array of at least @ndbrs register values
 * @ndbrs: number of registers to write
 *
 * Each write is followed by ia64_dv_serialize_data(); a single data
 * serialization is issued once all registers are written.
 * The index is unsigned to match the type of @ndbrs.
 */
static inline void
pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs)
{
	unsigned int i;

	for (i = 0; i < ndbrs; i++) {
		ia64_set_dbr(i, dbrs[i]);
		ia64_dv_serialize_data();
	}
	ia64_srlz_d();
}
/*
 * Return the full 64-bit value of counter @i of @ctx: the software
 * part kept in ctx_pmds[i].val plus the bits of the live hardware PMD
 * selected by pmu_conf->ovfl_val.
 *
 * PMD[i] must be a counter. no check is made
 */
static inline unsigned long
pfm_read_soft_counter(pfm_context_t *ctx, int i)
{
	unsigned long hw_part = ia64_get_pmd(i) & pmu_conf->ovfl_val;

	return ctx->ctx_pmds[i].val + hw_part;
}
/*
 * Store the 64-bit value @val into counter @i of @ctx: the bits
 * outside pmu_conf->ovfl_val go to the software counter
 * (ctx_pmds[i].val), the bits inside it are written to the hardware
 * PMD.
 *
 * PMD[i] must be a counter. no check is made
 */
static inline void
pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
{
	const unsigned long hw_mask = pmu_conf->ovfl_val;

	/* upper part stays in software */
	ctx->ctx_pmds[i].val = val & ~hw_mask;

	/*
	 * lower part goes to the hardware register. The value is masked
	 * explicitly even though writes to the unimplemented part of
	 * the register are said to be ignored.
	 */
	ia64_set_pmd(i, val & hw_mask);
}
/*
 * Reserve the next free slot of the context message queue.
 *
 * The queue is a circular buffer of PFM_MAX_MSGS entries; it is
 * considered full when advancing the tail would make it equal to the
 * head, so one slot always stays unused.
 *
 * Returns a pointer to the reserved slot (to be filled in by the
 * caller), or NULL when the queue is full.
 */
static pfm_msg_t *
pfm_get_new_msg(pfm_context_t *ctx)
{
	int idx, next;

	next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS;

	/* the value printed is the context pointer, so label it "ctx" */
	DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));

	/* queue full */
	if (next == ctx->ctx_msgq_head)
		return NULL;

	idx = ctx->ctx_msgq_tail;
	ctx->ctx_msgq_tail = next;

	DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx));

	return ctx->ctx_msgq+idx;
}
/*
 * Dequeue the oldest message of the context message queue.
 *
 * Returns a pointer to the message slot, or NULL when the queue is
 * empty. The slot stays inside ctx_msgq[]; only the head index moves.
 */
static pfm_msg_t *
pfm_get_next_msg(pfm_context_t *ctx)
{
	pfm_msg_t *oldest;

	DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));

	/* nothing queued */
	if (ctx->ctx_msgq_head == ctx->ctx_msgq_tail)
		return NULL;

	/* pick the entry at the head, then advance the head (with wrap-around) */
	oldest = &ctx->ctx_msgq[ctx->ctx_msgq_head];
	ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS;

	DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, oldest->pfm_gen_msg.msg_type));

	return oldest;
}
/*
 * Empty the context message queue by resetting both indices to 0.
 */
static void
pfm_reset_msgq(pfm_context_t *ctx)
{
	ctx->ctx_msgq_tail = 0;
	ctx->ctx_msgq_head = 0;

	DPRINT(("ctx=%p msgq reset\n", ctx));
}
/*
 * Allocate a zero-filled vmalloc() area of @size bytes, rounded up to
 * a whole number of pages, and mark every page of it as reserved.
 *
 * Returns the kernel virtual address of the area, or NULL when
 * vmalloc() fails. Release with pfm_rvfree().
 */
static void *
pfm_rvmalloc(unsigned long size)
{
	unsigned long addr, remaining;
	void *mem;

	size = PAGE_ALIGN(size);

	mem = vmalloc(size);
	if (!mem)
		return NULL;

	memset(mem, 0, size);

	/* size is a multiple of PAGE_SIZE here, so the countdown ends exactly at 0 */
	addr = (unsigned long)mem;
	for (remaining = size; remaining > 0; remaining -= PAGE_SIZE) {
		pfm_reserve_page(addr);
		addr += PAGE_SIZE;
	}

	return mem;
}
/*
 * Release an area obtained from pfm_rvmalloc(): clear the reserved
 * flag of each page covering @size bytes, then vfree() the area.
 * A NULL @mem is ignored.
 */
static void
pfm_rvfree(void *mem, unsigned long size)
{
	unsigned long addr;

	if (!mem)
		return;

	DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));

	/*
	 * the remaining size is tested as a signed value so that the
	 * loop also ends when size is not a multiple of PAGE_SIZE
	 */
	for (addr = (unsigned long) mem; (long) size > 0; size -= PAGE_SIZE) {
		pfm_unreserve_page(addr);
		addr += PAGE_SIZE;
	}

	vfree(mem);
}
/*
 * Allocate and initialize a perfmon context.
 *
 * @ctx_flags: PFM_FL_* bits; NOTIFY_BLOCK, SYSTEM_WIDE and OVFL_NO_MSG
 *             are translated into the block, system and no_msg context
 *             flags. Other bits are ignored here (excl_idle is not set
 *             by this function).
 *
 * The context comes back zero-filled (kzalloc with GFP_KERNEL), in
 * state PFM_CTX_UNLOADED, with an invalid last activation / last CPU,
 * an empty message queue and initialized lock, completion and wait
 * queues.
 *
 * Returns the new context, or NULL when the allocation fails.
 */
static pfm_context_t *
pfm_context_alloc(int ctx_flags)
{
	pfm_context_t *ctx;

	ctx = kzalloc(sizeof(pfm_context_t), GFP_KERNEL);
	if (!ctx)
		return NULL;

	DPRINT(("alloc ctx @%p\n", ctx));

	/* protection lock, then initial state: not loaded onto any task */
	spin_lock_init(&ctx->ctx_lock);
	ctx->ctx_state = PFM_CTX_UNLOADED;

	/* context flags derived from the caller's request */
	ctx->ctx_fl_block  = !!(ctx_flags & PFM_FL_NOTIFY_BLOCK);
	ctx->ctx_fl_system = !!(ctx_flags & PFM_FL_SYSTEM_WIDE);
	ctx->ctx_fl_no_msg = !!(ctx_flags & PFM_FL_OVFL_NO_MSG);

	/* completion used for restart in blocking notification mode */
	init_completion(&ctx->ctx_restart_done);

	/* activation number and last CPU are only meaningful on SMP */
	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
	SET_LAST_CPU(ctx, -1);

	/* empty notification message queue and its wait queues */
	ctx->ctx_msgq_tail = 0;
	ctx->ctx_msgq_head = 0;
	init_waitqueue_head(&ctx->ctx_msgq_wait);
	init_waitqueue_head(&ctx->ctx_zombieq);

	return ctx;
}
/*
 * Release a context obtained from pfm_context_alloc().
 * A NULL @ctx is ignored (and produces no debug trace).
 */
static void
pfm_context_free(pfm_context_t *ctx)
{
	if (!ctx)
		return;

	DPRINT(("free ctx @%p\n", ctx));
	kfree(ctx);
}
/*
 * Mask monitoring for the context attached to @task.
 *
 * Two steps, both working on the live PMU registers:
 *  1. every used PMD is read back and folded into the software state
 *     ctx_pmds[].val (added for counters, copied for other PMDs);
 *  2. every used monitor PMC from PMU_FIRST_COUNTER upwards gets its
 *     low 4 bits cleared, in the hardware register and in th_pmcs[].
 * A data serialization is issued at the end. The PSR is not touched.
 *
 * The original PMC values remain available in ctx_pmcs[], from which
 * pfm_restore_monitoring() reloads them.
 */
static void
pfm_mask_monitoring(struct task_struct *task)
{
pfm_context_t *ctx = PFM_GET_CTX(task);
unsigned long mask, val, ovfl_mask;
int i;
DPRINT_ovfl(("masking monitoring for [%d]\n", task_pid_nr(task)));
ovfl_mask = pmu_conf->ovfl_val;
/*
 * monitoring can only be masked as a result of a valid
 * counter overflow. In UP, it means that the PMU still
 * has an owner. Note that the owner can be different
 * from the current task. However the PMU state belongs
 * to the owner.
 * In SMP, a valid overflow only happens when task is
 * current. Therefore if we come here, we know that
 * the PMU state belongs to the current task, therefore
 * we can access the live registers.
 *
 * So in both cases, the live register contains the owner's
 * state. We can ONLY touch the PMU registers and NOT the PSR.
 *
 * As a consequence to this call, the ctx->th_pmds[] array
 * contains stale information which must be ignored
 * when context is reloaded AND monitoring is active (see
 * pfm_restart).
 */
/* only word 0 of the used-PMD bitmask is walked (registers 0-63) */
mask = ctx->ctx_used_pmds[0];
for (i = 0; mask; i++, mask>>=1) {
/* skip non used pmds */
if ((mask & 0x1) == 0) continue;
val = ia64_get_pmd(i);
if (PMD_IS_COUNTING(i)) {
/*
 * we rebuild the full 64 bit value of the counter
 */
ctx->ctx_pmds[i].val += (val & ovfl_mask);
} else {
ctx->ctx_pmds[i].val = val;
}
DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
i,
ctx->ctx_pmds[i].val,
val & ovfl_mask));
}
/*
 * mask monitoring by setting the privilege level to 0
 * we cannot use psr.pp/psr.up for this, it is controlled by
 * the user
 *
 * Both the live PMC and the thread save state (th_pmcs[], i.e. what
 * will be restored in pfm_load_regs()) are updated, whether or not
 * task is current.
 */
mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
if ((mask & 0x1) == 0UL) continue;
/* clear the low 4 bits of the PMC */
ia64_set_pmc(i, ctx->th_pmcs[i] & ~0xfUL);
ctx->th_pmcs[i] &= ~0xfUL;
DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i]));
}
/*
 * make all of this visible
 */
ia64_srlz_d();
}
/*
 * Undo pfm_mask_monitoring() for the context attached to @task.
 *
 * must always be done with task == current
 *
 * context must be in MASKED state when calling
 *
 * Both preconditions are checked; when one is violated an error is
 * logged and nothing is changed.
 *
 * Sequence: save the PSR, stop monitoring (dcr.pp + psr.pp for a
 * system-wide session with PFM_CPUINFO_DCR_PP set, psr.up otherwise),
 * reload the used PMDs from ctx_pmds[], reload the used monitor PMCs
 * from ctx_pmcs[] (th_pmcs[] is refreshed as well), reload the debug
 * registers if the context uses them, and finally re-enable dcr.pp
 * when it was cleared and write back the saved PSR.
 */
static void
pfm_restore_monitoring(struct task_struct *task)
{
pfm_context_t *ctx = PFM_GET_CTX(task);
unsigned long mask, ovfl_mask;
unsigned long psr, val;
int i, is_system;
is_system = ctx->ctx_fl_system;
ovfl_mask = pmu_conf->ovfl_val;
if (task != current) {
printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task_pid_nr(task), task_pid_nr(current));
return;
}
if (ctx->ctx_state != PFM_CTX_MASKED) {
printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__,
task_pid_nr(task), task_pid_nr(current), ctx->ctx_state);
return;
}
/* remember the PSR so it can be re-established unchanged at the end */
psr = pfm_get_psr();
/*
 * monitoring is masked via the PMC.
 * As we restore their value, we do not want each counter to
 * restart right away. We stop monitoring using the PSR,
 * restore the PMC (and PMD) and then re-establish the psr
 * as it was. Note that there can be no pending overflow at
 * this point, because monitoring was MASKED.
 *
 * system-wide session are pinned and self-monitoring
 */
if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
/* disable dcr pp */
ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
pfm_clear_psr_pp();
} else {
pfm_clear_psr_up();
}
/*
 * first, we restore the PMD
 */
/* only word 0 of the used-PMD bitmask is walked (registers 0-63) */
mask = ctx->ctx_used_pmds[0];
for (i = 0; mask; i++, mask>>=1) {
/* skip non used pmds */
if ((mask & 0x1) == 0) continue;
if (PMD_IS_COUNTING(i)) {
/*
 * we split the 64bit value according to
 * counter width
 */
val = ctx->ctx_pmds[i].val & ovfl_mask;
ctx->ctx_pmds[i].val &= ~ovfl_mask;
} else {
val = ctx->ctx_pmds[i].val;
}
ia64_set_pmd(i, val);
DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
i,
ctx->ctx_pmds[i].val,
val));
}
/*
 * restore the PMCs
 */
/* same register set as the one masked by pfm_mask_monitoring() */
mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
if ((mask & 0x1) == 0UL) continue;
/* ctx_pmcs[] still holds the unmasked value */
ctx->th_pmcs[i] = ctx->ctx_pmcs[i];
ia64_set_pmc(i, ctx->th_pmcs[i]);
DPRINT(("[%d] pmc[%d]=0x%lx\n",
task_pid_nr(task), i, ctx->th_pmcs[i]));
}
ia64_srlz_d();
/*
 * must restore DBR/IBR because could be modified while masked
 * XXX: need to optimize
 */
if (ctx->ctx_fl_using_dbreg) {
pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
}
/*
 * now restore PSR
 */
if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
/* enable dcr pp */
ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP);
ia64_srlz_i();
}
pfm_set_psr_l(psr);
}
/*
 * Read the hardware PMDs selected by @mask into @pmds.
 *
 * Bit i of @mask selects PMD i, stored at pmds[i]; entries for clear
 * bits are left untouched. A data serialization is issued before the
 * registers are read.
 */
static inline void
pfm_save_pmds(unsigned long *pmds, unsigned long mask)
{
	int i;

	ia64_srlz_d();

	for (i = 0; mask != 0; mask >>= 1, i++) {
		if ((mask & 0x1) == 0)
			continue;
		pmds[i] = ia64_get_pmd(i);
	}
}
/*
 * Write the PMDs selected by @mask from @pmds to the hardware
 * (reload from thread state, used for ctxsw only).
 *
 * Bit i of @mask selects PMD i. Counting PMDs are written with the
 * value restricted to pmu_conf->ovfl_val; other PMDs are written as
 * is. A data serialization follows the last write.
 */
static inline void
pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
{
	const unsigned long hw_mask = pmu_conf->ovfl_val;
	int i;

	for (i = 0; mask != 0; mask >>= 1, i++) {
		unsigned long val;

		if (!(mask & 0x1))
			continue;

		val = pmds[i];
		if (PMD_IS_COUNTING(i))
			val &= hw_mask;

		ia64_set_pmd(i, val);
	}
	ia64_srlz_d();
}
/*
 * propagate PMD from context to thread-state
 *
 * Walks registers 0 up to the highest bit set in ctx_all_pmds[0].
 * Every register in that range is copied, including those whose bit is
 * clear. @task is not used.
 */
static inline void
pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
{
	const unsigned long hw_mask = pmu_conf->ovfl_val;
	unsigned long mask = ctx->ctx_all_pmds[0];
	int i;

	DPRINT(("mask=0x%lx\n", mask));

	for (i = 0; mask != 0; mask >>= 1, i++) {
		unsigned long val = ctx->ctx_pmds[i].val;

		/*
		 * A counter is split in two: the bits covered by the
		 * hardware width go to the thread state (reloaded on
		 * ctxsw in), the remaining upper bits stay in the
		 * soft-counter.
		 */
		if (PMD_IS_COUNTING(i)) {
			ctx->ctx_pmds[i].val = val & ~hw_mask;
			val &= hw_mask;
		}
		ctx->th_pmds[i] = val;

		DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n",
			i,
			ctx->th_pmds[i],
			ctx->ctx_pmds[i].val));
	}
}
/*
 * propagate PMC from context to thread-state
 *
 * Walks registers 0 up to the highest bit set in ctx_all_pmcs[0] and
 * copies ctx_pmcs[i] to th_pmcs[i] unchanged for every register in
 * that range, including those whose bit is clear. @task is not used.
 */
static inline void
pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx)
{
	unsigned long mask = ctx->ctx_all_pmcs[0];
	int i;

	DPRINT(("mask=0x%lx\n", mask));

	for (i = 0; mask != 0; mask >>= 1, i++) {
		ctx->th_pmcs[i] = ctx->ctx_pmcs[i];
		DPRINT(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i]));
	}
}
/*
 * Write the PMC values selected by mask into the hardware.
 *
 * Bit i of mask selects PMC i, which is loaded from pmcs[i]. A data
 * serialization is issued once all writes are done.
 */
static inline void
pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask)
{
	unsigned long pending;
	int reg = 0;

	for (pending = mask; pending != 0UL; pending >>= 1, reg++) {
		if (pending & 0x1)
			ia64_set_pmc(reg, pmcs[reg]);
	}
	ia64_srlz_d();
}
/*
 * Byte-wise comparison of two UUIDs.
 *
 * Returns the memcmp() result over sizeof(pfm_uuid_t) bytes, i.e. 0 when
 * both UUIDs are identical.
 */
static inline int
pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b)
{
	int diff;

	diff = memcmp(a, b, sizeof(pfm_uuid_t));

	return diff;
}
/*
 * Invoke the optional fmt_exit callback of a buffer format.
 *
 * Returns 0 when the format does not provide the callback, otherwise
 * the callback's return value.
 */
static inline int
pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs)
{
	if (fmt->fmt_exit == NULL)
		return 0;

	return (*fmt->fmt_exit)(task, buf, regs);
}
/*
 * Invoke the optional fmt_getsize callback of a buffer format.
 *
 * Returns 0 (leaving *size untouched) when the format does not provide
 * the callback, otherwise the callback's return value.
 */
static inline int
pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size)
{
	if (fmt->fmt_getsize == NULL)
		return 0;

	return (*fmt->fmt_getsize)(task, flags, cpu, arg, size);
}
/*
 * Invoke the optional fmt_validate callback of a buffer format.
 *
 * Returns 0 when the format does not provide the callback, otherwise
 * the callback's return value.
 */
static inline int
pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags,
		     int cpu, void *arg)
{
	if (fmt->fmt_validate == NULL)
		return 0;

	return (*fmt->fmt_validate)(task, flags, cpu, arg);
}
/*
 * Invoke the optional fmt_init callback of a buffer format.
 *
 * Returns 0 when the format does not provide the callback, otherwise
 * the callback's return value.
 */
static inline int
pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags,
		 int cpu, void *arg)
{
	if (fmt->fmt_init == NULL)
		return 0;

	return (*fmt->fmt_init)(task, buf, flags, cpu, arg);
}
/*
 * Invoke the optional fmt_restart callback of a buffer format.
 *
 * Returns 0 when the format does not provide the callback, otherwise
 * the callback's return value.
 */
static inline int
pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
{
	if (fmt->fmt_restart == NULL)
		return 0;

	return (*fmt->fmt_restart)(task, ctrl, buf, regs);
}
/*
 * Invoke the optional fmt_restart_active callback of a buffer format.
 *
 * Returns 0 when the format does not provide the callback, otherwise
 * the callback's return value.
 */
static inline int
pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
{
	if (fmt->fmt_restart_active == NULL)
		return 0;

	return (*fmt->fmt_restart_active)(task, ctrl, buf, regs);
}
/*
 * Walk pfm_buffer_fmt_list looking for a format whose fmt_uuid matches uuid.
 *
 * Returns the matching entry or NULL. This helper takes no lock itself;
 * the callers visible in this file hold pfm_buffer_fmt_lock around it.
 */
static pfm_buffer_fmt_t *
__pfm_find_buffer_fmt(pfm_uuid_t uuid)
{
	pfm_buffer_fmt_t *candidate;

	list_for_each_entry(candidate, &pfm_buffer_fmt_list, fmt_list) {
		if (pfm_uuid_cmp(uuid, candidate->fmt_uuid) == 0)
			return candidate;
	}
	return NULL;
}
/*
 * Find a registered buffer format by uuid.
 *
 * The list lookup is done under pfm_buffer_fmt_lock. Returns the format
 * or NULL when no format with that uuid is registered.
 * NOTE(review): the lock is dropped before returning and no reference is
 * taken here, so the lifetime of the returned format must be guaranteed
 * by other means -- confirm against the unregister path.
 */
static pfm_buffer_fmt_t *
pfm_find_buffer_fmt(pfm_uuid_t uuid)
{
	pfm_buffer_fmt_t *found;

	spin_lock(&pfm_buffer_fmt_lock);
	found = __pfm_find_buffer_fmt(uuid);
	spin_unlock(&pfm_buffer_fmt_lock);

	return found;
}
/*
 * Register a sampling buffer format.
 *
 * Returns:
 *   0       the format was added to pfm_buffer_fmt_list
 *   -EINVAL fmt is NULL, or has no name or no handler
 *   -EBUSY  a format with the same uuid is already registered
 *
 * XXX: fmt_arg_size is not validated.
 */
int
pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt)
{
	int ret;

	/* a format must at least carry a name ... */
	if (fmt == NULL || fmt->fmt_name == NULL)
		return -EINVAL;

	/* ... and an overflow handler */
	if (fmt->fmt_handler == NULL)
		return -EINVAL;

	spin_lock(&pfm_buffer_fmt_lock);

	if (__pfm_find_buffer_fmt(fmt->fmt_uuid) != NULL) {
		printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name);
		ret = -EBUSY;
	} else {
		list_add(&fmt->fmt_list, &pfm_buffer_fmt_list);
		printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name);
		ret = 0;
	}

	spin_unlock(&pfm_buffer_fmt_lock);

	return ret;
}
EXPORT_SYMBOL(pfm_register_buffer_fmt);
/*
 * Unregister the sampling buffer format identified by uuid.
 *
 * Returns 0 when the format was found and unlinked from
 * pfm_buffer_fmt_list, -EINVAL when no such format is registered.
 */
int
pfm_unregister_buffer_fmt(pfm_uuid_t uuid)
{
	pfm_buffer_fmt_t *victim;
	int ret;

	spin_lock(&pfm_buffer_fmt_lock);

	victim = __pfm_find_buffer_fmt(uuid);
	if (victim != NULL) {
		list_del_init(&victim->fmt_list);
		printk(KERN_INFO "perfmon: removed sampling format: %s\n", victim->fmt_name);
		ret = 0;
	} else {
		printk(KERN_ERR "perfmon: cannot unregister format, not found\n");
		ret = -EINVAL;
	}

	spin_unlock(&pfm_buffer_fmt_lock);

	return ret;
}
EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
extern void update_pal_halt_status(int);
/*
 * Reserve a monitoring session.
 *
 * System-wide and per-task sessions are mutually exclusive, and at most
 * one system-wide session may exist per CPU. On success the session
 * counters are updated and PAL_HALT is disabled for default_idle().
 *
 * Validity checks on cpu have been done upstream.
 *
 * Returns 0 on success, -EBUSY when the request conflicts with existing
 * sessions. The whole operation runs under LOCK_PFS().
 */
static int
pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
{
	unsigned long flags;
	int ret = -EBUSY;

	LOCK_PFS(flags);

	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
		pfm_sessions.pfs_sys_sessions,
		pfm_sessions.pfs_task_sessions,
		pfm_sessions.pfs_sys_use_dbregs,
		is_syswide,
		cpu));

	if (is_syswide) {
		/* system-wide is impossible while per-task sessions exist */
		if (pfm_sessions.pfs_task_sessions > 0UL) {
			DPRINT(("system wide not possible, %u conflicting task_sessions\n",
				pfm_sessions.pfs_task_sessions));
			goto unlock;
		}

		/* only one system-wide session per CPU */
		if (pfm_sessions.pfs_sys_session[cpu]) {
			DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
				task_pid_nr(pfm_sessions.pfs_sys_session[cpu]),
				cpu));
			goto unlock;
		}

		DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id()));

		pfm_sessions.pfs_sys_session[cpu] = task;
		pfm_sessions.pfs_sys_sessions++;
	} else {
		/* per-task is impossible while system-wide sessions exist */
		if (pfm_sessions.pfs_sys_sessions)
			goto unlock;

		pfm_sessions.pfs_task_sessions++;
	}

	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
		pfm_sessions.pfs_sys_sessions,
		pfm_sessions.pfs_task_sessions,
		pfm_sessions.pfs_sys_use_dbregs,
		is_syswide,
		cpu));

	/*
	 * disable default_idle() to go to PAL_HALT
	 */
	update_pal_halt_status(0);

	ret = 0;
unlock:
	UNLOCK_PFS(flags);

	return ret;
}
/*
 * Release a session previously obtained with pfm_reserve_session().
 *
 * For a system-wide session the per-CPU slot is cleared and, when the
 * context (if any) was using the debug registers, the system-wide dbreg
 * use count is dropped. Once no session of any kind remains, PAL_HALT
 * is re-enabled for default_idle().
 *
 * Validity checks on cpu have been done upstream.
 *
 * Always returns 0. The whole operation runs under LOCK_PFS().
 */
static int
pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
{
	unsigned long flags;

	LOCK_PFS(flags);

	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
		pfm_sessions.pfs_sys_sessions,
		pfm_sessions.pfs_task_sessions,
		pfm_sessions.pfs_sys_use_dbregs,
		is_syswide,
		cpu));

	if (!is_syswide) {
		pfm_sessions.pfs_task_sessions--;
	} else {
		pfm_sessions.pfs_sys_session[cpu] = NULL;

		/*
		 * would not work with perfmon+more than one bit in cpu_mask
		 */
		if (ctx && ctx->ctx_fl_using_dbreg) {
			if (pfm_sessions.pfs_sys_use_dbregs != 0)
				pfm_sessions.pfs_sys_use_dbregs--;
			else
				printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx);
		}
		pfm_sessions.pfs_sys_sessions--;
	}

	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
		pfm_sessions.pfs_sys_sessions,
		pfm_sessions.pfs_task_sessions,
		pfm_sessions.pfs_sys_use_dbregs,
		is_syswide,
		cpu));

	/*
	 * if possible, enable default_idle() to go into PAL_HALT
	 */
	if (!(pfm_sessions.pfs_task_sessions != 0 || pfm_sessions.pfs_sys_sessions != 0))
		update_pal_halt_status(1);

	UNLOCK_PFS(flags);

	return 0;
}
/*
 * Remove the virtual mapping of the sampling buffer from task's address
 * space.
 *
 * IMPORTANT: cannot be called with interrupts disabled, e.g. inside
 * a PROTECT_CTX() section.
 *
 * @task:  task whose mm holds the mapping
 * @vaddr: user virtual address of the mapping
 * @size:  size of the mapping in bytes
 *
 * Returns 0 on success, -EINVAL when task has no mm or vaddr/size are
 * empty, or the error returned by pfm_do_munmap() when the unmap fails
 * (previously that failure was logged but reported as success).
 */
static int
pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size)
{
	int r;

	/* sanity checks */
	if (task->mm == NULL || size == 0UL || vaddr == NULL) {
		printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task_pid_nr(task), task->mm);
		return -EINVAL;
	}

	DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size));

	/*
	 * does the actual unmapping, with the mmap semaphore held for writing
	 */
	down_write(&task->mm->mmap_sem);

	DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size));

	r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0);

	up_write(&task->mm->mmap_sem);

	if (r != 0) {
		printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task_pid_nr(task), vaddr, size);
	}

	DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r));

	/* propagate the unmap status instead of silently swallowing it */
	return r;
}
/*
 * free actual physical storage used by sampling buffer
 *
 * NOTE: this function is compiled out (#if 0) and kept for reference only.
 * As written it would call the buffer format's exit callback, release the
 * buffer with pfm_rvfree() and clear ctx_smpl_hdr/ctx_smpl_size, returning
 * 0, or -EINVAL when the context has no buffer.
 */
#if 0
static int
pfm_free_smpl_buffer(pfm_context_t *ctx)
{
pfm_buffer_fmt_t *fmt;
/* nothing to free when no buffer is attached */
if (ctx->ctx_smpl_hdr == NULL) goto invalid_free;
/*
* we won't use the buffer format anymore
*/
fmt = ctx->ctx_buf_fmt;
DPRINT(("sampling buffer @%p size %lu vaddr=%p\n",
ctx->ctx_smpl_hdr,
ctx->ctx_smpl_size,
ctx->ctx_smpl_vaddr));
pfm_buf_fmt_exit(fmt, current, NULL, NULL);
/*
* free the buffer
*/
pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
ctx->ctx_smpl_hdr = NULL;
ctx->ctx_smpl_size = 0UL;
return 0;
invalid_free:
printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", task_pid_nr(current));
return -EINVAL;
}
#endif
/*
 * Run the exit callback of a buffer format on behalf of current.
 * A NULL fmt is accepted and ignored; the callback's result is discarded.
 */
static inline void
pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
{
	if (fmt != NULL)
		pfm_buf_fmt_exit(fmt, current, NULL, NULL);
}
/*
 * pfmfs should _never_ be mounted by userland: it would be too much of a
 * security hassle, with no real gain from having the whole thing mounted.
 * So we don't need any operations on the root directory. However, we need
 * a non-trivial d_name - "pfm:" will do nicely and kills the special-casing
 * in procfs.
 */
static struct vfsmount *pfmfs_mnt;
/*
 * Register the pfm filesystem type and create its internal kernel mount
 * (stored in pfmfs_mnt).
 *
 * Returns 0 on success. If registration fails its error is returned; if
 * the mount fails the filesystem is unregistered again and the mount
 * error is returned.
 */
static int __init
init_pfm_fs(void)
{
	int err;

	err = register_filesystem(&pfm_fs_type);
	if (err)
		return err;

	pfmfs_mnt = kern_mount(&pfm_fs_type);
	if (IS_ERR(pfmfs_mnt)) {
		err = PTR_ERR(pfmfs_mnt);
		unregister_filesystem(&pfm_fs_type);
		return err;
	}

	return 0;
}
/*
 * read() on a perfmon file descriptor: deliver the next pending message.
 *
 * Exactly one pfm_msg_t is returned per call. If the message queue is
 * empty the caller sleeps on ctx_msgq_wait unless the file is O_NONBLOCK.
 *
 * @filp: perfmon file
 * @buf:  user buffer, must hold at least sizeof(pfm_msg_t) bytes
 * @size: size of the user buffer
 * @ppos: unused
 *
 * Returns:
 *   sizeof(pfm_msg_t)  a message was copied to buf
 *   -EINVAL            not a perfmon file, no context, buffer too small,
 *                      or the queue unexpectedly yielded no message
 *   -EAGAIN            queue empty and O_NONBLOCK set
 *   -EINTR             a signal is pending while waiting
 *   -EFAULT            buf is not writable
 *
 * The message is first copied to a local variable under the context lock
 * and only handed to copy_to_user() after the lock has been dropped:
 * copy_to_user() may fault and sleep, which is not allowed inside a
 * PROTECT_CTX() section (interrupts are disabled there).
 */
static ssize_t
pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
{
	pfm_context_t *ctx;
	pfm_msg_t *msg;
	pfm_msg_t msg_copy;
	ssize_t ret;
	unsigned long flags;
	DECLARE_WAITQUEUE(wait, current);

	if (PFM_IS_FILE(filp) == 0) {
		printk(KERN_ERR "perfmon: pfm_read: bad magic [%d]\n", task_pid_nr(current));
		return -EINVAL;
	}

	ctx = (pfm_context_t *)filp->private_data;
	if (ctx == NULL) {
		printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", task_pid_nr(current));
		return -EINVAL;
	}

	/*
	 * check even when there is no message
	 */
	if (size < sizeof(pfm_msg_t)) {
		DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
		return -EINVAL;
	}

	PROTECT_CTX(ctx, flags);

	/*
	 * put ourselves on the wait queue
	 */
	add_wait_queue(&ctx->ctx_msgq_wait, &wait);

	/*
	 * The loop is left either with the context still protected and
	 * ret == 0 (a message is available), or with the context already
	 * unprotected and ret < 0.
	 */
	for(;;) {
		/*
		 * check wait queue
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail));

		ret = 0;
		if(PFM_CTXQ_EMPTY(ctx) == 0) break;

		UNPROTECT_CTX(ctx, flags);

		/*
		 * check non-blocking read
		 */
		ret = -EAGAIN;
		if(filp->f_flags & O_NONBLOCK) break;

		/*
		 * check pending signals
		 */
		if(signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * no message, so wait
		 */
		schedule();

		PROTECT_CTX(ctx, flags);
	}
	DPRINT(("[%d] back to running ret=%ld\n", task_pid_nr(current), ret));
	set_current_state(TASK_RUNNING);
	remove_wait_queue(&ctx->ctx_msgq_wait, &wait);

	if (ret < 0) goto abort;

	ret = -EINVAL;
	msg = pfm_get_next_msg(ctx);
	if (msg == NULL) {
		printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, task_pid_nr(current));
		goto abort_locked;
	}

	DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));

	/*
	 * snapshot the message while the queue is stable, then drop the
	 * lock before touching user memory
	 */
	memcpy(&msg_copy, msg, sizeof(pfm_msg_t));

	UNPROTECT_CTX(ctx, flags);

	if (copy_to_user(buf, &msg_copy, sizeof(pfm_msg_t)) != 0)
		return -EFAULT;

	return sizeof(pfm_msg_t);

abort_locked:
	UNPROTECT_CTX(ctx, flags);
abort:
	return ret;
}
/*
 * write() is not supported on perfmon file descriptors: every call is
 * rejected with -EINVAL.
 */
static ssize_t
pfm_write(struct file *file, const char __user *ubuf,
	  size_t size, loff_t *ppos)
{
	ssize_t ret = -EINVAL;

	DPRINT(("pfm_write called\n"));

	return ret;
}
/*
 * poll() on a perfmon file descriptor.
 *
 * Registers on ctx_msgq_wait and reports POLLIN | POLLRDNORM when the
 * context message queue is not empty. Returns 0 when the file is not a
 * perfmon file, has no context, or no message is pending.
 */
static unsigned int
pfm_poll(struct file *filp, poll_table * wait)
{
	pfm_context_t *ctx;
	unsigned long flags;
	unsigned int events;

	if (PFM_IS_FILE(filp) == 0) {
		printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current));
		return 0;
	}

	ctx = (pfm_context_t *)filp->private_data;
	if (ctx == NULL) {
		printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", task_pid_nr(current));
		return 0;
	}

	DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd));

	poll_wait(filp, &ctx->ctx_msgq_wait, wait);

	/* the queue state is sampled under the context lock */
	PROTECT_CTX(ctx, flags);

	events = (PFM_CTXQ_EMPTY(ctx) == 0) ? (POLLIN | POLLRDNORM) : 0;

	UNPROTECT_CTX(ctx, flags);

	DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, events));

	return events;
}
/*
 * ioctl() is not supported on perfmon file descriptors: every call is
 * rejected with -EINVAL.
 */
static int
pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret = -EINVAL;

	DPRINT(("pfm_ioctl called\n"));

	return ret;
}
/*
 * Add or remove filp on the context's async notification queue through
 * fasync_helper() and return its result.
 *
 * interrupts cannot be masked when coming here
 */
static inline int
pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on)
{
	int err = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);

	DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
		task_pid_nr(current),
		fd,
		on,
		ctx->ctx_async_queue, err));

	return err;
}
/*
 * fasync() file operation for perfmon file descriptors.
 *
 * Returns -EBADF when filp is not a perfmon file or carries no context,
 * otherwise the result of pfm_do_fasync().
 */
static int
pfm_fasync(int fd, struct file *filp, int on)
{
	pfm_context_t *ctx;
	int err;

	if (PFM_IS_FILE(filp) == 0) {
		printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", task_pid_nr(current));
		return -EBADF;
	}

	ctx = (pfm_context_t *)filp->private_data;
	if (ctx == NULL) {
		printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", task_pid_nr(current));
		return -EBADF;
	}

	/*
	 * We cannot mask interrupts during this call because it may go to
	 * sleep if memory is not readily available.
	 *
	 * We are protected from the context disappearing by the
	 * get_fd()/put_fd() done in the caller. Serialization of this
	 * function is ensured by the caller.
	 */
	err = pfm_do_fasync(fd, filp, ctx, on);

	DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
		fd,
		on,
		ctx->ctx_async_queue, err));

	return err;
}
#ifdef CONFIG_SMP
/*
 * Cross-CPU callback: force the unload of a system-wide context on the
 * CPU it is bound to.
 *
 * It is run through smp_call_function_single() by
 * pfm_syswide_cleanup_other_cpu(), with info pointing to the context.
 * The caller keeps the context protected but leaves interrupts enabled
 * so that the IPI can be delivered; that is necessary to avoid deadlocks.
 *
 * The request is dropped (with an error message) when it runs on the
 * wrong CPU, or when the PMU owner or PMU context of this CPU do not
 * match the context.
 * NOTE(review): the owner-mismatch message dereferences both owner and
 * ctx->ctx_task through task_pid_nr(); confirm neither can be NULL there.
 */
static void
pfm_syswide_force_stop(void *info)
{
	pfm_context_t *ctx = (pfm_context_t *)info;
	struct pt_regs *regs = task_pt_regs(current);
	struct task_struct *pmu_owner;
	unsigned long irq_flags;
	int err;

	if (ctx->ctx_cpu != smp_processor_id()) {
		printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d  but on CPU%d\n",
			ctx->ctx_cpu,
			smp_processor_id());
		return;
	}

	pmu_owner = GET_PMU_OWNER();
	if (pmu_owner != ctx->ctx_task) {
		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
			smp_processor_id(),
			task_pid_nr(pmu_owner), task_pid_nr(ctx->ctx_task));
		return;
	}

	if (GET_PMU_CTX() != ctx) {
		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n",
			smp_processor_id(),
			GET_PMU_CTX(), ctx);
		return;
	}

	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), task_pid_nr(ctx->ctx_task)));

	/*
	 * the context is already protected by the caller, we simply
	 * need to mask interrupts to avoid a PMU interrupt race on
	 * this CPU
	 */
	local_irq_save(irq_flags);

	err = pfm_context_unload(ctx, NULL, 0, regs);
	if (err != 0) {
		DPRINT(("context_unload returned %d\n", err));
	}

	/*
	 * unmask interrupts, PMU interrupts are now spurious here
	 */
	local_irq_restore(irq_flags);
}
/*
 * Ask the CPU a system-wide context is bound to (ctx->ctx_cpu) to run
 * pfm_syswide_force_stop() on that context. The result of the cross-CPU
 * call is only reported through DPRINT.
 */
static void
pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
{
	int err;

	DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));

	err = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);

	DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, err));
}
#endif /* CONFIG_SMP */
/*
 * flush() file operation: called for each close(). Partially frees
 * resources. When the caller is self-monitoring, the context is unloaded.
 *
 * Steps, in order:
 *  1. if the file uses FASYNC, remove it from the async queue;
 *  2. under PROTECT_CTX(), if the context's task is current, unload the
 *     context (on the context's own CPU for a migrated system-wide
 *     session);
 *  3. after unprotecting, remove the caller's mapping of the sampling
 *     buffer, if any.
 *
 * Returns 0, or -EBADF when filp is not a perfmon file or has no context.
 */
static int
pfm_flush(struct file *filp, fl_owner_t id)
{
pfm_context_t *ctx;
struct task_struct *task;
struct pt_regs *regs;
unsigned long flags;
unsigned long smpl_buf_size = 0UL;
void *smpl_buf_vaddr = NULL;
int state, is_system;
if (PFM_IS_FILE(filp) == 0) {
DPRINT(("bad magic for\n"));
return -EBADF;
}
ctx = (pfm_context_t *)filp->private_data;
if (ctx == NULL) {
printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", task_pid_nr(current));
return -EBADF;
}
/*
 * remove our file from the async queue, if we use this mode.
 * This can be done without the context being protected. We come
 * here when the context has become unreachable by other tasks.
 *
 * We may still have active monitoring at this point and we may
 * end up in pfm_overflow_handler(). However, fasync_helper()
 * operates with interrupts disabled and it cleans up the
 * queue. If the PMU handler is called prior to entering
 * fasync_helper() then it will send a signal. If it is
 * invoked after, it will find an empty queue and no
 * signal will be sent. In both cases, we are safe.
 */
if (filp->f_flags & FASYNC) {
DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
pfm_do_fasync (-1, filp, ctx, 0);
}
PROTECT_CTX(ctx, flags);
/* state is only used for the debug message below */
state = ctx->ctx_state;
is_system = ctx->ctx_fl_system;
task = PFM_CTX_TASK(ctx);
regs = task_pt_regs(task);
DPRINT(("ctx_state=%d is_current=%d\n",
state,
task == current ? 1 : 0));
/*
 * if state == UNLOADED, then task is NULL
 */
/*
 * we must stop and unload because we are losing access to the context.
 */
if (task == current) {
#ifdef CONFIG_SMP
/*
 * the task IS the owner but it migrated to another CPU: that's bad
 * but we must handle this cleanly. Unfortunately, the kernel does
 * not provide a mechanism to block migration (while the context is loaded).
 *
 * We need to release the resource on the ORIGINAL cpu.
 */
if (is_system && ctx->ctx_cpu != smp_processor_id()) {
DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
/*
 * keep context protected but unmask interrupt for IPI
 */
local_irq_restore(flags);
pfm_syswide_cleanup_other_cpu(ctx);
/*
 * restore interrupt masking
 */
local_irq_save(flags);
/*
 * context is unloaded at this point
 */
} else
#endif /* CONFIG_SMP */
{
DPRINT(("forcing unload\n"));
/*
 * stop and unload, returning with state UNLOADED
 * and session unreserved.
 */
pfm_context_unload(ctx, NULL, 0, regs);
DPRINT(("ctx_state=%d\n", ctx->ctx_state));
}
}
/*
 * remove virtual mapping, if any, for the calling task.
 * cannot reset ctx field until last user is calling close().
 *
 * ctx_smpl_vaddr must never be cleared because it is needed
 * by every task with access to the context
 *
 * When called from do_exit(), the mm context is gone already, therefore
 * mm is NULL, i.e., the VMA is already gone and we do not have to
 * do anything here
 */
if (ctx->ctx_smpl_vaddr && current->mm) {
/* only record the mapping here; it is removed after unprotecting */
smpl_buf_vaddr = ctx->ctx_smpl_vaddr;
smpl_buf_size = ctx->ctx_smpl_size;
}
UNPROTECT_CTX(ctx, flags);
/*
 * if there was a mapping, then we systematically remove it
 * at this point. Cannot be done inside critical section
 * because some VM function reenables interrupts.
 *
 * The return value of pfm_remove_smpl_mapping() is ignored.
 */
if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size);
return 0;
}
/*
 * release() file operation: called either on explicit close() or from
 * exit_files().
 * Only the LAST user of the file gets to this point, i.e., it is
 * called only ONCE.
 *
 * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
 * (fput()), i.e., last task to access the file. Nobody else can access the
 * file at this point.
 *
 * When called from exit_files(), the VMA has been freed because exit_mm()
 * is executed before exit_files().
 *
 * When called from exit_files(), the current task is not yet ZOMBIE but we
 * flush the PMU state to the context.
 *
 * Outline:
 *  - UNLOADED context: fall through to the cleanup at "doit";
 *  - MASKED context in blocking mode: flag it as going zombie, wake the
 *    monitored task and sleep on ctx_zombieq until woken;
 *  - otherwise, when the monitored task is not current: on SMP mark the
 *    context ZOMBIE and defer the free, on UP unload it on the spot;
 *  - cleanup: detach and free the sampling buffer, unreserve the session
 *    of a ZOMBIE context, disconnect the file from the context and free
 *    the context unless the free was deferred.
 *
 * Returns 0, or -EBADF when filp is not a perfmon file or has no context.
 */
static int
pfm_close(struct inode *inode, struct file *filp)
{
pfm_context_t *ctx;
struct task_struct *task;
struct pt_regs *regs;
DECLARE_WAITQUEUE(wait, current);
unsigned long flags;
unsigned long smpl_buf_size = 0UL;
void *smpl_buf_addr = NULL;
/* cleared when freeing the context must be left to the monitored task */
int free_possible = 1;
int state, is_system;
DPRINT(("pfm_close called private=%p\n", filp->private_data));
if (PFM_IS_FILE(filp) == 0) {
DPRINT(("bad magic\n"));
return -EBADF;
}
ctx = (pfm_context_t *)filp->private_data;
if (ctx == NULL) {
printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", task_pid_nr(current));
return -EBADF;
}
PROTECT_CTX(ctx, flags);
state = ctx->ctx_state;
is_system = ctx->ctx_fl_system;
task = PFM_CTX_TASK(ctx);
regs = task_pt_regs(task);
DPRINT(("ctx_state=%d is_current=%d\n",
state,
task == current ? 1 : 0));
/*
 * if task == current, then pfm_flush() unloaded the context
 */
if (state == PFM_CTX_UNLOADED) goto doit;
/*
 * context is loaded/masked and task != current, we need to
 * either force an unload or go zombie
 */
/*
 * The task is currently blocked or will block after an overflow.
 * we must force it to wakeup to get out of the
 * MASKED state and transition to the unloaded state by itself.
 *
 * This situation is only possible for per-task mode
 */
if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) {
/*
 * set a "partial" zombie state to be checked
 * upon return from down() in pfm_handle_work().
 *
 * We cannot use the ZOMBIE state, because it is checked
 * by pfm_load_regs() which is called upon wakeup from down().
 * In such case, it would free the context and then we would
 * return to pfm_handle_work() which would access the
 * stale context. Instead, we set a flag invisible to pfm_load_regs()
 * but visible to pfm_handle_work().
 *
 * For some window of time, we have a zombie context with
 * ctx_state = MASKED and not ZOMBIE
 */
ctx->ctx_fl_going_zombie = 1;
/*
 * force task to wake up from MASKED state
 */
complete(&ctx->ctx_restart_done);
DPRINT(("waking up ctx_state=%d\n", state));
/*
 * put ourself to sleep waiting for the other
 * task to report completion
 *
 * the context is protected by mutex, therefore there
 * is no risk of being notified of completion before
 * being actually on the waitq.
 */
set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&ctx->ctx_zombieq, &wait);
UNPROTECT_CTX(ctx, flags);
/*
 * XXX: check for signals :
 * - ok for explicit close
 * - not ok when coming from exit_files()
 */
schedule();
PROTECT_CTX(ctx, flags);
remove_wait_queue(&ctx->ctx_zombieq, &wait);
set_current_state(TASK_RUNNING);
/*
 * context is unloaded at this point
 */
DPRINT(("after zombie wakeup ctx_state=%d for\n", state));
}
else if (task != current) {
#ifdef CONFIG_SMP
/*
 * switch context to zombie state
 */
ctx->ctx_state = PFM_CTX_ZOMBIE;
DPRINT(("zombie ctx for [%d]\n", task_pid_nr(task)));
/*
 * cannot free the context on the spot. deferred until
 * the task notices the ZOMBIE state
 */
free_possible = 0;
#else
pfm_context_unload(ctx, NULL, 0, regs);
#endif
}
doit:
/* reload state, may have changed during opening of critical section */
state = ctx->ctx_state;
/*
 * the context is still attached to a task (possibly current)
 * we cannot destroy it right now
 */
/*
 * we must free the sampling buffer right here because
 * we cannot rely on it being cleaned up later by the
 * monitored task. It is not possible to free vmalloc'ed
 * memory in pfm_load_regs(). Instead, we remove the buffer
 * now. Should there be subsequent PMU overflows originally
 * meant for sampling, they will be converted to spurious
 * and that's fine because the monitoring tool is gone anyway.
 */
if (ctx->ctx_smpl_hdr) {
/* remember the buffer; it is freed after the context is unprotected */
smpl_buf_addr = ctx->ctx_smpl_hdr;
smpl_buf_size = ctx->ctx_smpl_size;
/* no more sampling */
ctx->ctx_smpl_hdr = NULL;
ctx->ctx_fl_is_sampling = 0;
}
DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n",
state,
free_possible,
smpl_buf_addr,
smpl_buf_size));
if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt);
/*
 * Only a ZOMBIE context has its session unreserved here; the unload
 * path taken in pfm_flush() is documented there as returning with the
 * session already unreserved.
 */
if (state == PFM_CTX_ZOMBIE) {
pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu);
}
/*
 * disconnect file descriptor from context must be done
 * before we unlock.
 */
filp->private_data = NULL;
/*
 * if we free on the spot, the context is now completely unreachable
 * from the callers side. The monitored task side is also cut, so we
 * can freely cut.
 *
 * If we have a deferred free, only the caller side is disconnected.
 */
UNPROTECT_CTX(ctx, flags);
/*
 * All memory free operations (especially for vmalloc'ed memory)
 * MUST be done with interrupts ENABLED.
 */
if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
/*
 * return the memory used by the context
 */
if (free_possible) pfm_context_free(ctx);
return 0;
}
/*
 * open() file operation: perfmon files cannot be opened by path, so
 * every attempt fails with -ENXIO.
 */
static int
pfm_no_open(struct inode *irrelevant, struct file *dontcare)
{
	int ret = -ENXIO;

	DPRINT(("pfm_no_open called\n"));

	return ret;
}
/*
 * File operations attached to perfmon context files (see pfm_alloc_file()).
 * read/poll deliver context messages; write and ioctl always fail with
 * -EINVAL; flush runs on every close(), release (pfm_close) only on the
 * last one.
 */
static const struct file_operations pfm_file_ops = {
.llseek = no_llseek,
.read = pfm_read,
.write = pfm_write,
.poll = pfm_poll,
.ioctl = pfm_ioctl,
.open = pfm_no_open, /* special open code to disallow open via /proc */
.fasync = pfm_fasync,
.release = pfm_close,
.flush = pfm_flush
};
/*
 * d_delete callback for pfmfs dentries: unconditionally returns 1.
 */
static int
pfmfs_delete_dentry(struct dentry *dentry)
{
return 1;
}
/*
 * Dentry operations installed on every dentry created by pfm_alloc_file().
 */
static struct dentry_operations pfmfs_dentry_operations = {
.d_delete = pfmfs_delete_dentry,
};
/*
 * Create the file object backing a perfmon context.
 *
 * A new inode is allocated on the internal pfmfs mount, together with a
 * dentry named "[<inode number>]" under the pfmfs root, and a read-only
 * file using pfm_file_ops whose private_data points to ctx.
 *
 * Returns the new file, ERR_PTR(-ENOMEM) when the inode or dentry cannot
 * be allocated, or ERR_PTR(-ENFILE) when the file cannot be allocated.
 */
static struct file *
pfm_alloc_file(pfm_context_t *ctx)
{
	struct file *new_file;
	struct inode *inode;
	struct dentry *dentry;
	char name[32];
	struct qstr qname;

	/*
	 * allocate a new inode
	 */
	inode = new_inode(pfmfs_mnt->mnt_sb);
	if (inode == NULL)
		return ERR_PTR(-ENOMEM);

	DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));

	inode->i_mode = S_IFCHR|S_IRUGO;
	inode->i_uid  = current->fsuid;
	inode->i_gid  = current->fsgid;

	/* the dentry name is derived from the inode number */
	sprintf(name, "[%lu]", inode->i_ino);
	qname.name = name;
	qname.len  = strlen(name);
	qname.hash = inode->i_ino;

	/*
	 * allocate a new dcache entry
	 */
	dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &qname);
	if (dentry == NULL) {
		iput(inode);
		return ERR_PTR(-ENOMEM);
	}

	dentry->d_op = &pfmfs_dentry_operations;
	d_add(dentry, inode);

	new_file = alloc_file(pfmfs_mnt, dentry, FMODE_READ, &pfm_file_ops);
	if (new_file == NULL) {
		dput(dentry);
		return ERR_PTR(-ENFILE);
	}

	new_file->f_flags      = O_RDONLY;
	new_file->private_data = ctx;

	return new_file;
}
/*
 * Map the kernel buffer starting at buf, one page at a time and
 * read-only, into vma starting at user address addr.
 *
 * Returns 0 on success, -ENOMEM as soon as one page cannot be remapped.
 */
static int
pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
{
	unsigned long remaining;

	DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));

	for (remaining = size; remaining > 0; remaining -= PAGE_SIZE) {
		unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;

		if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY) != 0)
			return -ENOMEM;

		addr += PAGE_SIZE;
		buf  += PAGE_SIZE;
	}
	return 0;
}
/*
 * Allocate a sampling buffer and remap it read-only into the user address
 * space of task.
 *
 * @task:       task whose address space receives the mapping
 * @filp:       perfmon file, recorded as vm_file of the new vma
 * @ctx:        context that will own the buffer
 * @rsize:      requested size in bytes (rounded up to a page multiple)
 * @user_vaddr: on success, receives the user virtual address of the buffer
 *
 * On success ctx_smpl_hdr, ctx_smpl_size and ctx_smpl_vaddr of the context
 * describe the new buffer and 0 is returned.
 *
 * Returns -ENOMEM when the size exceeds RLIMIT_MEMLOCK, when an allocation
 * fails, when no free address range is found or when the remap fails. On
 * every failure path the buffer is freed, and the context no longer
 * references it (ctx_smpl_hdr and ctx_smpl_size are reset, so no dangling
 * pointer to freed memory is left behind).
 */
static int
pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr)
{
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma = NULL;
	unsigned long size;
	void *smpl_buf;

	/*
	 * the fixed header + requested size and align to page boundary
	 */
	size = PAGE_ALIGN(rsize);

	DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size));

	/*
	 * check requested size to avoid Denial-of-service attacks
	 * XXX: may have to refine this test
	 * Check against address space limit.
	 *
	 * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
	 * 	return -ENOMEM;
	 */
	if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
		return -ENOMEM;

	/*
	 * We do the easy to undo allocations first.
	 *
	 * pfm_rvmalloc(), clears the buffer, so there is no leak
	 */
	smpl_buf = pfm_rvmalloc(size);
	if (smpl_buf == NULL) {
		DPRINT(("Can't allocate sampling buffer\n"));
		return -ENOMEM;
	}

	DPRINT(("smpl_buf @%p\n", smpl_buf));

	/* allocate vma */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		DPRINT(("Cannot allocate vma\n"));
		goto error_kmem;
	}

	/*
	 * partially initialize the vma for the sampling buffer
	 */
	vma->vm_mm	     = mm;
	vma->vm_file	     = filp;
	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;
	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */

	/*
	 * Now we have everything we need and we can initialize
	 * and connect all the data structures
	 */

	ctx->ctx_smpl_hdr   = smpl_buf;
	ctx->ctx_smpl_size  = size; /* aligned size */

	/*
	 * Let's do the difficult operations next.
	 *
	 * now we atomically find some area in the address space and
	 * remap the buffer in it.
	 */
	down_write(&task->mm->mmap_sem);

	/* find some free area in address space, must have mmap sem held */
	vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
	if (vma->vm_start == 0UL) {
		DPRINT(("Cannot find unmapped area for size %ld\n", size));
		up_write(&task->mm->mmap_sem);
		goto error;
	}
	vma->vm_end = vma->vm_start + size;
	vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;

	DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));

	/* can only be applied to current task, need to have the mm semaphore held when called */
	if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
		DPRINT(("Can't remap buffer\n"));
		up_write(&task->mm->mmap_sem);
		goto error;
	}

	/* the vma holds a reference on the file through vm_file */
	get_file(filp);

	/*
	 * now insert the vma in the vm list for the process, must be
	 * done with mmap lock held
	 */
	insert_vm_struct(mm, vma);

	mm->total_vm  += size >> PAGE_SHIFT;
	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
							vma_pages(vma));
	up_write(&task->mm->mmap_sem);

	/*
	 * keep track of user level virtual address
	 */
	ctx->ctx_smpl_vaddr = (void *)vma->vm_start;
	*(unsigned long *)user_vaddr = vma->vm_start;

	return 0;

error:
	/*
	 * the context was already pointing at the buffer: disconnect it
	 * before the buffer is freed below
	 */
	ctx->ctx_smpl_hdr  = NULL;
	ctx->ctx_smpl_size = 0UL;
	kmem_cache_free(vm_area_cachep, vma);
error_kmem:
	pfm_rvfree(smpl_buf, size);

	return -ENOMEM;
}
/*
 * Permission check inspired by ptrace_attach().
 *
 * Returns non-zero (access denied) when the caller's uid differs from any
 * of the target's euid/suid/uid, or the caller's gid differs from any of
 * the target's egid/sgid/gid, and the caller lacks CAP_SYS_PTRACE.
 * capable() is only consulted when one of the ids differs.
 *
 * XXX: do something better here
 */
static int
pfm_bad_permissions(struct task_struct *task)
{
	int uid_differs, gid_differs;

	DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
		current->uid,
		current->gid,
		task->euid,
		task->suid,
		task->uid,
		task->egid,
		task->sgid));

	uid_differs = (current->uid != task->euid)
		   || (current->uid != task->suid)
		   || (current->uid != task->uid);

	gid_differs = (current->gid != task->egid)
		   || (current->gid != task->sgid)
		   || (current->gid != task->gid);

	if (!uid_differs && !gid_differs)
		return 0;

	return !capable(CAP_SYS_PTRACE);
}
/*
 * Sanity-check the flags of a context creation request.
 *
 * The only combination rejected so far is system-wide monitoring together
 * with blocking notification (-EINVAL). Returns 0 otherwise.
 *
 * task is not used by this function.
 */
static int
pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx)
{
	int ctx_flags = pfx->ctx_flags;

	/*
	 * cannot block in system wide mode
	 */
	if ((ctx_flags & PFM_FL_SYSTEM_WIDE) && (ctx_flags & PFM_FL_NOTIFY_BLOCK)) {
		DPRINT(("cannot use blocking mode when in system wide monitoring\n"));
		return -EINVAL;
	}

	/* probably more to add here */

	return 0;
}
/*
 * Attach the sampling buffer format named by arg->ctx_smpl_buf_id to ctx.
 *
 * The format is looked up, its validate callback is run, the context is
 * linked to it and marked as sampling, and its getsize callback is asked
 * for a buffer size. A non-zero size makes perfmon allocate the buffer and
 * map it into the caller's (current) address space; the user address is
 * reported in arg->ctx_smpl_vaddr. Finally the format's init callback runs.
 *
 * Returns 0 on success, -EINVAL when the format is unknown, otherwise the
 * first error returned by a callback or by the buffer allocation.
 */
static int
pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned int ctx_flags,
		     unsigned int cpu, pfarg_context_t *arg)
{
	pfm_buffer_fmt_t *fmt = NULL;
	unsigned long buf_size = 0UL;
	void *user_addr = NULL;
	void *fmt_arg = NULL;
	int ret;
#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1)

	/* invoke and lock buffer format, if found */
	fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id);
	if (fmt == NULL) {
		DPRINT(("[%d] cannot find buffer format\n", task_pid_nr(task)));
		return -EINVAL;
	}

	/*
	 * buffer argument MUST be contiguous to pfarg_context_t
	 */
	if (fmt->fmt_arg_size)
		fmt_arg = PFM_CTXARG_BUF_ARG(arg);

	ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg);

	DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task_pid_nr(task), ctx_flags, cpu, fmt_arg, ret));

	if (ret)
		return ret;

	/* link buffer format and context */
	ctx->ctx_buf_fmt = fmt;
	ctx->ctx_fl_is_sampling = 1; /* assume record() is defined */

	/*
	 * check if buffer format wants to use perfmon buffer allocation/mapping service
	 */
	ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &buf_size);
	if (ret)
		return ret;

	if (buf_size) {
		/*
		 * buffer is always remapped into the caller's address space
		 */
		ret = pfm_smpl_buffer_alloc(current, filp, ctx, buf_size, &user_addr);
		if (ret)
			return ret;

		/* keep track of user address of buffer */
		arg->ctx_smpl_vaddr = user_addr;
	}

	return pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg);
}
/*
 * Put the software PMU state of a context back to its defaults:
 * implemented PMCs (PMC0 excluded) get their default value, the bitmasks
 * of accessible PMCs/PMDs are rebuilt from pmu_conf, and the used
 * IBR/DBR masks are cleared.
 */
static void
pfm_reset_pmu_state(pfm_context_t *ctx)
{
	int reg;

	/*
	 * install reset values for PMC.
	 */
	for (reg = 1; PMC_IS_LAST(reg) == 0; reg++) {
		if (PMC_IS_IMPL(reg) == 0)
			continue;

		ctx->ctx_pmcs[reg] = PMC_DFL_VAL(reg);
		DPRINT(("pmc[%d]=0x%lx\n", reg, ctx->ctx_pmcs[reg]));
	}

	/*
	 * PMD registers are set to 0UL when the context is memset()
	 */

	/*
	 * On context switched restore, we must restore ALL pmc and ALL pmd even
	 * when they are not actively used by the task. In UP, the incoming process
	 * may otherwise pick up left over PMC, PMD state from the previous process.
	 * As opposed to PMD, stale PMC can cause harm to the incoming
	 * process because they may change what is being measured.
	 * Therefore, we must systematically reinstall the entire
	 * PMC state. In SMP, the same thing is possible on the
	 * same CPU but also between 2 CPUs.
	 *
	 * The problem with PMD is information leaking especially
	 * to user level when psr.sp=0
	 *
	 * There is unfortunately no easy way to avoid this problem
	 * on either UP or SMP. This definitively slows down the
	 * pfm_load_regs() function.
	 */

	/*
	 * bitmask of all PMCs accessible to this context
	 *
	 * PMC0 is treated differently.
	 */
	ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;

	/*
	 * bitmask of all PMDs that are accessible to this context
	 */
	ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];

	DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0]));

	/*
	 * useful in case of re-enable after disable
	 */
	ctx->ctx_used_dbrs[0] = 0UL;
	ctx->ctx_used_ibrs[0] = 0UL;
}
/*
 * Compute the size of the buffer-format specific argument attached to a
 * context request.
 *
 * @arg: the request, interpreted as a pfarg_context_t
 * @sz:  receives 0 when the request carries the null UUID, otherwise the
 *       fmt_arg_size of the buffer format matching the request's UUID
 *
 * Returns 0 on success, or -EINVAL when no buffer format matches the
 * UUID (in which case *sz is left at 0).
 */
static int
pfm_ctx_getsize(void *arg, size_t *sz)
{
	pfarg_context_t *ctx_req = (pfarg_context_t *)arg;
	pfm_buffer_fmt_t *buf_fmt;

	*sz = 0;

	/* only a non-null UUID requires a format lookup */
	if (pfm_uuid_cmp(ctx_req->ctx_smpl_buf_id, pfm_null_uuid) != 0) {
		buf_fmt = pfm_find_buffer_fmt(ctx_req->ctx_smpl_buf_id);
		if (!buf_fmt) {
			DPRINT(("cannot find buffer format\n"));
			return -EINVAL;
		}

		/* get just enough to copy in user parameters */
		*sz = buf_fmt->fmt_arg_size;
		DPRINT(("arg_size=%lu\n", *sz));
	}

	return 0;
}
/*
 * Check whether a context may be attached to a task.
 *
 * Attaching is refused when:
 *	- the task is a kernel task (no mm): -EPERM
 *	- the task is not owned by the caller: -EPERM
 *	- the context is blocking and the task is the caller itself: -EINVAL
 *	- the task is a zombie: -EBUSY
 *	- the task is another task that is neither stopped nor traced: -EBUSY
 *
 * Returns 0 when the task is acceptable. For a task other than the
 * caller, wait_task_inactive() is called before returning so that the
 * task is off any CPU.
 */
static int
pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
{
	const int is_self = (task == current);

	/*
	 * no kernel task, and no task not owned by the caller
	 */
	if (task->mm == NULL) {
		DPRINT(("task [%d] has not memory context (kernel thread)\n", task_pid_nr(task)));
		return -EPERM;
	}
	if (pfm_bad_permissions(task)) {
		DPRINT(("no permission to attach to [%d]\n", task_pid_nr(task)));
		return -EPERM;
	}

	/*
	 * a blocking context cannot be used in self-monitoring mode
	 */
	if (CTX_OVFL_NOBLOCK(ctx) == 0 && is_self) {
		DPRINT(("cannot load a blocking context on self for [%d]\n", task_pid_nr(task)));
		return -EINVAL;
	}

	if (task->exit_state == EXIT_ZOMBIE) {
		DPRINT(("cannot attach to zombie task [%d]\n", task_pid_nr(task)));
		return -EBUSY;
	}

	/*
	 * the remaining checks only apply to another task;
	 * attaching to self is always ok from here on
	 */
	if (!is_self) {
		if (!task_is_stopped_or_traced(task)) {
			DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task_pid_nr(task), task->state));
			return -EBUSY;
		}

		/*
		 * make sure the task is off any CPU
		 */
		wait_task_inactive(task);
	}

	/* more to come... */

	return 0;
}
/*
 * Resolve a pid to a task that the context can be attached to.
 *
 * @ctx:  context to be attached
 * @pid:  target pid; values below 2 are rejected
 * @task: receives the task pointer on success only
 *
 * When pid designates the caller, current is used directly and no
 * reference is taken. Otherwise the task is looked up under
 * tasklist_lock and a reference is taken with get_task_struct(); that
 * reference is dropped again here if the task is found incompatible.
 *
 * Returns 0 on success, -EPERM for pid < 2, -ESRCH when no task matches,
 * or the error returned by pfm_task_incompatible().
 */
static int
pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task)
{
	struct task_struct *target;
	int err;

	/* XXX: need to add more checks here */
	if (pid < 2)
		return -EPERM;

	if (pid == task_pid_vnr(current)) {
		target = current;
	} else {
		read_lock(&tasklist_lock);

		target = find_task_by_vpid(pid);

		/* make sure task cannot go away while we operate on it */
		if (target != NULL)
			get_task_struct(target);

		read_unlock(&tasklist_lock);

		if (target == NULL)
			return -ESRCH;
	}

	err = pfm_task_incompatible(ctx, target);
	if (err != 0) {
		/* only a looked-up task is released here */
		if (target != current)
			pfm_put_task(target);
		return err;
	}

	*task = target;
	return 0;
}
/*
 * Create a new perfmon context and bind it to a new file descriptor.
 *
 * @ctx:   incoming value is ignored; the variable is reused for the
 *         newly allocated context
 * @arg:   request, interpreted as a pfarg_context_t; ctx_fd is filled in
 *         with the new file descriptor
 * @count: unused here
 * @regs:  passed to pfm_buf_fmt_exit() on the buffer setup error path
 *
 * Steps: validate the request, reserve a file descriptor, allocate the
 * context and its file, set up the sampling buffer format when the
 * request carries a non-null UUID, reset the soft PMU state and finally
 * install the file on the descriptor.
 *
 * Returns 0 on success or a negative error code; on failure everything
 * acquired so far is released in reverse order.
 */
static int
pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
{
	pfarg_context_t *req = (pfarg_context_t *)arg;
	struct file *file;
	struct path fpath;
	int flags;
	int new_fd;
	int err;

	/* let's check the arguments first */
	err = pfarg_is_sane(current, req);
	if (err < 0)
		return err;

	flags = req->ctx_flags;

	new_fd = get_unused_fd();
	if (new_fd < 0)
		return new_fd;

	ctx = pfm_context_alloc(flags);
	if (ctx == NULL) {
		err = -ENOMEM;
		goto out_put_fd;
	}

	file = pfm_alloc_file(ctx);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto out_free_ctx;
	}

	/* record the descriptor in the context and report it to the caller */
	ctx->ctx_fd = new_fd;
	req->ctx_fd = ctx->ctx_fd;

	/*
	 * does the user want to sample?
	 */
	if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid) != 0) {
		err = pfm_setup_buffer_fmt(current, file, ctx, flags, 0, req);
		if (err)
			goto out_drop_file;
	}

	DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n",
		ctx,
		flags,
		ctx->ctx_fl_system,
		ctx->ctx_fl_block,
		ctx->ctx_fl_excl_idle,
		ctx->ctx_fl_no_msg,
		ctx->ctx_fd));

	/*
	 * initialize soft PMU state
	 */
	pfm_reset_pmu_state(ctx);

	fd_install(new_fd, file);

	return 0;

out_drop_file:
	/* copy the path first: it is released after the file itself */
	fpath = file->f_path;
	put_filp(file);
	path_put(&fpath);

	if (ctx->ctx_buf_fmt) {
		pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs);
	}
out_free_ctx:
	pfm_context_free(ctx);

out_put_fd:
	put_unused_fd(new_fd);
	return err;
}
/*
 * Compute the value a counter is reset to.
 *
 * @reg:           counter descriptor
 * @is_long_reset: non-zero selects reg->long_reset, zero selects
 *                 reg->short_reset
 *
 * When PFM_REGFL_RANDOM is set in reg->flags, the current seed masked by
 * reg->mask is subtracted from the reset value and reg->seed is advanced
 * with carta_random32(). If the mask has bits above bit 31, a second
 * carta_random32() call supplies the upper 32 bits of the new seed.
 *
 * The result is also stored in reg->lval. Returns the new value.
 */
static inline unsigned long
pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
{
	extern unsigned long carta_random32 (unsigned long seed);
	unsigned long seed_now, seed_next, rnd_mask;
	unsigned long reset_val;

	if (is_long_reset)
		reset_val = reg->long_reset;
	else
		reset_val = reg->short_reset;

	if (reg->flags & PFM_REGFL_RANDOM) {
		seed_now = reg->seed;
		rnd_mask = reg->mask;

		seed_next = carta_random32(seed_now);

		/* counter values are negative numbers! */
		reset_val -= (seed_now & rnd_mask);

		/* wide mask: construct a full 64-bit random value */
		if ((rnd_mask >> 32) != 0)
			seed_next |= carta_random32(seed_now >> 32) << 32;

		reg->seed = seed_next;
	}

	reg->lval = reset_val;
	return reset_val;
}
/*
 * Reset overflowed counters, and the registers they request to be reset
 * along with them, in the software state of the context only: new values
 * are stored in ctx->ctx_pmds[].val.
 *
 * @ctx:           context owning the registers
 * @ovfl_regs:     bitmask of overflowed PMDs; only ovfl_regs[0] is read
 * @is_long_reset: selects the long or the short reset value
 */
static void
pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
{
	unsigned long pending = ovfl_regs[0] >> PMU_FIRST_COUNTER;
	unsigned long others = 0UL;
	unsigned long val;
	int pmd;

	/*
	 * First pass: restore the reset value on each overflowed counter
	 * and collect the other registers it wants to have reset.
	 */
	for (pmd = PMU_FIRST_COUNTER; pending != 0UL; pmd++, pending >>= 1) {
		if (pending & 0x1UL) {
			val = pfm_new_counter_value(&ctx->ctx_pmds[pmd], is_long_reset);
			ctx->ctx_pmds[pmd].val = val;
			others |= ctx->ctx_pmds[pmd].reset_pmds[0];

			DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", pmd, val));
		}
	}

	/*
	 * Second pass: reset the other registers collected above.
	 */
	for (pmd = 0; others != 0UL; pmd++, others >>= 1) {
		if (others & 0x1) {
			val = pfm_new_counter_value(&ctx->ctx_pmds[pmd], is_long_reset);
			ctx->ctx_pmds[pmd].val = val;

			DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", is_long_reset ? "long" : "short", pmd, val));
		}
	}
}
/*
 * Reset overflowed counters and the registers they request to be reset
 * along with them.
 *
 * @ctx:           context owning the registers
 * @ovfl_regs:     bitmask of overflowed PMDs; only ovfl_regs[0] is read
 * @is_long_reset: selects the long or the short reset value
 *
 * When the context is in PFM_CTX_MASKED state the work is handed over to
 * pfm_reset_regs_masked(). Otherwise new values are written with
 * pfm_write_soft_counter() for counting PMDs and with ia64_set_pmd() for
 * the others, followed by a single ia64_srlz_d() at the end.
 */
static void
pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
{
	unsigned long pending = ovfl_regs[0];
	unsigned long others = 0UL;
	unsigned long val;
	int pmd;

	DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset));

	if (ctx->ctx_state == PFM_CTX_MASKED) {
		pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset);
		return;
	}

	/*
	 * First pass: restore the reset value on each overflowed counter
	 * and collect the other registers it wants to have reset.
	 */
	pending >>= PMU_FIRST_COUNTER;
	for (pmd = PMU_FIRST_COUNTER; pending != 0UL; pmd++, pending >>= 1) {
		if (pending & 0x1UL) {
			val = pfm_new_counter_value(&ctx->ctx_pmds[pmd], is_long_reset);
			others |= ctx->ctx_pmds[pmd].reset_pmds[0];

			DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", pmd, val));

			pfm_write_soft_counter(ctx, pmd, val);
		}
	}

	/*
	 * Second pass: reset the other registers collected above.
	 */
	for (pmd = 0; others != 0UL; pmd++, others >>= 1) {
		if (others & 0x1) {
			val = pfm_new_counter_value(&ctx->ctx_pmds[pmd], is_long_reset);

			if (PMD_IS_COUNTING(pmd)) {
				pfm_write_soft_counter(ctx, pmd, val);
			} else {
				ia64_set_pmd(pmd, val);
			}

			DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", is_long_reset ? "long" : "short", pmd, val));
		}
	}

	ia64_srlz_d();
}
static int
pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
{
struct task_struct *task;
pfarg_reg_t *req = (pfarg_reg_t *)arg;
unsigned long value, pmc_pm;
unsigned long smpl_pmds, reset_pmds, impl_pmds;
unsigned int cnum, reg_flags, flags, pmc_type;