/*
* perfmon_intr.c: perfmon2 interrupt handling
*
* This file implements the perfmon2 interface which
* provides access to the hardware performance counters
* of the host processor.
*
* The initial version of perfmon.c was written by
* Ganesh Venkitachalam, IBM Corp.
*
* Then it was modified for perfmon-1.x by Stephane Eranian and
* David Mosberger, Hewlett Packard Co.
*
* Version Perfmon-2.x is a complete rewrite of perfmon-1.x
* by Stephane Eranian, Hewlett Packard Co.
*
* Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
* Contributed by Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*
* More information about perfmon available at:
* http://perfmon2.sf.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
* 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/perfmon_kern.h>
#include "perfmon_priv.h"
/**
* pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation
* @ctx: context to operate on
* @set: set to operate on
*
* The function returns the number of 64-bit overflows detected.
*
* 64-bit software pmds are updated for overflowed pmd registers.
* set->reset_pmds is updated with the list of pmds to reset.
*
* In any case, set->npend_ovfls is cleared
*/
static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx,
struct pfm_event_set *set,
u32 *ovfl_ctrl)
{
u64 ovfl_thres, old_val, new_val, ovfl_mask;
u16 num_64b_ovfls, has_ovfl_sw, must_switch;
u16 max_pmd;
int i;
num_64b_ovfls = must_switch = 0;
max_pmd = ctx->regs.max_pmd;
ovfl_mask = pfm_pmu_conf->ovfl_mask;
has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH;
bitmap_zero(cast_ulp(set->reset_pmds), max_pmd);
for_each_set_bit(i, cast_ulp(set->povfl_pmds), max_pmd) {
/*
* Update software value for counters ONLY
*
* Note that the pmd is not necessarily 0 at this point as
* qualified events may have happened before the PMU was
* frozen. The residual count is not taken into consideration
* here but will be with any read of the pmd
*/
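/*
* Illustrative example (the width is hypothetical, not tied to any
* particular PMU): with a 47-bit hardware counter, ovfl_mask would be
* (1ULL << 47) - 1, so the "new_val += 1 + ovfl_mask" below credits
* exactly one full hardware counter period (2^47 events) to the 64-bit
* software value each time the hardware register wraps.
*/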
ovfl_thres = set->pmds[i].ovflsw_thres;
if (likely(test_bit(i, cast_ulp(ctx->regs.cnt_pmds)))) {
old_val = new_val = set->pmds[i].value;
new_val += 1 + ovfl_mask;
set->pmds[i].value = new_val;
} else {
/*
* for non-counters which interrupt, e.g., AMD IBS,
* we consider this equivalent to a 64-bit counter
* overflow.
*/
old_val = 1; new_val = 0;
}
/*
* check for 64-bit overflow condition
*/
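/*
* With the emulation above, the 64-bit value typically wraps because a
* sampling period P is commonly programmed as the 64-bit value -P
* (i.e., 2^64 - P), so old_val > new_val after roughly P qualified
* events. This is stated here for illustration only.
*/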
if (likely(old_val > new_val)) {
num_64b_ovfls++;
if (has_ovfl_sw && ovfl_thres > 0) {
if (ovfl_thres == 1)
must_switch = 1;
set->pmds[i].ovflsw_thres = ovfl_thres - 1;
}
/*
* what to reset because of this overflow
* - the overflowed register
* - its reset_smpls
*/
__set_bit(i, cast_ulp(set->reset_pmds));
bitmap_or(cast_ulp(set->reset_pmds),
cast_ulp(set->reset_pmds),
cast_ulp(set->pmds[i].reset_pmds),
max_pmd);
} else {
/*
* only keep track of 64-bit overflows or
* assimilated
*/
__clear_bit(i, cast_ulp(set->povfl_pmds));
/*
* on some PMU, it may be necessary to re-arm the PMD
*/
pfm_arch_ovfl_reset_pmd(ctx, i);
}
PFM_DBG_ovfl("ovfl=%s pmd%u new=0x%llx old=0x%llx "
"hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u "
"o_thres=%llu o_thres_ref=%llu",
old_val > new_val ? "64-bit" : "HW",
i,
(unsigned long long)new_val,
(unsigned long long)old_val,
(unsigned long long)pfm_read_pmd(ctx, i),
(unsigned long long)set->povfl_pmds[0],
must_switch,
(unsigned long long)set->pmds[i].ovflsw_thres,
(unsigned long long)set->pmds[i].ovflsw_ref_thres);
}
/*
* update public bitmask of 64-bit overflowed pmds
*/
if (num_64b_ovfls)
bitmap_copy(cast_ulp(set->ovfl_pmds),
cast_ulp(set->povfl_pmds),
max_pmd);
if (must_switch)
*ovfl_ctrl |= PFM_OVFL_CTRL_SWITCH;
/*
* mark the overflows as consumed
*/
set->npend_ovfls = 0;
bitmap_zero(cast_ulp(set->povfl_pmds), max_pmd);
return num_64b_ovfls;
}
/**
* pfm_intr_get_smpl_pmds_values - copy 64-bit pmd values for sampling format
* @ctx: context to work on
* @set: current event set
* @arg: overflow arg to be passed to format
* @smpl_pmds: list of PMDs of interest for the overflowed register
*
* build an array of 64-bit PMD values based on smpl_pmds. Values are
* stored in increasing order of the PMD indexes
*/
static void pfm_intr_get_smpl_pmds_values(struct pfm_context *ctx,
struct pfm_event_set *set,
struct pfm_ovfl_arg *arg,
u64 *smpl_pmds)
{
u16 j, k;
u64 new_val, ovfl_mask;
u64 *cnt_pmds;
cnt_pmds = ctx->regs.cnt_pmds;
ovfl_mask = pfm_pmu_conf->ovfl_mask;
k = 0;
for_each_set_bit(j, cast_ulp(smpl_pmds), ctx->regs.max_pmd) {
new_val = pfm_read_pmd(ctx, j);
/* for counters, build 64-bit value */
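/*
* Example with a hypothetical 47-bit counter: the low 47 bits
* (ovfl_mask) come from the hardware register just read, while the
* upper bits come from the 64-bit software value maintained in
* set->pmds[j].value, yielding the full 64-bit count.
*/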
if (test_bit(j, cast_ulp(cnt_pmds)))
new_val = (set->pmds[j].value & ~ovfl_mask)
| (new_val & ovfl_mask);
arg->smpl_pmds_values[k++] = new_val;
PFM_DBG_ovfl("s_pmd_val[%u]=pmd%u=0x%llx", k, j,
(unsigned long long)new_val);
}
arg->num_smpl_pmds = k;
}
/**
* pfm_intr_process_smpl_fmt - handle sampling format callback
* @ctx: context to work on
* @set: current event set
* @ip: interrupted instruction pointer
* @now: timestamp
* @num_ovfls: number of 64-bit overflows
* @ovfl_ctrl: set of controls for interrupt handler tail processing
* @regs: register state
*
* Prepare argument (ovfl_arg) to be passed to sampling format callback, then
* invoke the callback (fmt_handler)
*/
static int pfm_intr_process_smpl_fmt(struct pfm_context *ctx,
struct pfm_event_set *set,
unsigned long ip,
u64 now,
u64 num_ovfls,
u32 *ovfl_ctrl,
struct pt_regs *regs)
{
struct pfm_ovfl_arg *ovfl_arg;
u64 start_cycles, end_cycles;
u16 max_pmd;
int i, ret = 0;
ovfl_arg = &ctx->ovfl_arg;
ovfl_arg->active_set = set->id;
max_pmd = ctx->regs.max_pmd;
/*
* go over all 64-bit overflows
*/
for_each_set_bit(i, cast_ulp(set->ovfl_pmds), max_pmd) {
/*
* prepare argument to fmt_handler
*/
ovfl_arg->ovfl_pmd = i;
ovfl_arg->ovfl_ctrl = 0;
ovfl_arg->pmd_last_reset = set->pmds[i].lval;
ovfl_arg->pmd_eventid = set->pmds[i].eventid;
ovfl_arg->num_smpl_pmds = 0;
/*
* copy values of pmds of interest, if any
* Sampling format may use them
* We do not initialize the unused smpl_pmds_values
*/
if (!bitmap_empty(cast_ulp(set->pmds[i].smpl_pmds), max_pmd))
pfm_intr_get_smpl_pmds_values(ctx, set, ovfl_arg,
set->pmds[i].smpl_pmds);
pfm_stats_inc(fmt_handler_calls);
/*
* call format record (handler) routine
*/
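/*
* The argument block prepared above lives in ctx->ovfl_arg, which the
* format can reach through ctx. Bits the format sets in
* ovfl_arg->ovfl_ctrl are merged into the caller's ovfl_ctrl below,
* and a non-zero return value is reported as an abort of the remaining
* overflow processing at the end of this function.
*/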
start_cycles = sched_clock();
ret = (*ctx->smpl_fmt->fmt_handler)(ctx, ip, now, regs);
end_cycles = sched_clock();
/*
* The reset_pmds mask is constructed automatically
* on overflow. When the actual reset takes place
* depends on the masking, switch and notification
* status. It may be deferred until pfm_restart().
*/
*ovfl_ctrl |= ovfl_arg->ovfl_ctrl;
pfm_stats_add(fmt_handler_ns, end_cycles - start_cycles);
}
/*
* when the format cannot handle the rest of the overflow, we abort
*/
if (ret)
PFM_DBG_ovfl("handler aborted at PMD%u ret=%d", i, ret);
return ret;
}
/**
* pfm_overflow_handler - main overflow processing routine.
* @ctx: context to work on (always current context)
* @set: current event set
* @ip: interrupted instruction pointer
* @regs: machine state
*
* set->npend_ovfls is 0 when returning from this function even though
* set->ovfl_pmds[] may have bits set. After this function returns,
* set->npend_ovfls must never be used to determine if there was a
* pending overflow.
*/
static void pfm_overflow_handler(struct pfm_context *ctx,
struct pfm_event_set *set,
unsigned long ip,
struct pt_regs *regs)
{
struct pfm_event_set *set_orig;
u64 now;
u32 ovfl_ctrl;
u16 max_intr, max_pmd;
u16 num_ovfls;
int ret, has_notify;
/*
* take timestamp
*/
now = sched_clock();
max_pmd = ctx->regs.max_pmd;
max_intr = ctx->regs.max_intr_pmd;
set_orig = set;
ovfl_ctrl = 0;
/*
* skip ZOMBIE case
*/
if (unlikely(ctx->state == PFM_CTX_ZOMBIE))
goto stop_monitoring;
PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p, blocking=%d "
"u_pmds=0x%llx use_fmt=%u",
(unsigned long long)set->povfl_pmds[0],
set->npend_ovfls,
(void *)ip,
ctx->flags.block,
(unsigned long long)set->used_pmds[0],
!!ctx->smpl_fmt);
/*
* return number of 64-bit overflows
*/
num_ovfls = pfm_intr_process_64bit_ovfls(ctx, set, &ovfl_ctrl);
/*
* there were no 64-bit overflows
* nothing else to do
*/
if (!num_ovfls)
return;
/*
* tmp_ovfl_notify = ovfl_pmds & ovfl_notify
* with:
* - ovfl_pmds: last 64-bit overflowed pmds
* - ovfl_notify: notify on overflow registers
*/
bitmap_and(cast_ulp(ctx->tmp_ovfl_notify),
cast_ulp(set->ovfl_pmds),
cast_ulp(set->ovfl_notify),
max_intr);
has_notify = !bitmap_empty(cast_ulp(ctx->tmp_ovfl_notify), max_intr);
/*
* check for sampling format and invoke fmt_handler
*/
if (likely(ctx->smpl_fmt)) {
pfm_intr_process_smpl_fmt(ctx, set, ip, now, num_ovfls,
&ovfl_ctrl, regs);
} else {
/*
* When no sampling format is used, the default
* is:
* - mask monitoring if not switching
* - notify user if requested
*
* If notification is not requested, monitoring is masked
* and overflowed registers are not reset (saturation).
* This mimics the behavior of the default sampling format.
*/
ovfl_ctrl |= PFM_OVFL_CTRL_NOTIFY;
if (has_notify || !(ovfl_ctrl & PFM_OVFL_CTRL_SWITCH))
ovfl_ctrl |= PFM_OVFL_CTRL_MASK;
}
PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx "
"r_pmds=0x%llx ovfl_ctrl=0x%x",
set->id,
(unsigned long long)ctx->tmp_ovfl_notify[0],
(unsigned long long)set->ovfl_pmds[0],
(unsigned long long)set->reset_pmds[0],
ovfl_ctrl);
/*
* execute the various controls
* ORDER MATTERS
*/
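/*
* Masking is evaluated first because it cancels the RESET control and
* defers SWITCH by turning it into the PFM_SETFL_PRIV_SWITCH flag.
* Set switching comes next so that the final reset below operates on
* the then-active set. Notification is passed the original set id.
*/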
/*
* mask monitoring
*/
if (ovfl_ctrl & PFM_OVFL_CTRL_MASK) {
pfm_mask_monitoring(ctx, set);
/*
* when masking, reset is deferred until
* pfm_restart()
*/
ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET;
/*
* when masking, switching is deferred until
* pfm_restart and we need to remember it
*/
if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) {
set->priv_flags |= PFM_SETFL_PRIV_SWITCH;
ovfl_ctrl &= ~PFM_OVFL_CTRL_SWITCH;
}
}
/*
* switch event set
*/
if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) {
pfm_switch_sets_from_intr(ctx);
/* update view of active set */
set = ctx->active_set;
}
/*
* send overflow notification
*
* only necessary if at least one overflowed
* register had the notify flag set
*/
if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) {
/*
* block on notify, not on masking
*/
if (ctx->flags.block)
pfm_post_work(current, ctx, PFM_WORK_BLOCK);
/*
* send notification and pass the original set id.
* If an error occurs (queue full, for instance), default
* to masking monitoring, i.e., saturate
*/
ret = pfm_ovfl_notify(ctx, set_orig, ip);
if (unlikely(ret)) {
if (ctx->state == PFM_CTX_LOADED) {
pfm_mask_monitoring(ctx, set);
ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET;
}
} else {
ctx->flags.can_restart++;
PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart);
}
}
/*
* reset overflowed registers
*/
if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) {
u16 nn;
nn = bitmap_weight(cast_ulp(set->reset_pmds), max_pmd);
if (nn)
pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT);
}
return;
stop_monitoring:
/*
* This does not happen for a system-wide context nor for a
* self-monitored context. We cannot attach to a kernel-only
* thread, thus it is safe to set TIF bits: the thread will
* eventually leave the kernel or die, and the context will be
* caught and cleaned up in either pfm_handle_work() or
* pfm_exit_thread().
*
* Mask until we get to pfm_handle_work()
*/
pfm_mask_monitoring(ctx, set);
PFM_DBG_ovfl("ctx is zombie, converted to spurious");
pfm_post_work(current, ctx, PFM_WORK_ZOMBIE);
}
/**
* __pfm_interrupt_handler - 1st level interrupt handler
* @ip: interrupted instruction pointer
* @regs: machine state
*
* Function is static because we use a wrapper to easily capture timing
* information.
*
* Context locking is necessary to avoid concurrent accesses from other CPUs:
* - for per-thread contexts, we must prevent pfm_restart(), which can run
*   while the context is LOADED or MASKED
*/
static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
{
struct task_struct *task;
struct pfm_context *ctx;
struct pfm_event_set *set;
task = __get_cpu_var(pmu_owner);
ctx = __get_cpu_var(pmu_ctx);
/*
* verify if there is a context on this CPU
*/
if (unlikely(ctx == NULL)) {
PFM_DBG_ovfl("no ctx");
goto spurious;
}
/*
* we need to lock context because it could be accessed
* from another CPU. Depending on the priority level of
* the PMU interrupt or the arch, it may be necessary to
* mask interrupts altogether to avoid a race condition with
* the timer interrupt in case of time-based set switching,
* for instance.
*/
spin_lock(&ctx->lock);
set = ctx->active_set;
/*
* For SMP per-thread, it is not possible to have
* owner != NULL && task != current.
*
* For UP per-thread, because of lazy save, it
* is possible to receive an interrupt in another task
* which is not using the PMU. This means
* that the interrupt was in-flight at the
* time of pfm_ctxswout_thread(). In that
* case, it will be replayed when the task
* is scheduled again. Hence we convert to spurious.
*
* The basic rule is that an overflow is always
* processed in the context of the task that
* generated it for all per-thread contexts.
*
* for system-wide, task is always NULL
*/
#ifndef CONFIG_SMP
if (unlikely((task && current->pfm_context != ctx))) {
PFM_DBG_ovfl("spurious: not owned by current task");
goto spurious;
}
#endif
if (unlikely(ctx->state == PFM_CTX_MASKED)) {
PFM_DBG_ovfl("spurious: monitoring masked");
goto spurious;
}
/*
* check that monitoring is active, otherwise convert
* to spurious
*/
if (unlikely(!pfm_arch_is_active(ctx))) {
PFM_DBG_ovfl("spurious: monitoring non active");
goto spurious;
}
/*
* freeze PMU and collect overflowed PMD registers
* into set->povfl_pmds. Number of overflowed PMDs
* reported in set->npend_ovfls
*/
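/*
* set->povfl_pmds and set->npend_ovfls form the private pending state:
* they are consumed and cleared by pfm_intr_process_64bit_ovfls(),
* which publishes the result through set->ovfl_pmds.
*/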
pfm_arch_intr_freeze_pmu(ctx, set);
/*
* no overflow detected, interrupt may have come
* from the previous thread running on this CPU
*/
if (unlikely(!set->npend_ovfls)) {
PFM_DBG_ovfl("no npend_ovfls");
goto spurious;
}
pfm_stats_inc(ovfl_intr_regular_count);
/*
* invoke actual handler
*/
pfm_overflow_handler(ctx, set, ip, regs);
/*
* unfreeze PMU; monitoring may not actually be restarted
* if context is MASKED
*/
pfm_arch_intr_unfreeze_pmu(ctx);
spin_unlock(&ctx->lock);
return;
spurious:
/* ctx may be NULL */
pfm_arch_intr_unfreeze_pmu(ctx);
if (ctx)
spin_unlock(&ctx->lock);
pfm_stats_inc(ovfl_intr_spurious_count);
}
/**
* pfm_interrupt_handler - 1st level interrupt handler
* @ip: interrupted instruction pointer
* @regs: machine state
*
* Function called from the low-level assembly code or arch-specific perfmon
* code. Simple wrapper used for timing purposes. Actual work is done in
* __pfm_interrupt_handler()
*/
void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
{
u64 start;
pfm_stats_inc(ovfl_intr_all_count);
BUG_ON(!irqs_disabled());
start = sched_clock();
__pfm_interrupt_handler(ip, regs);
pfm_stats_add(ovfl_intr_ns, sched_clock() - start);
}
EXPORT_SYMBOL(pfm_interrupt_handler);