patches/hwlatdetect.patch - pub/scm/linux/kernel/git/paulg/4.8-rt-patches - Git at Google

 Subject: hwlatdetect.patch
 From: Carsten Emde <C.Emde@osadl.org>
 Date: Tue, 19 Jul 2011 13:53:12 +0100

 Jon Masters developed this wonderful SMI detector. For details please
 consult Documentation/hwlat_detector.txt. It could be ported to Linux
 3.0 RT without any major change.

 Signed-off-by: Carsten Emde <C.Emde@osadl.org>

 ---
  Documentation/hwlat_detector.txt |   64 ++
  drivers/misc/Kconfig             |   29
  drivers/misc/Makefile            |    1
  drivers/misc/hwlat_detector.c    | 1212 +++++++++++++++++++++++++++++++++++++++
  4 files changed, 1306 insertions(+)

 --- /dev/null
 +++ b/Documentation/hwlat_detector.txt
 @@ -0,0 +1,64 @@
 +Introduction:
 +-------------
 +
 +The module hwlat_detector is a special purpose kernel module that is used to
 +detect large system latencies induced by the behavior of certain underlying
 +hardware or firmware, independent of Linux itself. The code was developed
 +originally to detect SMIs (System Management Interrupts) on x86 systems,
 +however there is nothing x86 specific about this patchset. It was
 +originally written for use by the "RT" patch since the Real Time
 +kernel is highly latency sensitive.
 +
 +SMIs are usually not serviced by the Linux kernel, which typically does not
 +even know that they are occuring. SMIs are instead are set up by BIOS code
 +and are serviced by BIOS code, usually for "critical" events such as
 +management of thermal sensors and fans. Sometimes though, SMIs are used for
 +other tasks and those tasks can spend an inordinate amount of time in the
 +handler (sometimes measured in milliseconds). Obviously this is a problem if
 +you are trying to keep event service latencies down in the microsecond range.
 +
 +The hardware latency detector works by hogging all of the cpus for configurable
 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
 +for some period, then looking for gaps in the TSC data. Any gap indicates a
 +time when the polling was interrupted and since the machine is stopped and
 +interrupts turned off the only thing that could do that would be an SMI.
 +
 +Note that the SMI detector should *NEVER* be used in a production environment.
 +It is intended to be run manually to determine if the hardware platform has a
 +problem with long system firmware service routines.
 +
 +Usage:
 +------
 +
 +Loading the module hwlat_detector passing the parameter "enabled=1" (or by
 +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only
 +step required to start the hwlat_detector. It is possible to redefine the
 +threshold in microseconds (us) above which latency spikes will be taken
 +into account (parameter "threshold=").
 +
 +Example:
 +
 +	# modprobe hwlat_detector enabled=1 threshold=100
 +
 +After the module is loaded, it creates a directory named "hwlat_detector" under
 +the debugfs mountpoint, "/debug/hwlat_detector" for this text. It is necessary
 +to have debugfs mounted, which might be on /sys/debug on your system.
 +
 +The /debug/hwlat_detector interface contains the following files:
 +
 +count			- number of latency spikes observed since last reset
 +enable			- a global enable/disable toggle (0/1), resets count
 +max			- maximum hardware latency actually observed (usecs)
 +sample			- a pipe from which to read current raw sample data
 +			  in the format <timestamp> <latency observed usecs>
 +			  (can be opened O_NONBLOCK for a single sample)
 +threshold		- minimum latency value to be considered (usecs)
 +width			- time period to sample with CPUs held (usecs)
 +			  must be less than the total window size (enforced)
 +window			- total period of sampling, width being inside (usecs)
 +
 +By default we will set width to 500,000 and window to 1,000,000, meaning that
 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
 +observe any latencies that exceed the threshold (initially 100 usecs),
 +then we write to a global sample ring buffer of 8K samples, which is
 +consumed by reading from the "sample" (pipe) debugfs file interface.
 --- a/drivers/misc/Kconfig
 +++ b/drivers/misc/Kconfig
 @@ -122,6 +122,35 @@ config IBM_ASM
  	  for information on the specific driver level and support statement
  	  for your IBM server.

 +config HWLAT_DETECTOR
 +	tristate "Testing module to detect hardware-induced latencies"
 +	depends on DEBUG_FS
 +	depends on RING_BUFFER
 +	default m
 +	---help---
 +	  A simple hardware latency detector. Use this module to detect
 +	  large latencies introduced by the behavior of the underlying
 +	  system firmware external to Linux. We do this using periodic
 +	  use of stop_machine to grab all available CPUs and measure
 +	  for unexplainable gaps in the CPU timestamp counter(s). By
 +	  default, the module is not enabled until the "enable" file
 +	  within the "hwlat_detector" debugfs directory is toggled.
 +
 +	  This module is often used to detect SMI (System Management
 +	  Interrupts) on x86 systems, though is not x86 specific. To
 +	  this end, we default to using a sample window of 1 second,
 +	  during which we will sample for 0.5 seconds. If an SMI or
 +	  similar event occurs during that time, it is recorded
 +	  into an 8K samples global ring buffer until retreived.
 +
 +	  WARNING: This software should never be enabled (it can be built
 +	  but should not be turned on after it is loaded) in a production
 +	  environment where high latencies are a concern since the
 +	  sampling mechanism actually introduces latencies for
 +	  regular tasks while the CPU(s) are being held.
 +
 +	  If unsure, say N
 +
  config PHANTOM
  	tristate "Sensable PHANToM (PCI)"
  	depends on PCI
 --- a/drivers/misc/Makefile
 +++ b/drivers/misc/Makefile
 @@ -39,6 +39,7 @@ obj-$(CONFIG_C2PORT)		+= c2port/
  obj-$(CONFIG_HMC6352)		+= hmc6352.o
  obj-y				+= eeprom/
  obj-y				+= cb710/
 +obj-$(CONFIG_HWLAT_DETECTOR)	+= hwlat_detector.o
  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)	+= spear13xx_pcie_gadget.o
  obj-$(CONFIG_VMWARE_BALLOON)	+= vmw_balloon.o
  obj-$(CONFIG_ARM_CHARLCD)	+= arm-charlcd.o
 --- /dev/null
 +++ b/drivers/misc/hwlat_detector.c
 @@ -0,0 +1,1212 @@
 +/*
 + * hwlat_detector.c - A simple Hardware Latency detector.
 + *
 + * Use this module to detect large system latencies induced by the behavior of
 + * certain underlying system hardware or firmware, independent of Linux itself.
 + * The code was developed originally to detect the presence of SMIs on Intel
 + * and AMD systems, although there is no dependency upon x86 herein.
 + *
 + * The classical example usage of this module is in detecting the presence of
 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
 + * LPC (or other device) to generate a special interrupt under certain
 + * circumstances, for example, upon expiration of a special SMI timer device,
 + * due to certain external thermal readings, on certain I/O address accesses,
 + * and other situations. An SMI hits a special CPU pin, triggers a special
 + * SMI mode (complete with special memory map), and the OS is unaware.
 + *
 + * Although certain hardware-inducing latencies are necessary (for example,
 + * a modern system often requires an SMI handler for correct thermal control
 + * and remote management) they can wreak havoc upon any OS-level performance
 + * guarantees toward low-latency, especially when the OS is not even made
 + * aware of the presence of these interrupts. For this reason, we need a
 + * somewhat brute force mechanism to detect these interrupts. In this case,
 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
 + * sampling the built-in CPU timer, looking for discontiguous readings.
 + *
 + * WARNING: This implementation necessarily introduces latencies. Therefore,
 + *          you should NEVER use this module in a production environment
 + *          requiring any kind of low-latency performance guarantee(s).
 + *
 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
 + *
 + * Includes useful feedback from Clark Williams <clark@redhat.com>
 + *
 + * This file is licensed under the terms of the GNU General Public
 + * License version 2. This program is licensed "as is" without any
 + * warranty of any kind, whether express or implied.
 + */
 +
 +#include <linux/module.h>
 +#include <linux/init.h>
 +#include <linux/ring_buffer.h>
 +#include <linux/stop_machine.h>
 +#include <linux/time.h>
 +#include <linux/hrtimer.h>
 +#include <linux/kthread.h>
 +#include <linux/debugfs.h>
 +#include <linux/seq_file.h>
 +#include <linux/uaccess.h>
 +#include <linux/version.h>
 +#include <linux/delay.h>
 +#include <linux/slab.h>
 +
 +#define BUF_SIZE_DEFAULT	262144UL		/* 8K*(sizeof(entry)) */
 +#define BUF_FLAGS		(RB_FL_OVERWRITE)	/* no block on full */
 +#define U64STR_SIZE		22			/* 20 digits max */
 +
 +#define VERSION			"1.0.0"
 +#define BANNER			"hwlat_detector: "
 +#define DRVNAME			"hwlat_detector"
 +#define DEFAULT_SAMPLE_WINDOW	1000000			/* 1s */
 +#define DEFAULT_SAMPLE_WIDTH	500000			/* 0.5s */
 +#define DEFAULT_LAT_THRESHOLD	10			/* 10us */
 +
 +/* Module metadata */
 +
 +MODULE_LICENSE("GPL");
 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
 +MODULE_DESCRIPTION("A simple hardware latency detector");
 +MODULE_VERSION(VERSION);
 +
 +/* Module parameters */
 +
 +static int debug;
 +static int enabled;
 +static int threshold;
 +
 +module_param(debug, int, 0);			/* enable debug */
 +module_param(enabled, int, 0);			/* enable detector */
 +module_param(threshold, int, 0);		/* latency threshold */
 +
 +/* Buffering and sampling */
 +
 +static struct ring_buffer *ring_buffer;		/* sample buffer */
 +static DEFINE_MUTEX(ring_buffer_mutex);		/* lock changes */
 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
 +static struct task_struct *kthread;		/* sampling thread */
 +
 +/* DebugFS filesystem entries */
 +
 +static struct dentry *debug_dir;		/* debugfs directory */
 +static struct dentry *debug_max;		/* maximum TSC delta */
 +static struct dentry *debug_count;		/* total detect count */
 +static struct dentry *debug_sample_width;	/* sample width us */
 +static struct dentry *debug_sample_window;	/* sample window us */
 +static struct dentry *debug_sample;		/* raw samples us */
 +static struct dentry *debug_threshold;		/* threshold us */
 +static struct dentry *debug_enable;		/* enable/disable */
 +
 +/* Individual samples and global state */
 +
 +struct sample;					/* latency sample */
 +struct data;					/* Global state */
 +
 +/* Sampling functions */
 +static int __buffer_add_sample(struct sample *sample);
 +static struct sample *buffer_get_sample(struct sample *sample);
 +static int get_sample(void *unused);
 +
 +/* Threading and state */
 +static int kthread_fn(void *unused);
 +static int start_kthread(void);
 +static int stop_kthread(void);
 +static void __reset_stats(void);
 +static int init_stats(void);
 +
 +/* Debugfs interface */
 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
 +				size_t cnt, loff_t *ppos, const u64 *entry);
 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
 +				 size_t cnt, loff_t *ppos, u64 *entry);
 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
 +				  size_t cnt, loff_t *ppos);
 +static int debug_sample_release(struct inode *inode, struct file *filp);
 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
 +				  size_t cnt, loff_t *ppos);
 +static ssize_t debug_enable_fwrite(struct file *file,
 +				   const char __user *user_buffer,
 +				   size_t user_size, loff_t *offset);
 +
 +/* Initialization functions */
 +static int init_debugfs(void);
 +static void free_debugfs(void);
 +static int detector_init(void);
 +static void detector_exit(void);
 +
 +/* Individual latency samples are stored here when detected and packed into
 + * the ring_buffer circular buffer, where they are overwritten when
 + * more than buf_size/sizeof(sample) samples are received. */
 +struct sample {
 +	u64		seqnum;		/* unique sequence */
 +	u64		duration;	/* ktime delta */
 +	struct timespec	timestamp;	/* wall time */
 +	unsigned long   lost;
 +};
 +
 +/* keep the global state somewhere. Mostly used under stop_machine. */
 +static struct data {
 +
 +	struct mutex lock;		/* protect changes */
 +
 +	u64	count;			/* total since reset */
 +	u64	max_sample;		/* max hardware latency */
 +	u64	threshold;		/* sample threshold level */
 +
 +	u64	sample_window;		/* total sampling window (on+off) */
 +	u64	sample_width;		/* active sampling portion of window */
 +
 +	atomic_t sample_open;		/* whether the sample file is open */
 +
 +	wait_queue_head_t wq;		/* waitqeue for new sample values */
 +
 +} data;
 +
 +/**
 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
 + * @sample: The new latency sample value
 + *
 + * This receives a new latency sample and records it in a global ring buffer.
 + * No additional locking is used in this case - suited for stop_machine use.
 + */
 +static int __buffer_add_sample(struct sample *sample)
 +{
 +	return ring_buffer_write(ring_buffer,
 +				 sizeof(struct sample), sample);
 +}
 +
 +/**
 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
 + * @sample: Pre-allocated storage for the sample
 + *
 + * This retrieves a hardware latency sample from the global circular buffer
 + */
 +static struct sample *buffer_get_sample(struct sample *sample)
 +{
 +	struct ring_buffer_event *e = NULL;
 +	struct sample *s = NULL;
 +	unsigned int cpu = 0;
 +
 +	if (!sample)
 +		return NULL;
 +
 +	mutex_lock(&ring_buffer_mutex);
 +	for_each_online_cpu(cpu) {
 +		e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
 +		if (e)
 +			break;
 +	}
 +
 +	if (e) {
 +		s = ring_buffer_event_data(e);
 +		memcpy(sample, s, sizeof(struct sample));
 +	} else
 +		sample = NULL;
 +	mutex_unlock(&ring_buffer_mutex);
 +
 +	return sample;
 +}
 +
 +/**
 + * get_sample - sample the CPU TSC and look for likely hardware latencies
 + * @unused: This is not used but is a part of the stop_machine API
 + *
 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
 + * hardware-induced latency. Called under stop_machine, with data.lock held.
 + */
 +static int get_sample(void *unused)
 +{
 +	ktime_t start, t1, t2;
 +	s64 diff, total = 0;
 +	u64 sample = 0;
 +	int ret = 1;
 +
 +	start = ktime_get(); /* start timestamp */
 +
 +	do {
 +
 +		t1 = ktime_get();	/* we'll look for a discontinuity */
 +		t2 = ktime_get();
 +
 +		total = ktime_to_us(ktime_sub(t2, start)); /* sample width */
 +		diff = ktime_to_us(ktime_sub(t2, t1));     /* current diff */
 +
 +		/* This shouldn't happen */
 +		if (diff < 0) {
 +			pr_err(BANNER "time running backwards\n");
 +			goto out;
 +		}
 +
 +		if (diff > sample)
 +			sample = diff; /* only want highest value */
 +
 +	} while (total <= data.sample_width);
 +
 +	/* If we exceed the threshold value, we have found a hardware latency */
 +	if (sample > data.threshold) {
 +		struct sample s;
 +
 +		data.count++;
 +		s.seqnum = data.count;
 +		s.duration = sample;
 +		s.timestamp = CURRENT_TIME;
 +		__buffer_add_sample(&s);
 +
 +		/* Keep a running maximum ever recorded hardware latency */
 +		if (sample > data.max_sample)
 +			data.max_sample = sample;
 +	}
 +
 +	ret = 0;
 +out:
 +	return ret;
 +}
 +
 +/*
 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
 + * @unused: A required part of the kthread API.
 + *
 + * Used to periodically sample the CPU TSC via a call to get_sample. We
 + * use stop_machine, whith does (intentionally) introduce latency since we
 + * need to ensure nothing else might be running (and thus pre-empting).
 + * Obviously this should never be used in production environments.
 + *
 + * stop_machine will schedule us typically only on CPU0 which is fine for
 + * almost every real-world hardware latency situation - but we might later
 + * generalize this if we find there are any actualy systems with alternate
 + * SMI delivery or other non CPU0 hardware latencies.
 + */
 +static int kthread_fn(void *unused)
 +{
 +	int err = 0;
 +	u64 interval = 0;
 +
 +	while (!kthread_should_stop()) {
 +
 +		mutex_lock(&data.lock);
 +
 +		err = stop_machine(get_sample, unused, 0);
 +		if (err) {
 +			/* Houston, we have a problem */
 +			mutex_unlock(&data.lock);
 +			goto err_out;
 +		}
 +
 +		wake_up(&data.wq); /* wake up reader(s) */
 +
 +		interval = data.sample_window - data.sample_width;
 +		do_div(interval, USEC_PER_MSEC); /* modifies interval value */
 +
 +		mutex_unlock(&data.lock);
 +
 +		if (msleep_interruptible(interval))
 +			goto out;
 +	}
 +		goto out;
 +err_out:
 +	pr_err(BANNER "could not call stop_machine, disabling\n");
 +	enabled = 0;
 +out:
 +	return err;
 +
 +}
 +
 +/**
 + * start_kthread - Kick off the hardware latency sampling/detector kthread
 + *
 + * This starts a kernel thread that will sit and sample the CPU timestamp
 + * counter (TSC or similar) and look for potential hardware latencies.
 + */
 +static int start_kthread(void)
 +{
 +	kthread = kthread_run(kthread_fn, NULL,
 +					DRVNAME);
 +	if (IS_ERR(kthread)) {
 +		pr_err(BANNER "could not start sampling thread\n");
 +		enabled = 0;
 +		return -ENOMEM;
 +	}
 +
 +	return 0;
 +}
 +
 +/**
 + * stop_kthread - Inform the hardware latency samping/detector kthread to stop
 + *
 + * This kicks the running hardware latency sampling/detector kernel thread and
 + * tells it to stop sampling now. Use this on unload and at system shutdown.
 + */
 +static int stop_kthread(void)
 +{
 +	int ret;
 +
 +	ret = kthread_stop(kthread);
 +
 +	return ret;
 +}
 +
 +/**
 + * __reset_stats - Reset statistics for the hardware latency detector
 + *
 + * We use data to store various statistics and global state. We call this
 + * function in order to reset those when "enable" is toggled on or off, and
 + * also at initialization. Should be called with data.lock held.
 + */
 +static void __reset_stats(void)
 +{
 +	data.count = 0;
 +	data.max_sample = 0;
 +	ring_buffer_reset(ring_buffer); /* flush out old sample entries */
 +}
 +
 +/**
 + * init_stats - Setup global state statistics for the hardware latency detector
 + *
 + * We use data to store various statistics and global state. We also use
 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
 + * induced system latencies. This function initializes these structures and
 + * allocates the global ring buffer also.
 + */
 +static int init_stats(void)
 +{
 +	int ret = -ENOMEM;
 +
 +	mutex_init(&data.lock);
 +	init_waitqueue_head(&data.wq);
 +	atomic_set(&data.sample_open, 0);
 +
 +	ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
 +
 +	if (WARN(!ring_buffer, KERN_ERR BANNER
 +			       "failed to allocate ring buffer!\n"))
 +		goto out;
 +
 +	__reset_stats();
 +	data.threshold = DEFAULT_LAT_THRESHOLD;	    /* threshold us */
 +	data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
 +	data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
 +
 +	ret = 0;
 +
 +out:
 +	return ret;
 +
 +}
 +
 +/*
 + * simple_data_read - Wrapper read function for global state debugfs entries
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + * @entry: The entry to read from
 + *
 + * This function provides a generic read implementation for the global state
 + * "data" structure debugfs filesystem entries. It would be nice to use
 + * simple_attr_read directly, but we need to make sure that the data.lock
 + * spinlock is held during the actual read (even though we likely won't ever
 + * actually race here as the updater runs under a stop_machine context).
 + */
 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
 +				size_t cnt, loff_t *ppos, const u64 *entry)
 +{
 +	char buf[U64STR_SIZE];
 +	u64 val = 0;
 +	int len = 0;
 +
 +	memset(buf, 0, sizeof(buf));
 +
 +	if (!entry)
 +		return -EFAULT;
 +
 +	mutex_lock(&data.lock);
 +	val = *entry;
 +	mutex_unlock(&data.lock);
 +
 +	len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
 +
 +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
 +
 +}
 +
 +/*
 + * simple_data_write - Wrapper write function for global state debugfs entries
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to write value from
 + * @cnt: The maximum number of bytes to write
 + * @ppos: The current "file" position
 + * @entry: The entry to write to
 + *
 + * This function provides a generic write implementation for the global state
 + * "data" structure debugfs filesystem entries. It would be nice to use
 + * simple_attr_write directly, but we need to make sure that the data.lock
 + * spinlock is held during the actual write (even though we likely won't ever
 + * actually race here as the updater runs under a stop_machine context).
 + */
 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
 +				 size_t cnt, loff_t *ppos, u64 *entry)
 +{
 +	char buf[U64STR_SIZE];
 +	int csize = min(cnt, sizeof(buf));
 +	u64 val = 0;
 +	int err = 0;
 +
 +	memset(buf, '\0', sizeof(buf));
 +	if (copy_from_user(buf, ubuf, csize))
 +		return -EFAULT;
 +
 +	buf[U64STR_SIZE-1] = '\0';			/* just in case */
 +	err = kstrtoull(buf, 10, &val);
 +	if (err)
 +		return -EINVAL;
 +
 +	mutex_lock(&data.lock);
 +	*entry = val;
 +	mutex_unlock(&data.lock);
 +
 +	return csize;
 +}
 +
 +/**
 + * debug_count_fopen - Open function for "count" debugfs entry
 + * @inode: The in-kernel inode representation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function provides an open implementation for the "count" debugfs
 + * interface to the hardware latency detector.
 + */
 +static int debug_count_fopen(struct inode *inode, struct file *filp)
 +{
 +	return 0;
 +}
 +
 +/**
 + * debug_count_fread - Read function for "count" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + *
 + * This function provides a read implementation for the "count" debugfs
 + * interface to the hardware latency detector. Can be used to read the
 + * number of latency readings exceeding the configured threshold since
 + * the detector was last reset (e.g. by writing a zero into "count").
 + */
 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
 +				     size_t cnt, loff_t *ppos)
 +{
 +	return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
 +}
 +
 +/**
 + * debug_count_fwrite - Write function for "count" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that contains the value to write
 + * @cnt: The maximum number of bytes to write to "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function provides a write implementation for the "count" debugfs
 + * interface to the hardware latency detector. Can be used to write a
 + * desired value, especially to zero the total count.
 + */
 +static ssize_t  debug_count_fwrite(struct file *filp,
 +				       const char __user *ubuf,
 +				       size_t cnt,
 +				       loff_t *ppos)
 +{
 +	return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
 +}
 +
 +/**
 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
 + * @inode: The in-kernel inode representation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function provides an open implementation for the "enable" debugfs
 + * interface to the hardware latency detector.
 + */
 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
 +{
 +	return 0;
 +}
 +
 +/**
 + * debug_enable_fread - Read function for "enable" debugfs interface
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + *
 + * This function provides a read implementation for the "enable" debugfs
 + * interface to the hardware latency detector. Can be used to determine
 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
 + */
 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
 +				      size_t cnt, loff_t *ppos)
 +{
 +	char buf[4];
 +
 +	if ((cnt < sizeof(buf)) || (*ppos))
 +		return 0;
 +
 +	buf[0] = enabled ? '1' : '0';
 +	buf[1] = '\n';
 +	buf[2] = '\0';
 +	if (copy_to_user(ubuf, buf, strlen(buf)))
 +		return -EFAULT;
 +	return *ppos = strlen(buf);
 +}
 +
 +/**
 + * debug_enable_fwrite - Write function for "enable" debugfs interface
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that contains the value to write
 + * @cnt: The maximum number of bytes to write to "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function provides a write implementation for the "enable" debugfs
 + * interface to the hardware latency detector. Can be used to enable or
 + * disable the detector, which will have the side-effect of possibly
 + * also resetting the global stats and kicking off the measuring
 + * kthread (on an enable) or the converse (upon a disable).
 + */
 +static ssize_t  debug_enable_fwrite(struct file *filp,
 +					const char __user *ubuf,
 +					size_t cnt,
 +					loff_t *ppos)
 +{
 +	char buf[4];
 +	int csize = min(cnt, sizeof(buf));
 +	long val = 0;
 +	int err = 0;
 +
 +	memset(buf, '\0', sizeof(buf));
 +	if (copy_from_user(buf, ubuf, csize))
 +		return -EFAULT;
 +
 +	buf[sizeof(buf)-1] = '\0';			/* just in case */
 +	err = kstrtoul(buf, 10, &val);
 +	if (err)
 +		return -EINVAL;
 +
 +	if (val) {
 +		if (enabled)
 +			goto unlock;
 +		enabled = 1;
 +		__reset_stats();
 +		if (start_kthread())
 +			return -EFAULT;
 +	} else {
 +		if (!enabled)
 +			goto unlock;
 +		enabled = 0;
 +		err = stop_kthread();
 +		if (err) {
 +			pr_err(BANNER "cannot stop kthread\n");
 +			return -EFAULT;
 +		}
 +		wake_up(&data.wq);		/* reader(s) should return */
 +	}
 +unlock:
 +	return csize;
 +}
 +
 +/**
 + * debug_max_fopen - Open function for "max" debugfs entry
 + * @inode: The in-kernel inode representation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function provides an open implementation for the "max" debugfs
 + * interface to the hardware latency detector.
 + */
 +static int debug_max_fopen(struct inode *inode, struct file *filp)
 +{
 +	return 0;
 +}
 +
 +/**
 + * debug_max_fread - Read function for "max" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + *
 + * This function provides a read implementation for the "max" debugfs
 + * interface to the hardware latency detector. Can be used to determine
 + * the maximum latency value observed since it was last reset.
 + */
 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
 +				   size_t cnt, loff_t *ppos)
 +{
 +	return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
 +}
 +
 +/**
 + * debug_max_fwrite - Write function for "max" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that contains the value to write
 + * @cnt: The maximum number of bytes to write to "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function provides a write implementation for the "max" debugfs
 + * interface to the hardware latency detector. Can be used to reset the
 + * maximum or set it to some other desired value - if, then, subsequent
 + * measurements exceed this value, the maximum will be updated.
 + */
 +static ssize_t  debug_max_fwrite(struct file *filp,
 +				     const char __user *ubuf,
 +				     size_t cnt,
 +				     loff_t *ppos)
 +{
 +	return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
 +}
 +
 +
 +/**
 + * debug_sample_fopen - An open function for "sample" debugfs interface
 + * @inode: The in-kernel inode representation of this debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function handles opening the "sample" file within the hardware
 + * latency detector debugfs directory interface. This file is used to read
 + * raw samples from the global ring_buffer and allows the user to see a
 + * running latency history. Can be opened blocking or non-blocking,
 + * affecting whether it behaves as a buffer read pipe, or does not.
 + * Implements simple locking to prevent multiple simultaneous use.
 + */
 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
 +{
 +	if (!atomic_add_unless(&data.sample_open, 1, 1))
 +		return -EBUSY;
 +	else
 +		return 0;
 +}
 +
 +/**
 + * debug_sample_fread - A read function for "sample" debugfs interface
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that will contain the samples read
 + * @cnt: The maximum bytes to read from the debugfs "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function handles reading from the "sample" file within the hardware
 + * latency detector debugfs directory interface. This file is used to read
 + * raw samples from the global ring_buffer and allows the user to see a
 + * running latency history. By default this will block pending a new
 + * value written into the sample buffer, unless there are already a
 + * number of value(s) waiting in the buffer, or the sample file was
 + * previously opened in a non-blocking mode of operation.
 + */
 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
 +					size_t cnt, loff_t *ppos)
 +{
 +	int len = 0;
 +	char buf[64];
 +	struct sample *sample = NULL;
 +
 +	if (!enabled)
 +		return 0;
 +
 +	sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
 +	if (!sample)
 +		return -ENOMEM;
 +
 +	while (!buffer_get_sample(sample)) {
 +
 +		DEFINE_WAIT(wait);
 +
 +		if (filp->f_flags & O_NONBLOCK) {
 +			len = -EAGAIN;
 +			goto out;
 +		}
 +
 +		prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
 +		schedule();
 +		finish_wait(&data.wq, &wait);
 +
 +		if (signal_pending(current)) {
 +			len = -EINTR;
 +			goto out;
 +		}
 +
 +		if (!enabled) {			/* enable was toggled */
 +			len = 0;
 +			goto out;
 +		}
 +	}
 +
 +	len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n",
 +		      sample->timestamp.tv_sec,
 +		      sample->timestamp.tv_nsec,
 +		      sample->duration);
 +
 +
 +	/* handling partial reads is more trouble than it's worth */
 +	if (len > cnt)
 +		goto out;
 +
 +	if (copy_to_user(ubuf, buf, len))
 +		len = -EFAULT;
 +
 +out:
 +	kfree(sample);
 +	return len;
 +}
 +
 +/**
 + * debug_sample_release - Release function for "sample" debugfs interface
 + * @inode: The in-kernel inode represenation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function completes the close of the debugfs interface "sample" file.
 + * Frees the sample_open "lock" so that other users may open the interface.
 + */
 +static int debug_sample_release(struct inode *inode, struct file *filp)
 +{
 +	atomic_dec(&data.sample_open);
 +
 +	return 0;
 +}
 +
 +/**
 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
 + * @inode: The in-kernel inode representation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function provides an open implementation for the "threshold" debugfs
 + * interface to the hardware latency detector.
 + */
 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
 +{
 +	return 0;
 +}
 +
 +/**
 + * debug_threshold_fread - Read function for "threshold" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + *
 + * This function provides a read implementation for the "threshold" debugfs
 + * interface to the hardware latency detector. It can be used to determine
 + * the current threshold level at which a latency will be recorded in the
 + * global ring buffer, typically on the order of 10us.
 + */
 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
 +					 size_t cnt, loff_t *ppos)
 +{
 +	return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
 +}
 +
 +/**
 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that contains the value to write
 + * @cnt: The maximum number of bytes to write to "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function provides a write implementation for the "threshold" debugfs
 + * interface to the hardware latency detector. It can be used to configure
 + * the threshold level at which any subsequently detected latencies will
 + * be recorded into the global ring buffer.
 + */
 +static ssize_t  debug_threshold_fwrite(struct file *filp,
 +					const char __user *ubuf,
 +					size_t cnt,
 +					loff_t *ppos)
 +{
 +	int ret;
 +
 +	ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
 +
 +	if (enabled)
 +		wake_up_process(kthread);
 +
 +	return ret;
 +}
 +
 +/**
 + * debug_width_fopen - Open function for "width" debugfs entry
 + * @inode: The in-kernel inode representation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function provides an open implementation for the "width" debugfs
 + * interface to the hardware latency detector.
 + */
 +static int debug_width_fopen(struct inode *inode, struct file *filp)
 +{
 +	return 0;
 +}
 +
 +/**
 + * debug_width_fread - Read function for "width" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + *
 + * This function provides a read implementation for the "width" debugfs
 + * interface to the hardware latency detector. It can be used to determine
 + * for how many us of the total window us we will actively sample for any
 + * hardware-induced latecy periods. Obviously, it is not possible to
 + * sample constantly and have the system respond to a sample reader, or,
 + * worse, without having the system appear to have gone out to lunch.
 + */
 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
 +				     size_t cnt, loff_t *ppos)
 +{
 +	return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
 +}
 +
 +/**
 + * debug_width_fwrite - Write function for "width" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that contains the value to write
 + * @cnt: The maximum number of bytes to write to "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function provides a write implementation for the "width" debugfs
 + * interface to the hardware latency detector. It can be used to configure
 + * for how many us of the total window us we will actively sample for any
 + * hardware-induced latency periods. Obviously, it is not possible to
 + * sample constantly and have the system respond to a sample reader, or,
 + * worse, without having the system appear to have gone out to lunch. It
 + * is enforced that width is less that the total window size.
 + */
 +static ssize_t  debug_width_fwrite(struct file *filp,
 +				       const char __user *ubuf,
 +				       size_t cnt,
 +				       loff_t *ppos)
 +{
 +	char buf[U64STR_SIZE];
 +	int csize = min(cnt, sizeof(buf));
 +	u64 val = 0;
 +	int err = 0;
 +
 +	memset(buf, '\0', sizeof(buf));
 +	if (copy_from_user(buf, ubuf, csize))
 +		return -EFAULT;
 +
 +	buf[U64STR_SIZE-1] = '\0';			/* just in case */
 +	err = kstrtoull(buf, 10, &val);
 +	if (err)
 +		return -EINVAL;
 +
 +	mutex_lock(&data.lock);
 +	if (val < data.sample_window)
 +		data.sample_width = val;
 +	else {
 +		mutex_unlock(&data.lock);
 +		return -EINVAL;
 +	}
 +	mutex_unlock(&data.lock);
 +
 +	if (enabled)
 +		wake_up_process(kthread);
 +
 +	return csize;
 +}
 +
 +/**
 + * debug_window_fopen - Open function for "window" debugfs entry
 + * @inode: The in-kernel inode representation of the debugfs "file"
 + * @filp: The active open file structure for the debugfs "file"
 + *
 + * This function provides an open implementation for the "window" debugfs
 + * interface to the hardware latency detector. The window is the total time
 + * in us that will be considered one sample period. Conceptually, windows
 + * occur back-to-back and contain a sample width period during which
 + * actual sampling occurs.
 + */
 +static int debug_window_fopen(struct inode *inode, struct file *filp)
 +{
 +	return 0;
 +}
 +
 +/**
 + * debug_window_fread - Read function for "window" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The userspace provided buffer to read value into
 + * @cnt: The maximum number of bytes to read
 + * @ppos: The current "file" position
 + *
 + * This function provides a read implementation for the "window" debugfs
 + * interface to the hardware latency detector. The window is the total time
 + * in us that will be considered one sample period. Conceptually, windows
 + * occur back-to-back and contain a sample width period during which
 + * actual sampling occurs. Can be used to read the total window size.
 + */
 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
 +				      size_t cnt, loff_t *ppos)
 +{
 +	return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
 +}
 +
 +/**
 + * debug_window_fwrite - Write function for "window" debugfs entry
 + * @filp: The active open file structure for the debugfs "file"
 + * @ubuf: The user buffer that contains the value to write
 + * @cnt: The maximum number of bytes to write to "file"
 + * @ppos: The current position in the debugfs "file"
 + *
 + * This function provides a write implementation for the "window" debufds
 + * interface to the hardware latency detetector. The window is the total time
 + * in us that will be considered one sample period. Conceptually, windows
 + * occur back-to-back and contain a sample width period during which
 + * actual sampling occurs. Can be used to write a new total window size. It
 + * is enfoced that any value written must be greater than the sample width
 + * size, or an error results.
 + */
 +static ssize_t  debug_window_fwrite(struct file *filp,
 +					const char __user *ubuf,
 +					size_t cnt,
 +					loff_t *ppos)
 +{
 +	char buf[U64STR_SIZE];
 +	int csize = min(cnt, sizeof(buf));
 +	u64 val = 0;
 +	int err = 0;
 +
 +	memset(buf, '\0', sizeof(buf));
 +	if (copy_from_user(buf, ubuf, csize))
 +		return -EFAULT;
 +
 +	buf[U64STR_SIZE-1] = '\0';			/* just in case */
 +	err = kstrtoull(buf, 10, &val);
 +	if (err)
 +		return -EINVAL;
 +
 +	mutex_lock(&data.lock);
 +	if (data.sample_width < val)
 +		data.sample_window = val;
 +	else {
 +		mutex_unlock(&data.lock);
 +		return -EINVAL;
 +	}
 +	mutex_unlock(&data.lock);
 +
 +	return csize;
 +}
 +
 +/*
 + * Function pointers for the "count" debugfs file operations
 + */
 +static const struct file_operations count_fops = {
 +	.open		= debug_count_fopen,
 +	.read		= debug_count_fread,
 +	.write		= debug_count_fwrite,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/*
 + * Function pointers for the "enable" debugfs file operations
 + */
 +static const struct file_operations enable_fops = {
 +	.open		= debug_enable_fopen,
 +	.read		= debug_enable_fread,
 +	.write		= debug_enable_fwrite,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/*
 + * Function pointers for the "max" debugfs file operations
 + */
 +static const struct file_operations max_fops = {
 +	.open		= debug_max_fopen,
 +	.read		= debug_max_fread,
 +	.write		= debug_max_fwrite,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/*
 + * Function pointers for the "sample" debugfs file operations
 + */
 +static const struct file_operations sample_fops = {
 +	.open		= debug_sample_fopen,
 +	.read		= debug_sample_fread,
 +	.release	= debug_sample_release,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/*
 + * Function pointers for the "threshold" debugfs file operations
 + */
 +static const struct file_operations threshold_fops = {
 +	.open		= debug_threshold_fopen,
 +	.read		= debug_threshold_fread,
 +	.write		= debug_threshold_fwrite,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/*
 + * Function pointers for the "width" debugfs file operations
 + */
 +static const struct file_operations width_fops = {
 +	.open		= debug_width_fopen,
 +	.read		= debug_width_fread,
 +	.write		= debug_width_fwrite,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/*
 + * Function pointers for the "window" debugfs file operations
 + */
 +static const struct file_operations window_fops = {
 +	.open		= debug_window_fopen,
 +	.read		= debug_window_fread,
 +	.write		= debug_window_fwrite,
 +	.owner		= THIS_MODULE,
 +};
 +
 +/**
 + * init_debugfs - A function to initialize the debugfs interface files
 + *
 + * This function creates entries in debugfs for "hwlat_detector", including
 + * files to read values from the detector, current samples, and the
 + * maximum sample that has been captured since the hardware latency
 + * dectector was started.
 + */
 +static int init_debugfs(void)
 +{
 +	int ret = -ENOMEM;
 +
 +	debug_dir = debugfs_create_dir(DRVNAME, NULL);
 +	if (!debug_dir)
 +		goto err_debug_dir;
 +
 +	debug_sample = debugfs_create_file("sample", 0444,
 +					       debug_dir, NULL,
 +					       &sample_fops);
 +	if (!debug_sample)
 +		goto err_sample;
 +
 +	debug_count = debugfs_create_file("count", 0444,
 +					      debug_dir, NULL,
 +					      &count_fops);
 +	if (!debug_count)
 +		goto err_count;
 +
 +	debug_max = debugfs_create_file("max", 0444,
 +					    debug_dir, NULL,
 +					    &max_fops);
 +	if (!debug_max)
 +		goto err_max;
 +
 +	debug_sample_window = debugfs_create_file("window", 0644,
 +						      debug_dir, NULL,
 +						      &window_fops);
 +	if (!debug_sample_window)
 +		goto err_window;
 +
 +	debug_sample_width = debugfs_create_file("width", 0644,
 +						     debug_dir, NULL,
 +						     &width_fops);
 +	if (!debug_sample_width)
 +		goto err_width;
 +
 +	debug_threshold = debugfs_create_file("threshold", 0644,
 +						  debug_dir, NULL,
 +						  &threshold_fops);
 +	if (!debug_threshold)
 +		goto err_threshold;
 +
 +	debug_enable = debugfs_create_file("enable", 0644,
 +					       debug_dir, &enabled,
 +					       &enable_fops);
 +	if (!debug_enable)
 +		goto err_enable;
 +
 +	else {
 +		ret = 0;
 +		goto out;
 +	}
 +
 +err_enable:
 +	debugfs_remove(debug_threshold);
 +err_threshold:
 +	debugfs_remove(debug_sample_width);
 +err_width:
 +	debugfs_remove(debug_sample_window);
 +err_window:
 +	debugfs_remove(debug_max);
 +err_max:
 +	debugfs_remove(debug_count);
 +err_count:
 +	debugfs_remove(debug_sample);
 +err_sample:
 +	debugfs_remove(debug_dir);
 +err_debug_dir:
 +out:
 +	return ret;
 +}
 +
 +/**
 + * free_debugfs - A function to cleanup the debugfs file interface
 + */
 +static void free_debugfs(void)
 +{
 +	/* could also use a debugfs_remove_recursive */
 +	debugfs_remove(debug_enable);
 +	debugfs_remove(debug_threshold);
 +	debugfs_remove(debug_sample_width);
 +	debugfs_remove(debug_sample_window);
 +	debugfs_remove(debug_max);
 +	debugfs_remove(debug_count);
 +	debugfs_remove(debug_sample);
 +	debugfs_remove(debug_dir);
 +}
 +
 +/**
 + * detector_init - Standard module initialization code
 + */
 +static int detector_init(void)
 +{
 +	int ret = -ENOMEM;
 +
 +	pr_info(BANNER "version %s\n", VERSION);
 +
 +	ret = init_stats();
 +	if (ret)
 +		goto out;
 +
 +	ret = init_debugfs();
 +	if (ret)
 +		goto err_stats;
 +
 +	if (enabled)
 +		ret = start_kthread();
 +
 +	goto out;
 +
 +err_stats:
 +	ring_buffer_free(ring_buffer);
 +out:
 +	return ret;
 +
 +}
 +
 +/**
 + * detector_exit - Standard module cleanup code
 + */
 +static void detector_exit(void)
 +{
 +	int err;
 +
 +	if (enabled) {
 +		enabled = 0;
 +		err = stop_kthread();
 +		if (err)
 +			pr_err(BANNER "cannot stop kthread\n");
 +	}
 +
 +	free_debugfs();
 +	ring_buffer_free(ring_buffer);	/* free up the ring buffer */
 +
 +}
 +
 +module_init(detector_init);
 +module_exit(detector_exit);