utilities/datablows.sh - pub/scm/linux/kernel/git/paulmck/perfbook - Git at Google

 #!/bin/sh
 #
 # Input a file where each line has an x-value followed by a series of
 # y-values in sorted order.  Output a line containing the x-value, the
 # average, min, and max of the good data, the number of good data values,
 # and the number of original data values.  This script uses a variant of
 # the old Sonatech data-cleaning algorithm, but incorporates the assumption
 # that the smallest data values are good.  Similar algorithms have been
 # used by Dave Mills in NTP and by Larry McVoy in lmbench.
 #
 # This script takes the following arguments:
 #
 #	--divisor:  Reciprocal of the leading fraction of data assumed
 #		to be good, defaults to 3 (for one-third of the data).
 #	--relerr:  Relative error inherent in the data, defaults to 0.01.
 #	--trendbreak:  Multiple of average difference deemed to constitute
 #		a trend break.  Defaults to 10.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, you can access it online at
 # http://www.gnu.org/licenses/gpl-2.0.html.
 #
 # Copyright (C) IBM Corporation, 2012-2019
 # Copyright (C) Facebook, 2019
 #
 # Authors: Paul E. McKenney <paulmck@kernel.org>

 #\begin{snippet}[labelbase=ln:debugging:datablows:whole,commandchars=\!\@\%]
 # Default parameters, each of which can be overridden on the command line.
 div=3				#\lnlbl{param:b}
 rel=0.01
 tre=10				#\lnlbl{param:e}

 # Exit with a diagnostic unless option $1 is followed by a value.
 # $2 is the number of arguments still to be parsed, counting $1 itself.
 need_value () {
 	if test "$2" -lt 2
 	then
 		echo "$0: option $1 requires a value" >&2
 		exit 1
 	fi
 }

 # Parse "--option value" pairs.  Unrecognized arguments are ignored.
 while test $# -gt 0		#\lnlbl{parse:b}
 do
 	case "$1" in
 	--divisor)
 		need_value "$1" $#
 		shift
 		div=$1
 		;;
 	--relerr)
 		need_value "$1" $#
 		shift
 		rel=$1
 		;;
 	--trendbreak)
 		need_value "$1" $#
 		shift
 		tre=$1
 		;;
 	esac
 	shift
 done				#\lnlbl{parse:e}
 # echo divisor: $div relerr: $rel trendbreak: $tre #\fcvexclude

 # Read records from standard input and print one summary line per record
 # that has at least one y-value: x, average, min, max, number of values
 # accepted as good, and number of values supplied.
 awk -v divisor="$div" -v relerr="$rel" -v trendbreak="$tre" '#\lnlbl{awk:invoke}
 # Sort v[1..cnt] into ascending numeric order.  Insertion sort needs only
 # POSIX awk and takes linear time when the values arrive already sorted.
 function nsort(v, cnt,    p, q, cur) {
 	for (p = 2; p <= cnt; p++) {
 		cur = v[p];
 		for (q = p - 1; q >= 1 && v[q] + 0 > cur + 0; q--)
 			v[q + 1] = v[q];
 		v[q + 1] = cur;
 	}
 }

 # Records without any y-value (blank lines included) carry no data, so
 # they are skipped instead of being averaged over a count of zero.
 NF >= 2 {
 	split("", d);		# Forget values left over from the previous record.
 	nd = NF - 1;		# Number of y-values in this record.
 	for (i = 2; i <= NF; i++)		#\lnlbl{awk:copy:b}
 		d[i - 1] = $i;			#\lnlbl{awk:copy:e}
 	nsort(d, nd);				#\lnlbl{awk:asort}
 	# d[1..i] is the leading portion of the data assumed to be good.
 	i = int((NF + divisor - 1) / divisor);	#\lnlbl{awk:comp_i}
 	if (i > nd)		# Happens when divisor is 1 or less.
 		i = nd;
 	# maxdelta is the larger of the spread of the assumed-good portion
 	# scaled up by divisor and that spread plus the relative error.
 	delta = d[i] - d[1];			#\lnlbl{awk:delta}
 	maxdelta = delta * divisor;		#\lnlbl{awk:maxdelta}
 	maxdelta1 = delta + d[i] * relerr;	#\lnlbl{awk:maxdelta1}
 	if (maxdelta1 > maxdelta)		#\lnlbl{awk:comp_max:b}
 		maxdelta = maxdelta1;		#\lnlbl{awk:comp_max:e}
 	# Accept further values until one is both more than maxdelta above
 	# the minimum and more than maxdiff above its predecessor.
 	for (j = i + 1; j <= nd; j++) {		#\lnlbl{awk:add:b}
 		# maxdiff is trendbreak times the average difference between
 		# the values accepted so far.  With fewer than two accepted
 		# values there is nothing to average, so the full range is used.
 		if (j <= 2)			#\lnlbl{awk:chk_engh}
 			maxdiff = d[nd] - d[1];
 		else
 			maxdiff = trendbreak * (d[j - 1] - d[1]) / (j - 2); #\lnlbl{awk:mul_avr}
 # print "i: " i, "j: " j, "maxdelta: " maxdelta, "maxdiff: " maxdiff, "d[j] - d[j - 1]: " d[j] - d[j - 1] #\fcvexclude
 		if (d[j] - d[1] > maxdelta && d[j] - d[j - 1] > maxdiff) #\lnlbl{awk:chk_max}
 			break;			#\lnlbl{awk:break}
 	}					#\lnlbl{awk:add:e}
 	# d[1..j-1] are the accepted values; j is at least 2 here, so n is
 	# never zero.
 	n = sum = 0;				#\lnlbl{awk:comp_stat:b}
 	for (k = 1; k < j; k++) {
 		sum += d[k];
 		n++;
 	}
 	min = d[1];
 	max = d[j - 1];
 	avg = sum / n;
 	print $1, avg, min, max, n, nd;		#\lnlbl{awk:comp_stat:e}
 }'						#\lnlbl{awk:end}
 #\end{snippet}
	#!/bin/sh
	#
	# Input a file where each line has an x-value followed by a series of
	# y-values in sorted order. Output a line containing the x-value, the
	# average, min, and max of the good data, the number of good data values,
	# and the number of original data values. This script uses a variant of
	# the old Sonatech data-cleaning algorithm, but incorporates the assumption
	# that the smallest data values are good. Similar algorithms have been
	# used by Dave Mills in NTP and by Larry McVoy in lmbench.
	#
	# This script takes the following arguments:
	#
	# --divisor: Reciprocal of the leading fraction of data assumed
	# to be good, defaults to 3 (for one-third of the data).
	# --relerr: Relative error inherent in the data, defaults to 0.01.
	# --trendbreak: Multiple of average difference deemed to constitute
	# a trend break. Defaults to 10.
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program; if not, you can access it online at
	# http://www.gnu.org/licenses/gpl-2.0.html.
	#
	# Copyright (C) IBM Corporation, 2012-2019
	# Copyright (C) Facebook, 2019
	#
	# Authors: Paul E. McKenney <paulmck@kernel.org>

	#\begin{snippet}[labelbase=ln:debugging:datablows:whole,commandchars=\!\@\%]
	# Default parameters, each of which can be overridden on the command line.
	div=3 #\lnlbl{param:b}
	rel=0.01
	tre=10 #\lnlbl{param:e}

	# Exit with a diagnostic unless option $1 is followed by a value.
	# $2 is the number of arguments still to be parsed, counting $1 itself.
	need_value () {
		if test "$2" -lt 2
		then
			echo "$0: option $1 requires a value" >&2
			exit 1
		fi
	}

	# Parse "--option value" pairs. Unrecognized arguments are ignored.
	while test $# -gt 0 #\lnlbl{parse:b}
	do
		case "$1" in
		--divisor)
			need_value "$1" $#
			shift
			div=$1
			;;
		--relerr)
			need_value "$1" $#
			shift
			rel=$1
			;;
		--trendbreak)
			need_value "$1" $#
			shift
			tre=$1
			;;
		esac
		shift
	done #\lnlbl{parse:e}
	# echo divisor: $div relerr: $rel trendbreak: $tre #\fcvexclude

	# Read records from standard input and print one summary line per record
	# that has at least one y-value: x, average, min, max, number of values
	# accepted as good, and number of values supplied.
	awk -v divisor="$div" -v relerr="$rel" -v trendbreak="$tre" '#\lnlbl{awk:invoke}
	# Sort v[1..cnt] into ascending numeric order. Insertion sort needs only
	# POSIX awk and takes linear time when the values arrive already sorted.
	function nsort(v, cnt,    p, q, cur) {
		for (p = 2; p <= cnt; p++) {
			cur = v[p];
			for (q = p - 1; q >= 1 && v[q] + 0 > cur + 0; q--)
				v[q + 1] = v[q];
			v[q + 1] = cur;
		}
	}

	# Records without any y-value (blank lines included) carry no data, so
	# they are skipped instead of being averaged over a count of zero.
	NF >= 2 {
		split("", d); # Forget values left over from the previous record.
		nd = NF - 1; # Number of y-values in this record.
		for (i = 2; i <= NF; i++) #\lnlbl{awk:copy:b}
			d[i - 1] = $i; #\lnlbl{awk:copy:e}
		nsort(d, nd); #\lnlbl{awk:asort}
		# d[1..i] is the leading portion of the data assumed to be good.
		i = int((NF + divisor - 1) / divisor); #\lnlbl{awk:comp_i}
		if (i > nd) # Happens when divisor is 1 or less.
			i = nd;
		# maxdelta is the larger of the spread of the assumed-good portion
		# scaled up by divisor and that spread plus the relative error.
		delta = d[i] - d[1]; #\lnlbl{awk:delta}
		maxdelta = delta * divisor; #\lnlbl{awk:maxdelta}
		maxdelta1 = delta + d[i] * relerr; #\lnlbl{awk:maxdelta1}
		if (maxdelta1 > maxdelta) #\lnlbl{awk:comp_max:b}
			maxdelta = maxdelta1; #\lnlbl{awk:comp_max:e}
		# Accept further values until one is both more than maxdelta above
		# the minimum and more than maxdiff above its predecessor.
		for (j = i + 1; j <= nd; j++) { #\lnlbl{awk:add:b}
			# maxdiff is trendbreak times the average difference between
			# the values accepted so far. With fewer than two accepted
			# values there is nothing to average, so the full range is used.
			if (j <= 2) #\lnlbl{awk:chk_engh}
				maxdiff = d[nd] - d[1];
			else
				maxdiff = trendbreak * (d[j - 1] - d[1]) / (j - 2); #\lnlbl{awk:mul_avr}
	# print "i: " i, "j: " j, "maxdelta: " maxdelta, "maxdiff: " maxdiff, "d[j] - d[j - 1]: " d[j] - d[j - 1] #\fcvexclude
			if (d[j] - d[1] > maxdelta && d[j] - d[j - 1] > maxdiff) #\lnlbl{awk:chk_max}
				break; #\lnlbl{awk:break}
		} #\lnlbl{awk:add:e}
		# d[1..j-1] are the accepted values; j is at least 2 here, so n is
		# never zero.
		n = sum = 0; #\lnlbl{awk:comp_stat:b}
		for (k = 1; k < j; k++) {
			sum += d[k];
			n++;
		}
		min = d[1];
		max = d[j - 1];
		avg = sum / n;
		print $1, avg, min, max, n, nd; #\lnlbl{awk:comp_stat:e}
	}' #\lnlbl{awk:end}
	#\end{snippet}