blob: 752cab4e93826d18066edd06bfa40ba1335551ec [file] [log] [blame]
#!/bin/sh
#
# Input a file where each line has an x-value followed by a series of
# y-values (in any order; the script sorts them itself). For each input
# line, output a line containing the x-value, the
# average, min, and max of the good data, the number of good data values,
# and the number of original data values. This script uses a variant of
# the old Sonatech data-cleaning algorithm, but incorporates the assumption
# that the smallest data values are good. Similar algorithms have been
# used by Dave Mills in NTP and by Larry McVoy in lmbench.
#
# This script takes the following arguments:
#
# --divisor: Reciprocal of the leading fraction of data assumed
# to be good, defaults to 3 (for one-third of the data).
# --relerr: Relative error inherent in the data, defaults to 0.01.
# --trendbreak: Multiple of average difference deemed to constitute
# a trend break. Defaults to 10.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, you can access it online at
# http://www.gnu.org/licenses/gpl-2.0.html.
#
# Copyright (C) IBM Corporation, 2012-2019
# Copyright (C) Facebook, 2019
#
# Authors: Paul E. McKenney <paulmck@kernel.org>
#\begin{snippet}[labelbase=ln:debugging:datablows:whole,commandchars=\!\@\%]
# Defaults, overridable via --divisor, --relerr and --trendbreak. #\fcvexclude
div=3 #\lnlbl{param:b}
rel=0.01
tre=10 #\lnlbl{param:e}
# Report an option that was given without its value and exit with #\fcvexclude
# status 2. $1 is the name of the offending option. #\fcvexclude
missing_value () {
  printf '%s: option %s requires a value\n' "$0" "$1" >&2
  exit 2
}
# Options come as "--name value" pairs. Arguments that match no known #\fcvexclude
# option are skipped without complaint. Each known option first checks #\fcvexclude
# that a value follows, so the shifts below never run past the end of #\fcvexclude
# the argument list. #\fcvexclude
while test $# -gt 0 #\lnlbl{parse:b}
do
  case "$1" in
  --divisor)
    test $# -ge 2 || missing_value "$1"
    shift
    div=$1
    ;;
  --relerr)
    test $# -ge 2 || missing_value "$1"
    shift
    rel=$1
    ;;
  --trendbreak)
    test $# -ge 2 || missing_value "$1"
    shift
    tre=$1
    ;;
  esac
  shift
done #\lnlbl{parse:e}
# echo divisor: $div relerr: $rel trendbreak: $tre #\fcvexclude
# Per-record outlier filter. For each input line (an x-value followed #\fcvexclude
# by y-values) print: x, average, min, max, number of values kept, and #\fcvexclude
# number of values read. Needs an awk that provides asort(), which is #\fcvexclude
# a gawk extension. #\fcvexclude
awk -v divisor="$div" -v relerr="$rel" -v trendbreak="$tre" '{#\lnlbl{awk:invoke}
  # A record needs at least one y-value. Skipping blank and x-only #\fcvexclude
  # lines avoids dividing by a zero count further down. #\fcvexclude
  if (NF < 2)
    next;
  # d persists across records, so empty it before refilling. Otherwise #\fcvexclude
  # leftovers from a longer earlier record get sorted into this one. #\fcvexclude
  delete d;
  for (i = 2; i <= NF; i++) #\lnlbl{awk:copy:b}
    d[i - 1] = $i; #\lnlbl{awk:copy:e}
  asort(d); #\lnlbl{awk:asort}
  # i is the index of the last value in the leading portion that is #\fcvexclude
  # assumed to be good. #\fcvexclude
  i = int((NF + divisor - 1) / divisor); #\lnlbl{awk:comp_i}
  # Only NF - 1 values exist. A divisor of 1 would otherwise index one #\fcvexclude
  # past the data and count an empty element as a value. #\fcvexclude
  if (i > NF - 1)
    i = NF - 1;
  delta = d[i] - d[1]; #\lnlbl{awk:delta}
  # Allowed distance from the minimum: the larger of divisor times the #\fcvexclude
  # trusted spread and the trusted spread plus relerr times d[i]. #\fcvexclude
  maxdelta = delta * divisor; #\lnlbl{awk:maxdelta}
  maxdelta1 = delta + d[i] * relerr; #\lnlbl{awk:maxdelta1}
  if (maxdelta1 > maxdelta) #\lnlbl{awk:comp_max:b}
    maxdelta = maxdelta1; #\lnlbl{awk:comp_max:e}
  # Extend the good range one value at a time. Stop at the first value #\fcvexclude
  # that is both farther than maxdelta from the minimum and a jump of #\fcvexclude
  # more than maxdiff over its predecessor. #\fcvexclude
  for (j = i + 1; j < NF; j++) { #\lnlbl{awk:add:b}
    # maxdiff is trendbreak times the average step between the values #\fcvexclude
    # accepted so far. With a single accepted value there is no step #\fcvexclude
    # to average, so the full range of the data is used instead. #\fcvexclude
    if (j <= 2) #\lnlbl{awk:chk_engh}
      maxdiff = d[NF - 1] - d[1];
    else
      maxdiff = trendbreak * (d[j - 1] - d[1]) / (j - 2); #\lnlbl{awk:mul_avr}
    # print "i: " i, "j: " j, "maxdelta: " maxdelta, "maxdiff: " maxdiff, "d[j] - d[j - 1]: " d[j] - d[j - 1] #\fcvexclude
    if (d[j] - d[1] > maxdelta && d[j] - d[j - 1] > maxdiff) #\lnlbl{awk:chk_max}
      break; #\lnlbl{awk:break}
  } #\lnlbl{awk:add:e}
  # d[1] through d[j - 1] are the values kept; summarize them. #\fcvexclude
  n = sum = 0; #\lnlbl{awk:comp_stat:b}
  for (k = 1; k < j; k++) {
    sum += d[k];
    n++;
  }
  min = d[1];
  max = d[j - 1];
  avg = sum / n;
  print $1, avg, min, max, n, NF - 1; #\lnlbl{awk:comp_stat:e}
}' #\lnlbl{awk:end}
#\end{snippet}