Add support for latency probing over an interval of load

Provide a way to easily run a latency probe on a device. You define a
job with peak parameters, then add probe settings that generate
IOPS/latency numbers based on that workload. The probe settings look
something like this:

iodepth_mode=stepped:10-130/10,5/10

which has the following format:

low_percentage-high_percentage/step,ramp_time/run_time

The above would probe from 10% of peak performance to 130%, in steps
of 10%. For each step, fio runs a 5 second ramp, then 10 seconds of
testing. For percentages below 100%, fio limits the IOPS to that
fraction of peak; at 100% and above, it ramps up the queue depth
instead. For each step run, it logs the average completion latency
observed at that queue depth / IOPS setting.
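
As a concrete example, mirroring examples/iodepth_mode_stepped.fio
added in this patch, with the probe string discussed above (device and
engine are illustrative only):

[step]
ioengine=libaio
iodepth=64
direct=1
rw=randread
norandommap
filename=/dev/nvme0n1p9
time_based=1
runtime=1h
iodepth_mode=stepped:10-130/10,5/10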

Has normal output (which sucks) and JSON output. Still experimenting;
this isn't the final form yet.
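
On the JSON side, this adds a "lat_step" object with one map per data
direction, keyed by the observed IOPS with the average completion
latency (nsec) as the value. Roughly like this (numbers made up):

"lat_step" : {
  "read" : {
    "15000" : 125000.0,
    "30000" : 131000.0
  }
}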

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/Makefile b/Makefile
index 4721b78..62c6ddf 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,7 @@
 		gettime-thread.c helpers.c json.c idletime.c td_error.c \
 		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
 		workqueue.c rate-submit.c optgroup.c helper_thread.c \
-		steadystate.c zone-dist.c
+		steadystate.c zone-dist.c target.c
 
 ifdef CONFIG_LIBHDFS
   HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
diff --git a/backend.c b/backend.c
index d6450ba..3c734f0 100644
--- a/backend.c
+++ b/backend.c
@@ -49,6 +49,7 @@
 #include "helper_thread.h"
 #include "pshared.h"
 #include "zone-dist.h"
+#include "target.h"
 
 static struct fio_sem *startup_sem;
 static struct flist_head *cgroup_list;
@@ -1090,8 +1091,8 @@
 				break;
 			}
 		}
-		if (!in_ramp_time(td) && td->o.latency_target)
-			lat_target_check(td);
+		if (!in_ramp_time(td) && lat_target_check(td))
+			break;
 
 		if (ddir_rw(ddir) && td->o.thinktime)
 			handle_thinktime(td, ddir);
@@ -1867,7 +1868,8 @@
 	 * (Are we not missing other flags that can be ignored ?)
 	 */
 	if ((td->o.size || td->o.io_size) && !ddir_rw_sum(bytes_done) &&
-	    !did_some_io && !td->o.create_only &&
+	    !did_some_io && (td->o.iodepth_mode != IOD_STEPPED) &&
+	    !td->o.create_only &&
 	    !(td_ioengine_flagged(td, FIO_NOIO) ||
 	      td_ioengine_flagged(td, FIO_DISKLESSIO)))
 		log_err("%s: No I/O performed by %s, "
diff --git a/cconv.c b/cconv.c
index 50e45c6..4040be2 100644
--- a/cconv.c
+++ b/cconv.c
@@ -100,6 +100,12 @@
 	o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min);
 	o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max);
 	o->serialize_overlap = le32_to_cpu(top->serialize_overlap);
+	o->iodepth_mode = le32_to_cpu(top->iodepth_mode);
+	o->lat_step_low = le32_to_cpu(top->lat_step_low);
+	o->lat_step_high = le32_to_cpu(top->lat_step_high);
+	o->lat_step_inc = le32_to_cpu(top->lat_step_inc);
+	o->lat_step_ramp = le32_to_cpu(top->lat_step_ramp);
+	o->lat_step_run = le32_to_cpu(top->lat_step_run);
 	o->size = le64_to_cpu(top->size);
 	o->io_size = le64_to_cpu(top->io_size);
 	o->size_percent = le32_to_cpu(top->size_percent);
@@ -363,6 +369,12 @@
 	top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min);
 	top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max);
 	top->serialize_overlap = cpu_to_le32(o->serialize_overlap);
+	top->iodepth_mode = cpu_to_le32(o->iodepth_mode);
+	top->lat_step_low = cpu_to_le32(o->lat_step_low);
+	top->lat_step_high = cpu_to_le32(o->lat_step_high);
+	top->lat_step_inc = cpu_to_le32(o->lat_step_inc);
+	top->lat_step_ramp = cpu_to_le32(o->lat_step_ramp);
+	top->lat_step_run = cpu_to_le32(o->lat_step_run);
 	top->size_percent = cpu_to_le32(o->size_percent);
 	top->fill_device = cpu_to_le32(o->fill_device);
 	top->file_append = cpu_to_le32(o->file_append);
diff --git a/client.c b/client.c
index 3248906..0c87eb5 100644
--- a/client.c
+++ b/client.c
@@ -1024,6 +1024,15 @@
 	for (i = 0; i < dst->nr_block_infos; i++)
 		dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
 
+	for (i = 0; i < ARRAY_SIZE(dst->step_stats); i++) {
+		struct lat_step_stats *ls = &src->step_stats[i];
+
+		for (j = 0; j < DDIR_RWDIR_CNT; j++) {
+			dst->step_stats[i].iops[j] = le64_to_cpu(ls->iops[j]);
+			dst->step_stats[i].avg[j].u.f = fio_uint64_to_double(le64_to_cpu(ls->avg[j].u.i));
+		}
+	}
+
 	dst->ss_dur		= le64_to_cpu(src->ss_dur);
 	dst->ss_state		= le32_to_cpu(src->ss_state);
 	dst->ss_head		= le32_to_cpu(src->ss_head);
diff --git a/examples/iodepth_mode_stepped.fio b/examples/iodepth_mode_stepped.fio
new file mode 100644
index 0000000..fc2b9f4
--- /dev/null
+++ b/examples/iodepth_mode_stepped.fio
@@ -0,0 +1,18 @@
+# Job demonstrating how to use the iodepth_mode=stepped feature
+#
+[step]
+ioengine=libaio
+# iodepth must be high enough to saturate performance at step_high (130% here)
+iodepth=64
+direct=1
+# Step from 10% to 130%, in 5% intervals. For each step, use a ramp time
+# of 5s, then 30 seconds of runtime
+iodepth_mode=stepped:10-130/5,5/30
+rw=randread
+norandommap
+filename=/dev/nvme0n1p9
+runtime=1h
+time_based=1
+numjobs=2
+group_reporting=1
+cpus_allowed=0,2
diff --git a/fio.h b/fio.h
index b3ba5db..081998b 100644
--- a/fio.h
+++ b/fio.h
@@ -155,6 +155,11 @@
 	F_ADV_SEQUENTIAL,
 };
 
+enum {
+	IOD_NONE = 0,
+	IOD_STEPPED,
+};
+
 /*
  * Per-thread/process specific data. Only used for the network client
  * for now.
@@ -374,9 +379,14 @@
 	unsigned int latency_qd;
 	unsigned int latency_qd_high;
 	unsigned int latency_qd_low;
+	unsigned int latency_qd_step;
 	unsigned int latency_failed;
-	uint64_t latency_ios;
+	unsigned int latency_state;
+	unsigned int latency_iops[DDIR_RWDIR_CNT];
+	unsigned int latency_step;
+	uint64_t latency_ios[DDIR_RWDIR_CNT];
 	int latency_end_run;
+	unsigned int nr_lat_stats;
 
 	/*
 	 * read/write mixed workload state
@@ -688,13 +698,6 @@
 		   struct timespec *comp_time);
 
 /*
- * Latency target helpers
- */
-extern void lat_target_check(struct thread_data *);
-extern void lat_target_init(struct thread_data *);
-extern void lat_target_reset(struct thread_data *);
-
-/*
  * Iterates all threads/processes within all the defined jobs
  */
 #define for_each_td(td, i)	\
@@ -751,6 +754,8 @@
 	return ddir_rw_sum(td->bytes_done) != 0;
 }
 
+int setup_rate(struct thread_data *td);
+
 static inline unsigned long long td_max_bs(struct thread_data *td)
 {
 	unsigned long long max_bs;
diff --git a/init.c b/init.c
index a2b70c4..6912480 100644
--- a/init.c
+++ b/init.c
@@ -559,7 +559,7 @@
 	return 0;
 }
 
-static int setup_rate(struct thread_data *td)
+int setup_rate(struct thread_data *td)
 {
 	int ret = 0;
 
diff --git a/io_u.c b/io_u.c
index 56abe6f..e1ac209 100644
--- a/io_u.c
+++ b/io_u.c
@@ -11,6 +11,7 @@
 #include "lib/pow2.h"
 #include "minmax.h"
 #include "zbd.h"
+#include "target.h"
 
 struct io_completion_data {
 	int nr;				/* input */
@@ -1356,146 +1357,6 @@
 	return 0;
 }
 
-static void lat_fatal(struct thread_data *td, struct io_completion_data *icd,
-		      unsigned long long tnsec, unsigned long long max_nsec)
-{
-	if (!td->error)
-		log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec);
-	td_verror(td, ETIMEDOUT, "max latency exceeded");
-	icd->error = ETIMEDOUT;
-}
-
-static void lat_new_cycle(struct thread_data *td)
-{
-	fio_gettime(&td->latency_ts, NULL);
-	td->latency_ios = ddir_rw_sum(td->io_blocks);
-	td->latency_failed = 0;
-}
-
-/*
- * We had an IO outside the latency target. Reduce the queue depth. If we
- * are at QD=1, then it's time to give up.
- */
-static bool __lat_target_failed(struct thread_data *td)
-{
-	if (td->latency_qd == 1)
-		return true;
-
-	td->latency_qd_high = td->latency_qd;
-
-	if (td->latency_qd == td->latency_qd_low)
-		td->latency_qd_low--;
-
-	td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2;
-
-	dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
-
-	/*
-	 * When we ramp QD down, quiesce existing IO to prevent
-	 * a storm of ramp downs due to pending higher depth.
-	 */
-	io_u_quiesce(td);
-	lat_new_cycle(td);
-	return false;
-}
-
-static bool lat_target_failed(struct thread_data *td)
-{
-	if (td->o.latency_percentile.u.f == 100.0)
-		return __lat_target_failed(td);
-
-	td->latency_failed++;
-	return false;
-}
-
-void lat_target_init(struct thread_data *td)
-{
-	td->latency_end_run = 0;
-
-	if (td->o.latency_target) {
-		dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target);
-		fio_gettime(&td->latency_ts, NULL);
-		td->latency_qd = 1;
-		td->latency_qd_high = td->o.iodepth;
-		td->latency_qd_low = 1;
-		td->latency_ios = ddir_rw_sum(td->io_blocks);
-	} else
-		td->latency_qd = td->o.iodepth;
-}
-
-void lat_target_reset(struct thread_data *td)
-{
-	if (!td->latency_end_run)
-		lat_target_init(td);
-}
-
-static void lat_target_success(struct thread_data *td)
-{
-	const unsigned int qd = td->latency_qd;
-	struct thread_options *o = &td->o;
-
-	td->latency_qd_low = td->latency_qd;
-
-	/*
-	 * If we haven't failed yet, we double up to a failing value instead
-	 * of bisecting from highest possible queue depth. If we have set
-	 * a limit other than td->o.iodepth, bisect between that.
-	 */
-	if (td->latency_qd_high != o->iodepth)
-		td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2;
-	else
-		td->latency_qd *= 2;
-
-	if (td->latency_qd > o->iodepth)
-		td->latency_qd = o->iodepth;
-
-	dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
-
-	/*
-	 * Same as last one, we are done. Let it run a latency cycle, so
-	 * we get only the results from the targeted depth.
-	 */
-	if (td->latency_qd == qd) {
-		if (td->latency_end_run) {
-			dprint(FD_RATE, "We are done\n");
-			td->done = 1;
-		} else {
-			dprint(FD_RATE, "Quiesce and final run\n");
-			io_u_quiesce(td);
-			td->latency_end_run = 1;
-			reset_all_stats(td);
-			reset_io_stats(td);
-		}
-	}
-
-	lat_new_cycle(td);
-}
-
-/*
- * Check if we can bump the queue depth
- */
-void lat_target_check(struct thread_data *td)
-{
-	uint64_t usec_window;
-	uint64_t ios;
-	double success_ios;
-
-	usec_window = utime_since_now(&td->latency_ts);
-	if (usec_window < td->o.latency_window)
-		return;
-
-	ios = ddir_rw_sum(td->io_blocks) - td->latency_ios;
-	success_ios = (double) (ios - td->latency_failed) / (double) ios;
-	success_ios *= 100.0;
-
-	dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f);
-
-	if (success_ios >= td->o.latency_percentile.u.f)
-		lat_target_success(td);
-	else
-		__lat_target_failed(td);
-}
-
 /*
  * If latency target is enabled, we might be ramping up or down and not
  * using the full queue depth available.
@@ -1506,7 +1367,7 @@
 
 	if (qempty)
 		return true;
-	if (!td->o.latency_target)
+	if (!td->o.latency_target && td->o.iodepth_mode != IOD_STEPPED)
 		return false;
 
 	return td->cur_depth >= td->latency_qd;
@@ -1837,11 +1698,15 @@
 				icd->error = ops->io_u_lat(td, tnsec);
 		}
 
-		if (td->o.max_latency && tnsec > td->o.max_latency)
-			lat_fatal(td, icd, tnsec, td->o.max_latency);
+		if (td->o.max_latency && tnsec > td->o.max_latency) {
+			icd->error = ETIMEDOUT;
+			lat_fatal(td, tnsec, td->o.max_latency);
+		}
 		if (td->o.latency_target && tnsec > td->o.latency_target) {
-			if (lat_target_failed(td))
-				lat_fatal(td, icd, tnsec, td->o.latency_target);
+			if (lat_target_failed(td)) {
+				icd->error = ETIMEDOUT;
+				lat_fatal(td, tnsec, td->o.latency_target);
+			}
 		}
 	}
 
@@ -1887,8 +1752,8 @@
 
 static bool should_account(struct thread_data *td)
 {
-	return ramp_time_over(td) && (td->runstate == TD_RUNNING ||
-					   td->runstate == TD_VERIFYING);
+	return lat_step_account(td) && ramp_time_over(td) &&
+		(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING);
 }
 
 static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
diff --git a/libfio.c b/libfio.c
index 674bc1d..a672dd5 100644
--- a/libfio.c
+++ b/libfio.c
@@ -34,6 +34,7 @@
 #include "filelock.h"
 #include "helper_thread.h"
 #include "filehash.h"
+#include "target.h"
 
 FLIST_HEAD(disk_list);
 
diff --git a/options.c b/options.c
index 98187de..52acf97 100644
--- a/options.c
+++ b/options.c
@@ -13,6 +13,7 @@
 #include "lib/pattern.h"
 #include "options.h"
 #include "optgroup.h"
+#include "target.h"
 
 char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 };
 
@@ -480,6 +481,51 @@
 	return 0;
 }
 
+static int str_iodepth_mode_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	struct thread_options *o = &td->o;
+	char *str, *p, *n;
+	int ret = 1;
+
+	if (o->iodepth_mode == IOD_NONE)
+		return 0;
+
+	if (parse_dryrun())
+		return 0;
+
+	p = str = strdup(input);
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	n = strchr(p, ':');
+	if (!n)
+		goto err;
+
+	*n++ = '\0';
+
+	/* format is now 'low-high/inc[,ramp/run]' */
+	ret = sscanf(n, "%u-%u/%u,%u/%u", &o->lat_step_low, &o->lat_step_high,
+					&o->lat_step_inc, &o->lat_step_ramp,
+					&o->lat_step_run);
+	if (ret == 5) {
+		ret = 0;
+		o->lat_step_ramp *= 1000;
+		o->lat_step_run *= 1000;
+	} else if (ret == 3) {
+		o->lat_step_ramp = IOD_STEPPED_DEF_RAMP;
+		o->lat_step_run = IOD_STEPPED_DEF_RUN;
+		ret = 0;
+	} else
+		ret = 1;
+err:
+	if (ret)
+		log_err("fio: failed parsing <%s>\n", input);
+	free(p);
+	return ret;
+}
+
 static int str_exitall_cb(void)
 {
 	exitall_on_terminate = true;
@@ -1960,6 +2006,30 @@
 		.group	= FIO_OPT_G_IO_BASIC,
 	},
 	{
+		.name	= "iodepth_mode",
+		.lname	= "IO Depth Mode",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, iodepth_mode),
+		.cb	= str_iodepth_mode_cb,
+		.help	= "How to vary the queue depth",
+		.parent	= "iodepth",
+		.hide	= 1,
+		.interval = 1,
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BASIC,
+		.posval = {
+			  { .ival = "none",
+			    .oval = IOD_NONE,
+			    .help = "No depth modification",
+			  },
+			  { .ival = "stepped",
+			    .oval = IOD_STEPPED,
+			    .help = "Stepped IO depth:hi-lo/inc,ramp/run",
+			  },
+		},
+	},
+
+	{
 		.name	= "serialize_overlap",
 		.lname	= "Serialize overlap",
 		.off1	= offsetof(struct thread_options, serialize_overlap),
diff --git a/server.c b/server.c
index 90d3396..a636f27 100644
--- a/server.c
+++ b/server.c
@@ -1550,6 +1550,15 @@
 
 	p.ts.sig_figs		= cpu_to_le32(ts->sig_figs);
 
+	for (i = 0; i < ARRAY_SIZE(ts->step_stats); i++) {
+		struct lat_step_stats *ls = &ts->step_stats[i];
+
+		for (j = 0; j < DDIR_RWDIR_CNT; j++) {
+			p.ts.step_stats[i].iops[j] = cpu_to_le64(ls->iops[j]);
+			p.ts.step_stats[i].avg[j].u.i = cpu_to_le64(fio_double_to_uint64(ls->avg[j].u.f));
+		}
+	}
+
 	p.ts.nr_block_infos	= cpu_to_le64(ts->nr_block_infos);
 	for (i = 0; i < p.ts.nr_block_infos; i++)
 		p.ts.block_infos[i] = cpu_to_le32(ts->block_infos[i]);
diff --git a/server.h b/server.h
index 371e51e..abb23ba 100644
--- a/server.h
+++ b/server.h
@@ -48,7 +48,7 @@
 };
 
 enum {
-	FIO_SERVER_VER			= 77,
+	FIO_SERVER_VER			= 78,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
diff --git a/stat.c b/stat.c
index 331abf6..26125fa 100644
--- a/stat.c
+++ b/stat.c
@@ -15,6 +15,7 @@
 #include "helper_thread.h"
 #include "smalloc.h"
 #include "zbd.h"
+#include "target.h"
 
 #define LOG_MSEC_SLACK	1
 
@@ -391,7 +392,7 @@
 	stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR);
 }
 
-static void display_lat(const char *name, unsigned long long min,
+void display_lat(const char *name, unsigned long long min,
 			unsigned long long max, double mean, double dev,
 			struct buf_output *out)
 {
@@ -887,6 +888,11 @@
 
 	if (ts->ss_dur)
 		show_ss_normal(ts, out);
+
+	if (lat_ts_has_stats(ts)) {
+		log_buf(out, "  Stepped latency report\n");
+		lat_step_report(ts, out);
+	}
 }
 
 static void show_ddir_status_terse(struct thread_stat *ts,
@@ -1264,7 +1270,7 @@
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
 	double usr_cpu, sys_cpu;
-	int i;
+	int i, j;
 	size_t size;
 
 	root = json_create_object();
@@ -1488,6 +1494,32 @@
 		json_object_add_value_array(data, "bw", bw);
 	}
 
+	if (lat_ts_has_stats(ts)) {
+		tmp = json_create_object();
+		json_object_add_value_object(root, "lat_step", tmp);
+	}
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		struct json_object *val;
+
+		if (!__lat_ts_has_stats(ts, i))
+			continue;
+
+		val = json_create_object();
+		json_object_add_value_object(tmp, io_ddir_name(i), val);
+
+		for (j = 0; j < ARRAY_SIZE(ts->step_stats); j++) {
+			struct lat_step_stats *ls = &ts->step_stats[j];
+			char name[32];
+
+			if (!ls->iops[i])
+				continue;
+
+			sprintf(name, "%llu", (unsigned long long) ls->iops[i]);
+			json_object_add_value_float(val, name, ls->avg[i].u.f);
+		}
+	}
+
 	return root;
 }
 
@@ -1553,6 +1585,29 @@
 	dst->S.u.f = S;
 }
 
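+/*
+ * Merge per-step stats for group reporting: latency averages are
+ * weighted by each source's IOPS, and the IOPS themselves are summed.
+ */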
+static void sum_lat_step_stats(struct lat_step_stats *dst,
+			       struct lat_step_stats *src, bool first)
+{
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		if (!dst->iops[i] && !src->iops[i])
+			continue;
+		if (first)
+			dst->avg[i].u.f = src->avg[i].u.f;
+		else {
+			dst->avg[i].u.f = ((src->avg[i].u.f * src->iops[i]) +
+				(dst->avg[i].u.f * dst->iops[i])) /
+				(dst->iops[i] + src->iops[i]);
+		}
+		dst->iops[i] += src->iops[i];
+	}
+}
+
 void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src)
 {
 	int i;
@@ -1665,6 +1720,9 @@
 	dst->total_submit += src->total_submit;
 	dst->total_complete += src->total_complete;
 	dst->nr_zone_resets += src->nr_zone_resets;
+
+	for (l = 0; l < ARRAY_SIZE(dst->step_stats); l++)
+		sum_lat_step_stats(&dst->step_stats[l], &src->step_stats[l], first);
 }
 
 void init_group_run_stat(struct group_run_stats *gs)
@@ -1711,6 +1769,9 @@
 	for (i = 0; i < groupid + 1; i++)
 		init_group_run_stat(&runstats[i]);
 
+	for (i = 0; i < FIO_OUTPUT_NR; i++)
+		buf_output_init(&output[i]);
+
 	/*
 	 * find out how many threads stats we need. if group reporting isn't
 	 * enabled, it's one-per-td.
@@ -1887,9 +1948,6 @@
 		}
 	}
 
-	for (i = 0; i < FIO_OUTPUT_NR; i++)
-		buf_output_init(&output[i]);
-
 	/*
 	 * don't overwrite last signal output
 	 */
diff --git a/stat.h b/stat.h
index b4ba71e..8a165b7 100644
--- a/stat.h
+++ b/stat.h
@@ -4,6 +4,11 @@
 #include "iolog.h"
 #include "lib/output_buffer.h"
 
+struct lat_step_stats {
+	uint64_t iops[DDIR_RWDIR_CNT];
+	fio_fp64_t avg[DDIR_RWDIR_CNT];
+};
+
 struct group_run_stats {
 	uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT];
 	uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT];
@@ -145,6 +150,8 @@
 #define FIO_JOBDESC_SIZE	256
 #define FIO_VERROR_SIZE		128
 
+#define MAX_STEP_STATS		64
+
 struct thread_stat {
 	char name[FIO_JOBNAME_SIZE];
 	char verror[FIO_VERROR_SIZE];
@@ -227,6 +234,9 @@
 	uint64_t latency_window;
 
 	uint32_t sig_figs;
+	uint32_t pad4;
+
+	struct lat_step_stats step_stats[MAX_STEP_STATS];
 
 	uint64_t ss_dur;
 	uint32_t ss_state;
@@ -239,12 +249,12 @@
 
 	union {
 		uint64_t *ss_iops_data;
-		uint64_t pad4;
+		uint64_t pad5;
 	};
 
 	union {
 		uint64_t *ss_bw_data;
-		uint64_t pad5;
+		uint64_t pad6;
 	};
 } __attribute__((packed));
 
diff --git a/target.c b/target.c
new file mode 100644
index 0000000..d372ff1
--- /dev/null
+++ b/target.c
@@ -0,0 +1,385 @@
+#include <unistd.h>
+
+#include "fio.h"
+#include "target.h"
+#include "smalloc.h"
+#include "stat.h"
+
+void lat_fatal(struct thread_data *td, unsigned long long tnsec,
+	       unsigned long long max_nsec)
+{
+	if (!td->error)
+		log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec);
+	td_verror(td, ETIMEDOUT, "max latency exceeded");
+}
+
+static void lat_ios_note(struct thread_data *td)
+{
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		td->latency_ios[i] = td->io_blocks[i];
+}
+
+static void lat_new_cycle(struct thread_data *td)
+{
+	fio_gettime(&td->latency_ts, NULL);
+	lat_ios_note(td);
+	td->latency_failed = 0;
+}
+
+/*
+ * We had an IO outside the latency target. Reduce the queue depth. If we
+ * are at QD=1, then it's time to give up.
+ */
+static bool __lat_target_failed(struct thread_data *td)
+{
+	if (td->latency_qd == 1)
+		return true;
+
+	td->latency_qd_high = td->latency_qd;
+
+	if (td->latency_qd == td->latency_qd_low)
+		td->latency_qd_low--;
+
+	td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2;
+
+	dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
+
+	/*
+	 * When we ramp QD down, quiesce existing IO to prevent
+	 * a storm of ramp downs due to pending higher depth.
+	 */
+	io_u_quiesce(td);
+	lat_new_cycle(td);
+	return false;
+}
+
+bool lat_target_failed(struct thread_data *td)
+{
+	if (td->o.latency_percentile.u.f == 100.0)
+		return __lat_target_failed(td);
+
+	td->latency_failed++;
+	return false;
+}
+
+static void lat_step_init(struct thread_data *td)
+{
+	struct thread_options *o = &td->o;
+
+	fio_gettime(&td->latency_ts, NULL);
+	td->latency_state = IOD_STATE_PROBE_RAMP;
+	td->latency_step = 0;
+	td->latency_qd = td->o.iodepth;
+	dprint(FD_RATE, "Stepped: %d-%d/%d,%d/%d\n", o->lat_step_low,
+				o->lat_step_high, o->lat_step_inc,
+				o->lat_step_ramp, o->lat_step_run);
+}
+
+void lat_target_init(struct thread_data *td)
+{
+	td->latency_end_run = 0;
+
+	if (td->o.latency_target) {
+		dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target);
+		fio_gettime(&td->latency_ts, NULL);
+		td->latency_qd = 1;
+		td->latency_qd_high = td->o.iodepth;
+		td->latency_qd_low = 1;
+		lat_ios_note(td);
+	} else if (td->o.iodepth_mode == IOD_STEPPED)
+		lat_step_init(td);
+	else
+		td->latency_qd = td->o.iodepth;
+}
+
+void lat_target_reset(struct thread_data *td)
+{
+	if (td->o.latency_target && !td->latency_end_run)
+		lat_target_init(td);
+}
+
+static void lat_target_success(struct thread_data *td)
+{
+	const unsigned int qd = td->latency_qd;
+	struct thread_options *o = &td->o;
+
+	td->latency_qd_low = td->latency_qd;
+
+	/*
+	 * If we haven't failed yet, we double up to a failing value instead
+	 * of bisecting from highest possible queue depth. If we have set
+	 * a limit other than td->o.iodepth, bisect between that.
+	 */
+	if (td->latency_qd_high != o->iodepth)
+		td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2;
+	else
+		td->latency_qd *= 2;
+
+	if (td->latency_qd > o->iodepth)
+		td->latency_qd = o->iodepth;
+
+	dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
+
+	/*
+	 * Same as last one, we are done. Let it run a latency cycle, so
+	 * we get only the results from the targeted depth.
+	 */
+	if (td->latency_qd == qd) {
+		if (td->latency_end_run) {
+			dprint(FD_RATE, "We are done\n");
+			td->done = 1;
+		} else {
+			dprint(FD_RATE, "Quiesce and final run\n");
+			io_u_quiesce(td);
+			td->latency_end_run = 1;
+			reset_all_stats(td);
+			reset_io_stats(td);
+		}
+	}
+
+	lat_new_cycle(td);
+}
+
+void __lat_target_check(struct thread_data *td)
+{
+	uint64_t usec_window;
+	uint64_t ios;
+	double success_ios;
+
+	usec_window = utime_since_now(&td->latency_ts);
+	if (usec_window < td->o.latency_window)
+		return;
+
+	ios = ddir_rw_sum(td->io_blocks) - ddir_rw_sum(td->latency_ios);
+	success_ios = (double) (ios - td->latency_failed) / (double) ios;
+	success_ios *= 100.0;
+
+	dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f);
+
+	if (success_ios >= td->o.latency_percentile.u.f)
+		lat_target_success(td);
+	else
+		__lat_target_failed(td);
+}
+
+static void lat_clear_rate(struct thread_data *td)
+{
+	int i;
+
+	td->flags &= ~TD_F_CHECK_RATE;
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		td->o.rate_iops[i] = 0;
+}
+
+/*
+ * Returns true if we're done stepping
+ */
+static bool lat_step_recalc(struct thread_data *td)
+{
+	struct thread_options *o = &td->o;
+	unsigned int cur, perc;
+
+	cur = o->lat_step_low + td->latency_step * o->lat_step_inc;
+	if (cur > o->lat_step_high)
+		return true;
+
+	perc = cur;
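+	/*
+	 * Below 100% of the probed peak, throttle with a per-ddir IOPS
+	 * rate limit at that percentage of peak, capping the queue depth
+	 * at the depth that corresponds to 100%. At or above 100%, drop
+	 * the rate limit and scale the queue depth up instead.
+	 */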
+	if (perc < 100) {
+		int i;
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			unsigned int this_iops;
+
+			this_iops = (perc * td->latency_iops[i]) / 100;
+			td->o.rate_iops[i] = this_iops;
+		}
+		setup_rate(td);
+		td->flags |= TD_F_CHECK_RATE;
+		td->latency_qd = td->o.iodepth * 100 / o->lat_step_high;
+	} else {
+		td->latency_qd = td->o.iodepth * perc / o->lat_step_high;
+		lat_clear_rate(td);
+	}
+
+	dprint(FD_RATE, "Stepped: step=%d, perc=%d, qd=%d\n", td->latency_step,
+						perc, td->latency_qd);
+	return false;
+}
+
+static void lat_step_reset(struct thread_data *td)
+{
+	struct thread_stat *ts = &td->ts;
+	struct io_stat *ios = &ts->clat_stat[DDIR_RWDIR_CNT];
+
+	ios->max_val = ios->min_val = ios->samples = 0;
+	ios->mean.u.f = ios->S.u.f = 0;
+
+	lat_clear_rate(td);
+	reset_all_stats(td);
+	reset_io_stats(td);
+}
+
+static uint64_t lat_iops_since(struct thread_data *td, uint64_t msec,
+			       enum fio_ddir ddir)
+{
+	if (msec) {
+		uint64_t ios;
+
+		ios = td->io_blocks[ddir] - td->latency_ios[ddir];
+		return (ios * 1000) / msec;
+	}
+
+	return 0;
+}
+
+static void lat_step_add_sample(struct thread_data *td, uint64_t msec)
+{
+	struct thread_stat *ts = &td->ts;
+	unsigned long long min, max;
+	struct lat_step_stats *ls;
+	double mean[DDIR_RWDIR_CNT], dev;
+	int i;
+
+	if (td->nr_lat_stats == ARRAY_SIZE(td->ts.step_stats)) {
+		log_err("fio: ts->step_stats too small, dropping entries\n");
+		return;
+	}
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		calc_lat(&ts->clat_stat[i], &min, &max, &mean[i], &dev);
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		ls = &td->ts.step_stats[td->nr_lat_stats];
+
+		ls->iops[i] = lat_iops_since(td, msec, i);
+		ls->avg[i].u.f = mean[i];
+	}
+
+	td->nr_lat_stats++;
+}
+
+bool __lat_ts_has_stats(struct thread_stat *ts, enum fio_ddir ddir)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ts->step_stats); i++) {
+		struct lat_step_stats *ls = &ts->step_stats[i];
+
+		if (ls->iops[ddir])
+			return true;
+	}
+
+	return false;
+}
+
+bool lat_ts_has_stats(struct thread_stat *ts)
+{
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		if (__lat_ts_has_stats(ts, i))
+			return true;
+
+	return false;
+}
+
+void lat_step_report(struct thread_stat *ts, struct buf_output *out)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(ts->step_stats); i++) {
+		struct lat_step_stats *ls = &ts->step_stats[i];
+
+		for (j = 0; j < DDIR_RWDIR_CNT; j++) {
+			if (!ls->iops[j])
+				continue;
+
+			__log_buf(out, "    %s: iops=%llu, lat=%.1f nsec\n",
+					io_ddir_name(j),
+					(unsigned long long) ls->iops[j],
+					ls->avg[j].u.f);
+		}
+	}
+}
+
+static void lat_next_state(struct thread_data *td, int new_state)
+{
+	td->latency_state = new_state;
+	fio_gettime(&td->latency_ts, NULL);
+}
+
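+/*
+ * Called from the main I/O loop via lat_target_check(); walks the
+ * probe/ramp/run states and returns true once stepping is done.
+ */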
+bool lat_step_check(struct thread_data *td)
+{
+	struct thread_options *o = &td->o;
+	uint64_t msec;
+
+	msec = mtime_since_now(&td->latency_ts);
+
+	switch (td->latency_state) {
+	case IOD_STATE_PROBE_RAMP:
+		if (msec < o->lat_step_ramp)
+			break;
+
+		lat_step_reset(td);
+		lat_ios_note(td);
+
+		lat_next_state(td, IOD_STATE_PROBE_RUN);
+		break;
+	case IOD_STATE_PROBE_RUN: {
+		int i;
+
+		if (msec < o->lat_step_run)
+			break;
+
+		io_u_quiesce(td);
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++)
+			td->latency_iops[i] = lat_iops_since(td, msec, i);
+
+		lat_step_reset(td);
+		lat_step_recalc(td);
+
+		io_u_quiesce(td);
+		lat_next_state(td, IOD_STATE_RAMP);
+		break;
+		}
+	case IOD_STATE_RAMP:
+		if (msec < o->lat_step_ramp)
+			break;
+
+		lat_ios_note(td);
+		lat_next_state(td, IOD_STATE_RUN);
+		break;
+	case IOD_STATE_RUN:
+		if (msec < o->lat_step_run)
+			break;
+
+		io_u_quiesce(td);
+		fio_gettime(&td->latency_ts, NULL);
+		td->latency_step++;
+
+		lat_step_add_sample(td, msec);
+		lat_step_reset(td);
+
+		if (!lat_step_recalc(td))
+			break;
+
+		td->done = 1;
+		lat_next_state(td, IOD_STATE_DONE);
+		break;
+	}
+
+	return td->latency_state == IOD_STATE_DONE;
+}
diff --git a/target.h b/target.h
new file mode 100644
index 0000000..a794285
--- /dev/null
+++ b/target.h
@@ -0,0 +1,62 @@
+#ifndef FIO_LAT_TARGET_H
+#define FIO_LAT_TARGET_H
+
+#include "fio.h"
+
+enum {
+	IOD_STEPPED_DEF_RAMP	= 5000,
+	IOD_STEPPED_DEF_RUN	= 30000,
+};
+
+/*
+ * Starts out as PROBE_RAMP -> PROBE_RUN, then iterations of
+ * RAMP -> RUN with various iops limiting settings
+ */
+enum {
+	IOD_STATE_PROBE_RAMP = 1,
+	IOD_STATE_PROBE_RUN,
+	IOD_STATE_RAMP,
+	IOD_STATE_RUN,
+	IOD_STATE_DONE,
+};
+
+/*
+ * Latency target helpers
+ */
+void lat_target_init(struct thread_data *);
+void lat_target_reset(struct thread_data *);
+bool lat_target_failed(struct thread_data *td);
+void lat_step_report(struct thread_stat *ts, struct buf_output *out);
+bool lat_ts_has_stats(struct thread_stat *ts);
+bool __lat_ts_has_stats(struct thread_stat *ts, enum fio_ddir);
+
+void lat_fatal(struct thread_data *td, unsigned long long tnsec,
+		unsigned long long max_nsec);
+
+bool lat_step_check(struct thread_data *td);
+void __lat_target_check(struct thread_data *td);
+
+static inline bool lat_target_check(struct thread_data *td)
+{
+	if (td->o.latency_target) {
+		__lat_target_check(td);
+		return false;
+	} else if (td->o.iodepth_mode == IOD_STEPPED)
+		return lat_step_check(td);
+
+	return false;
+}
+
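+/*
+ * In stepped mode, only account latency/stat samples while inside a
+ * step's measurement window; ramp and probe periods are excluded.
+ */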
+static inline bool lat_step_account(struct thread_data *td)
+{
+	if (td->o.iodepth_mode != IOD_STEPPED)
+		return true;
+
+	return td->latency_state == IOD_STATE_RUN;
+}
+
+#endif
diff --git a/thread_options.h b/thread_options.h
index 14c6969..e062fa6 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -77,6 +77,13 @@
 	unsigned int iodepth_batch_complete_min;
 	unsigned int iodepth_batch_complete_max;
 	unsigned int serialize_overlap;
+	unsigned int iodepth_mode;
+
+	unsigned int lat_step_low;
+	unsigned int lat_step_high;
+	unsigned int lat_step_inc;
+	unsigned int lat_step_ramp;
+	unsigned int lat_step_run;
 
 	unsigned int unique_filename;
 
@@ -361,6 +368,7 @@
 	uint32_t kb_base;
 	uint32_t unit_base;
 	uint32_t ddir_seq_nr;
+	uint32_t pad;
 	uint64_t ddir_seq_add;
 	uint32_t iodepth;
 	uint32_t iodepth_low;
@@ -368,6 +376,15 @@
 	uint32_t iodepth_batch_complete_min;
 	uint32_t iodepth_batch_complete_max;
 	uint32_t serialize_overlap;
+
+	uint32_t iodepth_mode;
+	uint32_t lat_step_low;
+	uint32_t lat_step_high;
+	uint32_t lat_step_inc;
+	uint32_t lat_step_ramp;
+	uint32_t lat_step_run;
+
+	uint32_t pad2;
 	uint32_t lat_percentiles;
 
 	uint64_t size;
@@ -416,6 +433,7 @@
 	uint32_t verify_fatal;
 	uint32_t verify_dump;
 	uint32_t verify_async;
+	uint32_t pad3;
 	uint64_t verify_backlog;
 	uint32_t verify_batch;
 	uint32_t experimental_verify;
@@ -428,7 +446,7 @@
 	uint32_t override_sync;
 	uint32_t rand_repeatable;
 	uint32_t allrand_repeatable;
-	uint32_t pad;
+	uint32_t pad4;
 	uint64_t rand_seed;
 	uint32_t log_avg_msec;
 	uint32_t log_hist_msec;
@@ -451,6 +469,7 @@
 
 	struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
 	uint32_t zone_split_nr[DDIR_RWDIR_CNT];
+	uint32_t pad5;
 
 	fio_fp64_t zipf_theta;
 	fio_fp64_t pareto_h;
@@ -459,10 +478,10 @@
 	uint32_t random_generator;
 
 	uint32_t perc_rand[DDIR_RWDIR_CNT];
+	uint32_t pad6;
 
 	uint32_t hugepage_size;
 	uint64_t rw_min_bs;
-	uint32_t pad2;
 	uint32_t thinktime;
 	uint32_t thinktime_spin;
 	uint32_t thinktime_blocks;
@@ -476,6 +495,7 @@
 	uint64_t ss_dur;
 	uint64_t ss_ramp_time;
 	uint32_t ss_state;
+	uint32_t pad7;
 	fio_fp64_t ss_limit;
 	uint32_t overwrite;
 	uint32_t bw_avg_time;
@@ -534,6 +554,7 @@
 	uint32_t trim_percentage;
 	uint32_t trim_batch;
 	uint32_t trim_zero;
+	uint32_t pad8;
 	uint64_t trim_backlog;
 	uint32_t clat_percentiles;
 	uint32_t percentile_precision;
@@ -570,7 +591,6 @@
 	uint32_t rate_iops_min[DDIR_RWDIR_CNT];
 	uint32_t rate_process;
 	uint32_t rate_ign_think;
-	uint32_t pad3;
 
 	uint8_t ioscheduler[FIO_TOP_STR_MAX];
 
@@ -598,6 +618,7 @@
 	int32_t flow;
 	int32_t flow_watermark;
 	uint32_t flow_sleep;
+	uint32_t pad9;
 
 	uint64_t offset_increment;
 	uint64_t number_ios;