blob: c8f91ea1b263f966b36a861181c56980c77e4226 [file] [log] [blame]
/*
* schbench.c
*
* Copyright (C) 2016 Facebook
* Chris Mason <clm@fb.com>
*
* GPLv2, portions copied from the kernel and from Jens Axboe's fio
*
* gcc -Wall -O0 -W schbench.c -o schbench -lpthread
*/
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#define PLAT_BITS 8
#define PLAT_VAL (1 << PLAT_BITS)
#define PLAT_GROUP_NR 19
#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
#define PLAT_LIST_MAX 20
/* -m number of message threads */
static int message_threads = 2;
/* -t number of workers per message thread */
static int worker_threads = 16;
/* -r seconds */
static int runtime = 30;
/* -s usec */
static int sleeptime = 30000;
/* -c usec */
static unsigned long long cputime = 30000;
/* -a, bool */
static int autobench = 0;
/* the latency histogram uses this to pitch outliers */
static unsigned int max_us = 50000;
/* main() sets this to the time when we should all stop doing work */
static struct timeval global_stop;
/* the message threads flip this to true when they decide runtime is up */
static unsigned long stopping = 0;
/*
* one stat struct per thread data, when the workers sleep this records the
* latency between when they are woken up and when they actually get the
* CPU again. The message threads sum up the stats of all the workers and
* then bubble them up to main() for printing
*/
struct stats {
unsigned int plat[PLAT_NR];
unsigned int nr_samples;
unsigned int max;
unsigned int min;
unsigned int over;
};
/* this defines which latency profiles get printed */
#define PLIST_P99 4
static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9 };
enum {
HELP_LONG_OPT = 1,
};
char *option_string = "am:t:s:c:r:";
static struct option long_options[] = {
{"auto", no_argument, 0, 'a'},
{"message-threads", required_argument, 0, 'm'},
{"threads", required_argument, 0, 't'},
{"runtime", required_argument, 0, 'r'},
{"sleeptime", required_argument, 0, 's'},
{"cputime", required_argument, 0, 'c'},
{"help", no_argument, 0, HELP_LONG_OPT},
{0, 0, 0, 0}
};
static void print_usage(void)
{
fprintf(stderr, "schbench usage:\n"
"\t-d (--dispatch-threads): number of message threads (def: 2)\n"
"\t-t (--threads): worker threads per message thread (def: 16)\n"
"\t-r (--runtime): How long to run before exiting (seconds, def: 30)\n"
"\t-s (--sleeptime): Message thread latency (usec, def: 10000\n"
"\t-c (--cputime): How long to think during loop (usec, def: 10000\n"
);
exit(1);
}
static void parse_options(int ac, char **av)
{
int c;
while (1) {
int option_index = 0;
c = getopt_long(ac, av, option_string,
long_options, &option_index);
if (c == -1)
break;
switch(c) {
case 'a':
autobench = 1;
break;
case 's':
sleeptime = atoi(optarg);
break;
case 'c':
cputime = atoi(optarg);
break;
case 'd':
message_threads = atoi(optarg);
break;
case 't':
worker_threads = atoi(optarg);
break;
case 'r':
runtime = atoi(optarg);
break;
case '?':
case HELP_LONG_OPT:
print_usage();
break;
default:
break;
}
}
if (optind < ac) {
fprintf(stderr, "Error Extra arguments '%s'\n", av[optind]);
exit(1);
}
}
void tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0)
{
tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
tdiff->tv_usec = t1->tv_usec - t0->tv_usec;
if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) {
tdiff->tv_sec--;
tdiff->tv_usec += 1000000;
if (tdiff->tv_usec < 0) {
fprintf(stderr, "lat_fs: tvsub shows test time ran backwards!\n");
exit(1);
}
}
/* time shouldn't go backwards!!! */
if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) {
tdiff->tv_sec = 0;
tdiff->tv_usec = 0;
}
}
/*
* returns the difference between start and stop in usecs. Negative values
* are turned into 0
*/
unsigned long long tvdelta(struct timeval *start, struct timeval *stop)
{
struct timeval td;
unsigned long long usecs;
tvsub(&td, stop, start);
usecs = td.tv_sec;
usecs *= 1000000;
usecs += td.tv_usec;
return (usecs);
}
/* mr axboe's magic latency histogram */
static unsigned int plat_val_to_idx(unsigned int val)
{
unsigned int msb, error_bits, base, offset;
/* Find MSB starting from bit 0 */
if (val == 0)
msb = 0;
else
msb = sizeof(val)*8 - __builtin_clz(val) - 1;
/*
* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
* all bits of the sample as index
*/
if (msb <= PLAT_BITS)
return val;
/* Compute the number of error bits to discard*/
error_bits = msb - PLAT_BITS;
/* Compute the number of buckets before the group */
base = (error_bits + 1) << PLAT_BITS;
/*
* Discard the error bits and apply the mask to find the
* index for the buckets in the group
*/
offset = (PLAT_VAL - 1) & (val >> error_bits);
/* Make sure the index does not exceed (array size - 1) */
return (base + offset) < (PLAT_NR - 1) ?
(base + offset) : (PLAT_NR - 1);
}
/*
* Convert the given index of the bucket array to the value
* represented by the bucket
*/
static unsigned int plat_idx_to_val(unsigned int idx)
{
unsigned int error_bits, k, base;
if (idx >= PLAT_NR) {
fprintf(stderr, "idx %u is too large\n", idx);
exit(1);
}
/* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
* all bits of the sample as index */
if (idx < (PLAT_VAL << 1))
return idx;
/* Find the group and compute the minimum value of that group */
error_bits = (idx >> PLAT_BITS) - 1;
base = 1 << (error_bits + PLAT_BITS);
/* Find its bucket number of the group */
k = idx % PLAT_VAL;
/* Return the mean of the range of the bucket */
return base + ((k + 0.5) * (1 << error_bits));
}
static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr,
unsigned int **output)
{
unsigned long sum = 0;
unsigned int len, i, j = 0;
unsigned int oval_len = 0;
unsigned int *ovals = NULL;
int is_last;
len = 0;
while (len < PLAT_LIST_MAX && plist[len] != 0.0)
len++;
if (!len)
return 0;
/*
* Calculate bucket values, note down max and min values
*/
is_last = 0;
for (i = 0; i < PLAT_NR && !is_last; i++) {
sum += io_u_plat[i];
while (sum >= (plist[j] / 100.0 * nr)) {
if (j == oval_len) {
oval_len += 100;
ovals = realloc(ovals, oval_len * sizeof(unsigned int));
}
ovals[j] = plat_idx_to_val(i);
is_last = (j == len - 1);
if (is_last)
break;
j++;
}
}
*output = ovals;
return len;
}
static int calc_p99(struct stats *s)
{
unsigned int *ovals = NULL;
int ret = 0;
int len;
len = calc_percentiles(s->plat, s->nr_samples, &ovals);
if (len && len > PLIST_P99)
ret = ovals[PLIST_P99];
if (ovals)
free(ovals);
return ret;
}
static void show_latencies(struct stats *s)
{
unsigned int *ovals = NULL;
unsigned int len, i;
len = calc_percentiles(s->plat, s->nr_samples, &ovals);
if (len) {
fprintf(stderr, "Latency percentiles (usec)\n");
for (i = 0; i < len; i++)
fprintf(stderr, "\t%s%2.4fth: %u\n",
i == PLIST_P99 ? "*" : "",
plist[i], ovals[i]);
}
if (ovals)
free(ovals);
fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max);
}
/* fold latency info from s into d */
void combine_stats(struct stats *d, struct stats *s)
{
int i;
for (i = 0; i < PLAT_NR; i++)
d->plat[i] += s->plat[i];
d->nr_samples += s->nr_samples;
d->over += s->over;
if (s->max > d->max)
d->max = s->max;
if (s->min < d->min)
d->min = s->min;
}
/* record a latency result into the histogram */
static void add_lat(struct stats *s, unsigned int us)
{
int lat_index = 0;
if (us > s->max)
s->max = us;
if (us < s->min)
s->min = us;
if (us > max_us) {
fprintf(stderr, "latency=%u usec\n", us);
s->over++;
}
lat_index = plat_val_to_idx(us);
__sync_fetch_and_add(&s->plat[lat_index], 1);
__sync_fetch_and_add(&s->nr_samples, 1);
}
/*
* every thread has one of these, it comes out to about 19K thanks to the
* giant stats struct
*/
struct thread_data {
pthread_t tid;
/* ->next is for placing us on the msg_thread's list for waking */
struct thread_data *next;
/* our parent thread and messaging partner */
struct thread_data *msg_thread;
/*
* the msg thread stuffs gtod in here before waking us, so we can
* measure scheduler latency
*/
struct timeval wake_time;
/* keep the futex and the wake_time in the same cacheline */
int futex;
/* mr axboe's magic latency histogram */
struct stats stats;
};
/* we're so fancy we make our own futex wrappers */
#define FUTEX_BLOCKED 0
#define FUTEX_RUNNING 1
static int futex(int *uaddr, int futex_op, int val,
const struct timespec *timeout, int *uaddr2, int val3)
{
return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3);
}
/*
* wakeup a process waiting on a futex, making sure they are really waiting
* first
*/
static void fpost(int *futexp)
{
int s;
if (__sync_bool_compare_and_swap(futexp, FUTEX_BLOCKED,
FUTEX_RUNNING)) {
s = futex(futexp, FUTEX_WAKE, 1, NULL, NULL, 0);
if (s == -1) {
perror("FUTEX_WAKE");
exit(1);
}
}
}
/*
* wait on a futex, with an optional timeout. Make sure to set
* the futex to FUTEX_BLOCKED beforehand.
*
* This will return zero if all went well, or return -ETIMEDOUT if you
* hit the timeout without getting posted
*/
static int fwait(int *futexp, struct timespec *timeout)
{
int s;
while (1) {
/* Is the futex available? */
if (__sync_bool_compare_and_swap(futexp, FUTEX_RUNNING,
FUTEX_BLOCKED)) {
break; /* Yes */
}
/* Futex is not available; wait */
s = futex(futexp, FUTEX_WAIT, FUTEX_BLOCKED, timeout, NULL, 0);
if (s == -1 && errno != EAGAIN) {
if (errno == ETIMEDOUT)
return -ETIMEDOUT;
perror("futex-FUTEX_WAIT");
exit(1);
}
}
return 0;
}
/*
* cmpxchg based list prepend
*/
static void xlist_add(struct thread_data *head, struct thread_data *add)
{
struct thread_data *old;
struct thread_data *ret;
while (1) {
old = head->next;
add->next = old;
ret = __sync_val_compare_and_swap(&head->next, old, add);
if (ret == old)
break;
}
}
/*
* xchg based list splicing. This returns the entire list and
* replaces the head->next with NULL
*/
static struct thread_data *xlist_splice(struct thread_data *head)
{
struct thread_data *old;
struct thread_data *ret;
while (1) {
old = head->next;
ret = __sync_val_compare_and_swap(&head->next, old, NULL);
if (ret == old)
break;
}
return ret;
}
/*
* Wake everyone currently waiting on the message list, filling in their
* thread_data->wake_time with the current time.
*
* It's not exactly the current time, it's really the time at the start of
* the list run. We want to detect when the scheduler is just preempting the
* waker and giving away the rest of its timeslice. So we gtod once at
* the start of the loop and use that for all the threads we wake.
*/
static void xlist_wake_all(struct thread_data *td)
{
struct thread_data *list;
struct thread_data *next;
struct timeval now;
list = xlist_splice(td);
gettimeofday(&now, NULL);
while (list) {
next = list->next;
list->next = NULL;
memcpy(&list->wake_time, &now, sizeof(now));
fpost(&list->futex);
list = next;
}
}
/*
* called by worker threads to send a message and wait for the answer.
* In reality we're just trading one cacheline with the gtod and futex in
* it, but that's good enough. We gtod after waking and use that to
* record scheduler latency.
*/
static void msg_and_wait(struct thread_data *td)
{
struct timeval now;
unsigned long long delta;
struct timespec timeout;
timeout.tv_sec = 0;
timeout.tv_nsec = 5000 * 1000;
/* set ourselves to blocked */
td->futex = FUTEX_BLOCKED;
gettimeofday(&td->wake_time, NULL);
/* add us to the list */
xlist_add(td->msg_thread, td);
fpost(&td->msg_thread->futex);
/*
* don't wait if the main threads are shutting down,
* they will never kick us fpost has a full barrier, so as long
* as the message thread walks his list after setting stopping,
* we shouldn't miss the wakeup
*/
if (!stopping) {
/* if he hasn't already woken us up, wait */
fwait(&td->futex, NULL);
}
gettimeofday(&now, NULL);
delta = tvdelta(&td->wake_time, &now);
if (delta > 0)
add_lat(&td->stats, delta);
}
/*
* once the message thread starts all his children, this is where he
* loops until our runtime is up. Basically this sits around waiting
* for posting by the worker threads, replying to their messages after
* a delay of 'sleeptime' + some jitter.
*/
static void run_msg_thread(struct thread_data *td)
{
struct timeval now;
struct timespec timeout;
unsigned int seed = pthread_self();
int max_jitter = sleeptime / 4;
int jitter;
jitter = rand_r(&seed) % max_jitter;
timeout.tv_sec = 0;
timeout.tv_nsec = (sleeptime + jitter) * 1000;
while (1) {
td->futex = FUTEX_BLOCKED;
xlist_wake_all(td);
gettimeofday(&now, NULL);
if (now.tv_sec > global_stop.tv_sec) {
stopping = 1;
__sync_synchronize();
xlist_wake_all(td);
break;
}
fwait(&td->futex, &timeout);
/*
* messages shouldn't be instant, sleep a little to make them
* wait
*/
jitter = rand_r(&seed) % max_jitter;
usleep(sleeptime + jitter);
}
}
#define nop __asm__ __volatile__("rep;nop": : :"memory")
static void usec_spin(unsigned long spin_time)
{
struct timeval now;
struct timeval start;
unsigned long long delta;
gettimeofday(&start, NULL);
while (1) {
gettimeofday(&now, NULL);
delta = tvdelta(&start, &now);
if (delta > spin_time)
return;
nop;
}
}
/*
* the worker thread is pretty simple, it just does a single spin and
* then waits on a message from the message thread
*/
void *worker_thread(void *arg)
{
struct thread_data *td = arg;
while(1) {
if (stopping)
break;
usec_spin(cputime);
msg_and_wait(td);
}
return NULL;
}
/*
* the message thread starts his own gaggle of workers and then sits around
* replying when they post him. He collects latency stats as all the threads
* exit
*/
void *message_thread(void *arg)
{
struct thread_data *td = arg;
struct thread_data *worker_threads_mem = NULL;
int i;
int ret;
worker_threads_mem = calloc(worker_threads, sizeof(struct thread_data));
if (!worker_threads_mem) {
perror("unable to allocate ram");
pthread_exit((void *)-ENOMEM);
}
for (i = 0; i < worker_threads; i++) {
pthread_t tid;
worker_threads_mem[i].msg_thread = td;
ret = pthread_create(&tid, NULL, worker_thread,
worker_threads_mem + i);
if (ret) {
fprintf(stderr, "error %d from pthread_create\n", ret);
exit(1);
}
worker_threads_mem[i].tid = tid;
}
run_msg_thread(td);
for (i = 0; i < worker_threads; i++) {
pthread_join(worker_threads_mem[i].tid, NULL);
combine_stats(&td->stats, &worker_threads_mem[i].stats);
}
free(worker_threads_mem);
return NULL;
}
int main(int ac, char **av)
{
int i;
int ret;
struct thread_data *message_threads_mem = NULL;
struct stats stats;
parse_options(ac, av);
again:
stopping = 0;
memset(&stats, 0, sizeof(stats));
message_threads_mem = calloc(message_threads,
sizeof(struct thread_data));
if (!message_threads_mem) {
perror("unable to allocate ram");
exit(1);
}
gettimeofday(&global_stop, NULL);
global_stop.tv_sec += runtime;
/* start our message threads, each one starts its own workers */
for (i = 0; i < message_threads; i++) {
pthread_t tid;
ret = pthread_create(&tid, NULL, message_thread,
message_threads_mem + i);
if (ret) {
fprintf(stderr, "error %d from pthread_create\n", ret);
exit(1);
}
message_threads_mem[i].tid = tid;
}
for (i = 0; i < message_threads; i++) {
pthread_join(message_threads_mem[i].tid, NULL);
combine_stats(&stats, &message_threads_mem[i].stats);
}
free(message_threads_mem);
/*
* in auto bench mode, keep adding workers until our latencies get
* horrible
*/
if (autobench) {
int p99 = calc_p99(&stats);
fprintf(stderr, "cputime %Lu threads %d p99 %d\n",
cputime, worker_threads, p99);
if (p99 < 2000) {
worker_threads++;
goto again;
}
}
show_latencies(&stats);
return 0;
}