| /* |
| * schbench.c |
| * |
| * Copyright (C) 2016 Facebook |
| * Chris Mason <clm@fb.com> |
| * |
| * GPLv2, portions copied from the kernel and from Jens Axboe's fio |
| * |
| * gcc -Wall -O0 -W schbench.c -o schbench -lpthread |
| */ |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <pthread.h> |
| #include <fcntl.h> |
| #include <unistd.h> |
| #include <errno.h> |
| #include <getopt.h> |
| #include <sys/time.h> |
| #include <time.h> |
| #include <string.h> |
| #include <linux/futex.h> |
| #include <sys/syscall.h> |
| |
/*
 * latency histogram layout: PLAT_GROUP_NR groups of PLAT_VAL buckets each.
 * Precision is halved for every additional power of two past PLAT_BITS
 * bits of the sample value (same scheme as fio's histogram).
 */
#define PLAT_BITS 8
#define PLAT_VAL (1 << PLAT_BITS)
#define PLAT_GROUP_NR 19
#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
/* maximum number of percentile entries the plist table can hold */
#define PLAT_LIST_MAX 20
| |
/* -m number of message threads */
static int message_threads = 2;
/* -t number of workers per message thread */
static int worker_threads = 16;
/* -r runtime in seconds */
static int runtime = 30;
/* -s message thread reply delay, in usec */
static int sleeptime = 30000;
/* -c time each worker spins per loop, in usec */
static unsigned long long cputime = 30000;
/* -a, bool: keep adding workers until p99 latency gets bad */
static int autobench = 0;

/* the latency histogram uses this to pitch outliers (usec) */
static unsigned int max_us = 50000;

/* main() sets this to the time when we should all stop doing work */
static struct timeval global_stop;

/* the message threads flip this to true when they decide runtime is up */
static unsigned long stopping = 0;
| |
| |
| /* |
| * one stat struct per thread data, when the workers sleep this records the |
| * latency between when they are woken up and when they actually get the |
| * CPU again. The message threads sum up the stats of all the workers and |
| * then bubble them up to main() for printing |
| */ |
| struct stats { |
| unsigned int plat[PLAT_NR]; |
| unsigned int nr_samples; |
| unsigned int max; |
| unsigned int min; |
| unsigned int over; |
| }; |
| |
/* this defines which latency percentiles get printed; PLIST_P99 is the
 * index of the 99.0 entry below, used by autobench and the '*' marker */
#define PLIST_P99 4
static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9 };

/* values returned by getopt_long for long-only options */
enum {
	HELP_LONG_OPT = 1,
};
| |
/* each letter here must have a matching case in parse_options() */
char *option_string = "am:t:s:c:r:";
static struct option long_options[] = {
	{"auto", no_argument, 0, 'a'},
	{"message-threads", required_argument, 0, 'm'},
	{"threads", required_argument, 0, 't'},
	{"runtime", required_argument, 0, 'r'},
	{"sleeptime", required_argument, 0, 's'},
	{"cputime", required_argument, 0, 'c'},
	{"help", no_argument, 0, HELP_LONG_OPT},
	{0, 0, 0, 0}
};
| |
/*
 * print the option summary and exit. The old text advertised a -d
 * (--dispatch-threads) flag that doesn't exist (the real option is
 * -m/--message-threads), omitted -a, and printed the wrong defaults.
 */
static void print_usage(void)
{
	fprintf(stderr, "schbench usage:\n"
		"\t-a (--auto): grow worker threads until p99 latency is bad (def: off)\n"
		"\t-m (--message-threads): number of message threads (def: 2)\n"
		"\t-t (--threads): worker threads per message thread (def: 16)\n"
		"\t-r (--runtime): How long to run before exiting (seconds, def: 30)\n"
		"\t-s (--sleeptime): Message thread latency (usec, def: 30000)\n"
		"\t-c (--cputime): How long to think during loop (usec, def: 30000)\n"
		);
	exit(1);
}
| |
| static void parse_options(int ac, char **av) |
| { |
| int c; |
| |
| while (1) { |
| int option_index = 0; |
| |
| c = getopt_long(ac, av, option_string, |
| long_options, &option_index); |
| |
| if (c == -1) |
| break; |
| |
| switch(c) { |
| case 'a': |
| autobench = 1; |
| break; |
| case 's': |
| sleeptime = atoi(optarg); |
| break; |
| case 'c': |
| cputime = atoi(optarg); |
| break; |
| case 'd': |
| message_threads = atoi(optarg); |
| break; |
| case 't': |
| worker_threads = atoi(optarg); |
| break; |
| case 'r': |
| runtime = atoi(optarg); |
| break; |
| case '?': |
| case HELP_LONG_OPT: |
| print_usage(); |
| break; |
| default: |
| break; |
| } |
| } |
| |
| if (optind < ac) { |
| fprintf(stderr, "Error Extra arguments '%s'\n", av[optind]); |
| exit(1); |
| } |
| } |
| |
/*
 * tdiff = t1 - t0, with borrow from the seconds field when the usec
 * part goes negative. A result that would be negative overall (time
 * went backwards) is clamped to zero.
 */
void tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0)
{
	time_t sec = t1->tv_sec - t0->tv_sec;
	suseconds_t usec = t1->tv_usec - t0->tv_usec;

	if (usec < 0 && sec > 0) {
		/* borrow one second worth of usecs */
		sec--;
		usec += 1000000;
		if (usec < 0) {
			fprintf(stderr, "lat_fs: tvsub shows test time ran backwards!\n");
			exit(1);
		}
	}

	/* time shouldn't go backwards!!! */
	if (usec < 0 || t1->tv_sec < t0->tv_sec) {
		sec = 0;
		usec = 0;
	}

	tdiff->tv_sec = sec;
	tdiff->tv_usec = usec;
}
| |
| /* |
| * returns the difference between start and stop in usecs. Negative values |
| * are turned into 0 |
| */ |
| unsigned long long tvdelta(struct timeval *start, struct timeval *stop) |
| { |
| struct timeval td; |
| unsigned long long usecs; |
| |
| tvsub(&td, stop, start); |
| usecs = td.tv_sec; |
| usecs *= 1000000; |
| usecs += td.tv_usec; |
| return (usecs); |
| } |
| |
| /* mr axboe's magic latency histogram */ |
| static unsigned int plat_val_to_idx(unsigned int val) |
| { |
| unsigned int msb, error_bits, base, offset; |
| |
| /* Find MSB starting from bit 0 */ |
| if (val == 0) |
| msb = 0; |
| else |
| msb = sizeof(val)*8 - __builtin_clz(val) - 1; |
| |
| /* |
| * MSB <= (PLAT_BITS-1), cannot be rounded off. Use |
| * all bits of the sample as index |
| */ |
| if (msb <= PLAT_BITS) |
| return val; |
| |
| /* Compute the number of error bits to discard*/ |
| error_bits = msb - PLAT_BITS; |
| |
| /* Compute the number of buckets before the group */ |
| base = (error_bits + 1) << PLAT_BITS; |
| |
| /* |
| * Discard the error bits and apply the mask to find the |
| * index for the buckets in the group |
| */ |
| offset = (PLAT_VAL - 1) & (val >> error_bits); |
| |
| /* Make sure the index does not exceed (array size - 1) */ |
| return (base + offset) < (PLAT_NR - 1) ? |
| (base + offset) : (PLAT_NR - 1); |
| } |
| |
| /* |
| * Convert the given index of the bucket array to the value |
| * represented by the bucket |
| */ |
| static unsigned int plat_idx_to_val(unsigned int idx) |
| { |
| unsigned int error_bits, k, base; |
| |
| if (idx >= PLAT_NR) { |
| fprintf(stderr, "idx %u is too large\n", idx); |
| exit(1); |
| } |
| |
| /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use |
| * all bits of the sample as index */ |
| if (idx < (PLAT_VAL << 1)) |
| return idx; |
| |
| /* Find the group and compute the minimum value of that group */ |
| error_bits = (idx >> PLAT_BITS) - 1; |
| base = 1 << (error_bits + PLAT_BITS); |
| |
| /* Find its bucket number of the group */ |
| k = idx % PLAT_VAL; |
| |
| /* Return the mean of the range of the bucket */ |
| return base + ((k + 0.5) * (1 << error_bits)); |
| } |
| |
| |
| static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr, |
| unsigned int **output) |
| { |
| unsigned long sum = 0; |
| unsigned int len, i, j = 0; |
| unsigned int oval_len = 0; |
| unsigned int *ovals = NULL; |
| int is_last; |
| |
| len = 0; |
| while (len < PLAT_LIST_MAX && plist[len] != 0.0) |
| len++; |
| |
| if (!len) |
| return 0; |
| |
| /* |
| * Calculate bucket values, note down max and min values |
| */ |
| is_last = 0; |
| for (i = 0; i < PLAT_NR && !is_last; i++) { |
| sum += io_u_plat[i]; |
| while (sum >= (plist[j] / 100.0 * nr)) { |
| if (j == oval_len) { |
| oval_len += 100; |
| ovals = realloc(ovals, oval_len * sizeof(unsigned int)); |
| } |
| |
| ovals[j] = plat_idx_to_val(i); |
| is_last = (j == len - 1); |
| if (is_last) |
| break; |
| |
| j++; |
| } |
| } |
| |
| *output = ovals; |
| return len; |
| } |
| |
| static int calc_p99(struct stats *s) |
| { |
| unsigned int *ovals = NULL; |
| int ret = 0; |
| int len; |
| |
| len = calc_percentiles(s->plat, s->nr_samples, &ovals); |
| if (len && len > PLIST_P99) |
| ret = ovals[PLIST_P99]; |
| if (ovals) |
| free(ovals); |
| return ret; |
| } |
| |
| static void show_latencies(struct stats *s) |
| { |
| unsigned int *ovals = NULL; |
| unsigned int len, i; |
| |
| len = calc_percentiles(s->plat, s->nr_samples, &ovals); |
| if (len) { |
| fprintf(stderr, "Latency percentiles (usec)\n"); |
| for (i = 0; i < len; i++) |
| fprintf(stderr, "\t%s%2.4fth: %u\n", |
| i == PLIST_P99 ? "*" : "", |
| plist[i], ovals[i]); |
| } |
| |
| if (ovals) |
| free(ovals); |
| |
| fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max); |
| } |
| |
| /* fold latency info from s into d */ |
| void combine_stats(struct stats *d, struct stats *s) |
| { |
| int i; |
| for (i = 0; i < PLAT_NR; i++) |
| d->plat[i] += s->plat[i]; |
| d->nr_samples += s->nr_samples; |
| d->over += s->over; |
| if (s->max > d->max) |
| d->max = s->max; |
| if (s->min < d->min) |
| d->min = s->min; |
| } |
| |
| /* record a latency result into the histogram */ |
| static void add_lat(struct stats *s, unsigned int us) |
| { |
| int lat_index = 0; |
| |
| if (us > s->max) |
| s->max = us; |
| if (us < s->min) |
| s->min = us; |
| |
| if (us > max_us) { |
| fprintf(stderr, "latency=%u usec\n", us); |
| s->over++; |
| } |
| |
| lat_index = plat_val_to_idx(us); |
| __sync_fetch_and_add(&s->plat[lat_index], 1); |
| __sync_fetch_and_add(&s->nr_samples, 1); |
| } |
| |
| /* |
| * every thread has one of these, it comes out to about 19K thanks to the |
| * giant stats struct |
| */ |
| struct thread_data { |
| pthread_t tid; |
| /* ->next is for placing us on the msg_thread's list for waking */ |
| struct thread_data *next; |
| |
| /* our parent thread and messaging partner */ |
| struct thread_data *msg_thread; |
| |
| /* |
| * the msg thread stuffs gtod in here before waking us, so we can |
| * measure scheduler latency |
| */ |
| struct timeval wake_time; |
| |
| /* keep the futex and the wake_time in the same cacheline */ |
| int futex; |
| |
| /* mr axboe's magic latency histogram */ |
| struct stats stats; |
| }; |
| |
/* we're so fancy we make our own futex wrappers */
#define FUTEX_BLOCKED 0
#define FUTEX_RUNNING 1

/* thin wrapper for the raw futex syscall (glibc provides no helper) */
static int futex(int *uaddr, int futex_op, int val,
		 const struct timespec *timeout, int *uaddr2, int val3)
{
	return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3);
}
| |
| /* |
| * wakeup a process waiting on a futex, making sure they are really waiting |
| * first |
| */ |
| static void fpost(int *futexp) |
| { |
| int s; |
| |
| if (__sync_bool_compare_and_swap(futexp, FUTEX_BLOCKED, |
| FUTEX_RUNNING)) { |
| s = futex(futexp, FUTEX_WAKE, 1, NULL, NULL, 0); |
| if (s == -1) { |
| perror("FUTEX_WAKE"); |
| exit(1); |
| } |
| } |
| } |
| |
| /* |
| * wait on a futex, with an optional timeout. Make sure to set |
| * the futex to FUTEX_BLOCKED beforehand. |
| * |
| * This will return zero if all went well, or return -ETIMEDOUT if you |
| * hit the timeout without getting posted |
| */ |
| static int fwait(int *futexp, struct timespec *timeout) |
| { |
| int s; |
| while (1) { |
| /* Is the futex available? */ |
| if (__sync_bool_compare_and_swap(futexp, FUTEX_RUNNING, |
| FUTEX_BLOCKED)) { |
| break; /* Yes */ |
| } |
| /* Futex is not available; wait */ |
| s = futex(futexp, FUTEX_WAIT, FUTEX_BLOCKED, timeout, NULL, 0); |
| if (s == -1 && errno != EAGAIN) { |
| if (errno == ETIMEDOUT) |
| return -ETIMEDOUT; |
| perror("futex-FUTEX_WAIT"); |
| exit(1); |
| } |
| } |
| return 0; |
| } |
| |
| /* |
| * cmpxchg based list prepend |
| */ |
| static void xlist_add(struct thread_data *head, struct thread_data *add) |
| { |
| struct thread_data *old; |
| struct thread_data *ret; |
| |
| while (1) { |
| old = head->next; |
| add->next = old; |
| ret = __sync_val_compare_and_swap(&head->next, old, add); |
| if (ret == old) |
| break; |
| } |
| } |
| |
| /* |
| * xchg based list splicing. This returns the entire list and |
| * replaces the head->next with NULL |
| */ |
| static struct thread_data *xlist_splice(struct thread_data *head) |
| { |
| struct thread_data *old; |
| struct thread_data *ret; |
| |
| while (1) { |
| old = head->next; |
| ret = __sync_val_compare_and_swap(&head->next, old, NULL); |
| if (ret == old) |
| break; |
| } |
| return ret; |
| } |
| |
| /* |
| * Wake everyone currently waiting on the message list, filling in their |
| * thread_data->wake_time with the current time. |
| * |
| * It's not exactly the current time, it's really the time at the start of |
| * the list run. We want to detect when the scheduler is just preempting the |
| * waker and giving away the rest of its timeslice. So we gtod once at |
| * the start of the loop and use that for all the threads we wake. |
| */ |
| static void xlist_wake_all(struct thread_data *td) |
| { |
| struct thread_data *list; |
| struct thread_data *next; |
| struct timeval now; |
| |
| list = xlist_splice(td); |
| gettimeofday(&now, NULL); |
| while (list) { |
| next = list->next; |
| list->next = NULL; |
| memcpy(&list->wake_time, &now, sizeof(now)); |
| fpost(&list->futex); |
| list = next; |
| } |
| } |
| |
| /* |
| * called by worker threads to send a message and wait for the answer. |
| * In reality we're just trading one cacheline with the gtod and futex in |
| * it, but that's good enough. We gtod after waking and use that to |
| * record scheduler latency. |
| */ |
| static void msg_and_wait(struct thread_data *td) |
| { |
| struct timeval now; |
| unsigned long long delta; |
| struct timespec timeout; |
| |
| timeout.tv_sec = 0; |
| timeout.tv_nsec = 5000 * 1000; |
| |
| /* set ourselves to blocked */ |
| td->futex = FUTEX_BLOCKED; |
| gettimeofday(&td->wake_time, NULL); |
| |
| /* add us to the list */ |
| xlist_add(td->msg_thread, td); |
| |
| fpost(&td->msg_thread->futex); |
| |
| /* |
| * don't wait if the main threads are shutting down, |
| * they will never kick us fpost has a full barrier, so as long |
| * as the message thread walks his list after setting stopping, |
| * we shouldn't miss the wakeup |
| */ |
| if (!stopping) { |
| /* if he hasn't already woken us up, wait */ |
| fwait(&td->futex, NULL); |
| } |
| |
| gettimeofday(&now, NULL); |
| delta = tvdelta(&td->wake_time, &now); |
| if (delta > 0) |
| add_lat(&td->stats, delta); |
| } |
| |
| /* |
| * once the message thread starts all his children, this is where he |
| * loops until our runtime is up. Basically this sits around waiting |
| * for posting by the worker threads, replying to their messages after |
| * a delay of 'sleeptime' + some jitter. |
| */ |
| static void run_msg_thread(struct thread_data *td) |
| { |
| struct timeval now; |
| struct timespec timeout; |
| unsigned int seed = pthread_self(); |
| int max_jitter = sleeptime / 4; |
| int jitter; |
| |
| jitter = rand_r(&seed) % max_jitter; |
| timeout.tv_sec = 0; |
| timeout.tv_nsec = (sleeptime + jitter) * 1000; |
| |
| while (1) { |
| td->futex = FUTEX_BLOCKED; |
| xlist_wake_all(td); |
| |
| gettimeofday(&now, NULL); |
| if (now.tv_sec > global_stop.tv_sec) { |
| stopping = 1; |
| __sync_synchronize(); |
| xlist_wake_all(td); |
| break; |
| } |
| fwait(&td->futex, &timeout); |
| |
| /* |
| * messages shouldn't be instant, sleep a little to make them |
| * wait |
| */ |
| jitter = rand_r(&seed) % max_jitter; |
| usleep(sleeptime + jitter); |
| } |
| } |
| |
| #define nop __asm__ __volatile__("rep;nop": : :"memory") |
| |
| static void usec_spin(unsigned long spin_time) |
| { |
| struct timeval now; |
| struct timeval start; |
| unsigned long long delta; |
| |
| gettimeofday(&start, NULL); |
| while (1) { |
| gettimeofday(&now, NULL); |
| delta = tvdelta(&start, &now); |
| if (delta > spin_time) |
| return; |
| nop; |
| } |
| } |
| |
| /* |
| * the worker thread is pretty simple, it just does a single spin and |
| * then waits on a message from the message thread |
| */ |
| void *worker_thread(void *arg) |
| { |
| struct thread_data *td = arg; |
| |
| while(1) { |
| if (stopping) |
| break; |
| |
| usec_spin(cputime); |
| msg_and_wait(td); |
| } |
| return NULL; |
| } |
| |
| /* |
| * the message thread starts his own gaggle of workers and then sits around |
| * replying when they post him. He collects latency stats as all the threads |
| * exit |
| */ |
| void *message_thread(void *arg) |
| { |
| struct thread_data *td = arg; |
| struct thread_data *worker_threads_mem = NULL; |
| int i; |
| int ret; |
| |
| worker_threads_mem = calloc(worker_threads, sizeof(struct thread_data)); |
| |
| if (!worker_threads_mem) { |
| perror("unable to allocate ram"); |
| pthread_exit((void *)-ENOMEM); |
| } |
| |
| for (i = 0; i < worker_threads; i++) { |
| pthread_t tid; |
| |
| worker_threads_mem[i].msg_thread = td; |
| ret = pthread_create(&tid, NULL, worker_thread, |
| worker_threads_mem + i); |
| if (ret) { |
| fprintf(stderr, "error %d from pthread_create\n", ret); |
| exit(1); |
| } |
| worker_threads_mem[i].tid = tid; |
| } |
| |
| run_msg_thread(td); |
| |
| for (i = 0; i < worker_threads; i++) { |
| pthread_join(worker_threads_mem[i].tid, NULL); |
| combine_stats(&td->stats, &worker_threads_mem[i].stats); |
| } |
| free(worker_threads_mem); |
| |
| return NULL; |
| } |
| |
| int main(int ac, char **av) |
| { |
| int i; |
| int ret; |
| struct thread_data *message_threads_mem = NULL; |
| struct stats stats; |
| |
| parse_options(ac, av); |
| again: |
| stopping = 0; |
| memset(&stats, 0, sizeof(stats)); |
| |
| message_threads_mem = calloc(message_threads, |
| sizeof(struct thread_data)); |
| |
| |
| if (!message_threads_mem) { |
| perror("unable to allocate ram"); |
| exit(1); |
| } |
| gettimeofday(&global_stop, NULL); |
| global_stop.tv_sec += runtime; |
| |
| /* start our message threads, each one starts its own workers */ |
| for (i = 0; i < message_threads; i++) { |
| pthread_t tid; |
| ret = pthread_create(&tid, NULL, message_thread, |
| message_threads_mem + i); |
| if (ret) { |
| fprintf(stderr, "error %d from pthread_create\n", ret); |
| exit(1); |
| } |
| message_threads_mem[i].tid = tid; |
| } |
| for (i = 0; i < message_threads; i++) { |
| pthread_join(message_threads_mem[i].tid, NULL); |
| combine_stats(&stats, &message_threads_mem[i].stats); |
| } |
| |
| free(message_threads_mem); |
| |
| /* |
| * in auto bench mode, keep adding workers until our latencies get |
| * horrible |
| */ |
| if (autobench) { |
| int p99 = calc_p99(&stats); |
| fprintf(stderr, "cputime %Lu threads %d p99 %d\n", |
| cputime, worker_threads, p99); |
| if (p99 < 2000) { |
| worker_threads++; |
| goto again; |
| } |
| } |
| |
| show_latencies(&stats); |
| |
| return 0; |
| } |