| /* |
| * shepherd.c |
| * Scheduler Helper Daemon |
| * |
| * see shepherd.1 for details |
| * |
| * This program is free software, released under GPLv2 |
| * Copyright (C) 2016 Len Brown, Intel Corp <len.brown@intel.com> |
| */ |
| |
| #define _GNU_SOURCE /* See feature_test_macros(7) */ |
| #include <sched.h> |
| |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <getopt.h> |
| #include <errno.h> |
| #include <signal.h> |
| #include <sched.h> |
| #include <stddef.h> |
| #include <string.h> |
| #include <sys/prctl.h> |
| #include <time.h> |
| #include <err.h> |
| #include <sys/wait.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <sys/mount.h> |
| #include <syslog.h> |
| |
| #include "cpuset.h" |
| |
| #include "cpuset.c" |
| |
| #define UNUSED(x) unsigned int UNUSED_ ## x __attribute__((__unused__)) |
| |
| unsigned int debug; |
| |
| unsigned int max_cpu_num; |
| unsigned int present_num_cpus; |
| |
| cpu_set_t *cpuset_present; |
| cpu_set_t *cpuset_active; |
| size_t system_cpuset_size; |
| |
| char *proc_stat = "/proc/stat"; |
| char *proc_loadavg = "/proc/loadavg"; |
| |
| long user_tick_per_sec; |
| double poll_slow_sec; |
| double poll_norm_sec; |
| double poll_fast_sec; |
| |
| int exit_main_loop; |
| FILE *dfp; |
| char *output_path; |
| |
| /* |
| * cpuset_size_floor |
| * the CPU set will not shrink smaller than cpuset_size_floor |
| */ |
| int cpuset_size_floor = 1; |
| |
/*
 * cpuset_size_ceiling
 * the CPU set will grow no larger than cpuset_size_ceiling
 */
| int cpuset_size_ceiling; |
| |
| #define HISTORY_LENGTH 10 |
| |
| struct cpuset_history { |
| double seconds; |
| unsigned cpuset_num_cpus; |
| unsigned long long system_idle_tick_sum; |
| } history[HISTORY_LENGTH]; |
| |
| unsigned int *set_size_choosen; |
| double *set_size_seconds; |
| |
| static void debug_dump_history(void) |
| { |
| int i; |
| |
| for (i = 0; i < HISTORY_LENGTH; ++i) |
| { |
| fprintf(dfp, "history[%d] %d cpus %f sec %lld sum\n", |
| i, |
| history[i].cpuset_num_cpus, |
| history[i].seconds, |
| history[i].system_idle_tick_sum); |
| } |
| } |
| #ifdef NOTYET |
| /* |
| * up_slope_max -- how fast a set can grow per interval |
| * 1.00 means it can increase by up to 100% of its current size |
| * 0.50 means it can increase by up to 50% of its current size |
| * etc. |
| */ |
| double up_slope_max = 0.50; |
| #endif |
| |
| #ifdef NOTYET |
| /* |
| * up_cpus_max |
| * maximum number of CPUs to increase set per interval |
| * default = cpuset_size_ceiling/4; |
| */ |
| int up_cpus_max; |
| #endif |
| |
| #ifdef NOTYET |
| /* |
| * down_rate |
| * 0.50 means shrink at 50% of the way to ideal_set size on each interval |
| */ |
| double down_rate = 0.5; |
| #endif |
| |
| |
| /* |
| * get_next_polling_interval_sec() |
| * |
| * Policy decision: |
| * return ns for next polling interval |
| */ |
| static double get_next_polling_interval_sec(double average_utilization, int set_size) |
| { |
| |
| /* |
| * If max set size, then poll normally, |
| * no matter the utilization. We can't grow more, |
| * and we are not in a big rush to shrink. |
| */ |
| if (set_size == cpuset_size_ceiling) |
| return poll_norm_sec; |
| |
| /* |
| * If min set and very low utilization, poll slowly, |
| * to minimally disturb profoundly idle system. |
| */ |
| if ((set_size == cpuset_size_floor) && (average_utilization < 5)) |
| return poll_slow_sec; |
| |
| /* |
| * If high utilzation, poll quickly |
| * to promptly expand capacity with demand |
| */ |
| if (average_utilization > 80) |
| return poll_fast_sec; |
| |
| /* |
| * poll at moderate rate |
| */ |
| return poll_norm_sec; |
| } |
| |
#ifdef NOTYET
/*
 * Timer slack gives the kernel permission to coalesce our timeout
 * with nearby system timers. System default is +/- 0.050 ms,
 * which is much tighter than we need. Increase timer slack
 * to 1/16th of the requested timeout duration, eg.
 *
 * 10 ms / 16 = +/- 0.625 ms
 * 100 ms / 16 = +/- 6.25 ms
 * 1,000 ms / 16 = +/- 62.5 ms
 * 2,000 ms / 16 = +/- 125.0 ms
 */

unsigned int debug_timer_slack_updates;

unsigned long timer_slack_current_ns;

/*
 * timer_slack_check(sleep_sec)
 * set process timer slack to 1/16 of the upcoming sleep duration,
 * skipping the prctl() when the current value is already correct
 */
static void timer_slack_check(double sleep_sec)
{
	unsigned long timer_slack_desired_ns;

	/*
	 * Divide and scale in floating point BEFORE converting to
	 * integer nanoseconds. The previous code cast sleep_sec to an
	 * integer first, truncating any sleep shorter than 16 seconds
	 * to a desired slack of 0.
	 */
	timer_slack_desired_ns = (unsigned long)(sleep_sec / 16.0 * 1000000000.0);

	if (timer_slack_desired_ns == timer_slack_current_ns)
		return;

	debug_timer_slack_updates++;

	if (prctl(PR_SET_TIMERSLACK, timer_slack_desired_ns) < 0)
		err(-1, "PR_SET_TIMERSLACK");

	timer_slack_current_ns = timer_slack_desired_ns;
}
#endif
| |
| static double timestamp_sec(void) |
| { |
| struct timespec ts; |
| |
| if (clock_gettime(CLOCK_REALTIME, &ts) < 0) |
| err(-1, "clock_gettime"); |
| |
| return ((double)ts.tv_sec + (double)ts.tv_nsec/1000000000); |
| } |
| |
| struct timespec ts_begin, ts_end; |
| |
#define DEBUG_SLEEP_ARRAY_SIZE 12

unsigned int sleep_request_histogram[DEBUG_SLEEP_ARRAY_SIZE];
unsigned int sleep_actual_histogram[DEBUG_SLEEP_ARRAY_SIZE];

unsigned int debug_sleep_count;
double debug_sleep_stats_start_time;

/*
 * debug_sleep_histogram_record(a, interval)
 * bucket 'interval' (seconds) into the power-of-2 histogram 'a':
 * a[0] counts intervals >= 2 sec, a[1] >= 1 sec, a[2] >= 0.5 sec, ...
 */
static void debug_sleep_histogram_record(unsigned int *a, double interval)
{
	int i;

	for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
		if (interval >= 2.0) {
			a[i]++;
			return;
		}
		interval *= 2;
	}
	/*
	 * Interval is smaller than the finest bucket (< 2/2^11 sec).
	 * Count it in the last bucket rather than silently dropping the
	 * sample, so histogram totals match the number of events recorded.
	 */
	a[DEBUG_SLEEP_ARRAY_SIZE - 1]++;
}
| static void debug_sleep_histograms_clear(void) |
| { |
| int i; |
| |
| for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) { |
| sleep_request_histogram[i] = 0; |
| sleep_actual_histogram[i] = 0; |
| } |
| |
| debug_sleep_stats_start_time = timestamp_sec(); |
| } |
| static void debug_sleep_histograms_dump(void) |
| { |
| int i; |
| double seconds = 2.0; |
| unsigned int request_count, actual_count; |
| double now, interval; |
| |
| request_count = actual_count = 0; |
| |
| fprintf(dfp, "Sleep Wake Seconds\n"); |
| for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) { |
| request_count += sleep_request_histogram[i]; |
| actual_count += sleep_actual_histogram[i]; |
| fprintf(dfp, "%5d %4d >= %f\n", |
| sleep_request_histogram[i], |
| sleep_actual_histogram[i], seconds); |
| seconds /= 2; |
| } |
| now = timestamp_sec(); |
| interval = now - debug_sleep_stats_start_time; |
| fprintf(dfp, "%5d %4d Total, %f per second over %.2f seconds\n", |
| request_count, actual_count, |
| (double)request_count/(interval), interval); |
| } |
| |
| static void stats_clear(void) |
| { |
| unsigned int cpus; |
| |
| for (cpus = 0; cpus <= present_num_cpus; ++cpus) { |
| set_size_choosen[cpus] = 0; |
| set_size_seconds[cpus] = 0.0; |
| } |
| debug_sleep_histograms_clear(); |
| } |
| static void stats_init(void) |
| { |
| set_size_choosen = calloc(present_num_cpus + 1, sizeof(unsigned int)); |
| if (set_size_choosen == NULL) |
| err(1, "calloc(%d, %zd)", present_num_cpus + 1, sizeof(unsigned int)); |
| |
| set_size_seconds = calloc(present_num_cpus + 1, sizeof(double)); |
| if (set_size_seconds == NULL) |
| err(1, "calloc(%d, %zd)", present_num_cpus + 1, sizeof(double)); |
| |
| stats_clear(); |
| } |
| |
| static void stats_dump(void) |
| { |
| unsigned int cpus; |
| |
| fprintf(dfp, "CPUs Used Duration[sec]\n"); |
| |
| for (cpus = 1; cpus <= present_num_cpus; ++cpus) { |
| fprintf(dfp, "%4d %4d %8.2f\n", |
| cpus, set_size_choosen[cpus], set_size_seconds[cpus]); |
| } |
| } |
| |
/*
 * dump_statistics()
 * print set-size usage and sleep histograms, then reset all statistics
 */
static void dump_statistics(void)
{
	stats_dump();
	debug_sleep_histograms_dump();

#if NOTYET
	fprintf(dfp, "%u%% timer slack updates (%u)\n",
		100 * debug_timer_slack_updates/debug_sleep_count, debug_timer_slack_updates);

	debug_timer_slack_updates = 0;
	debug_sleep_count = 0;
#endif
	/*
	 * stats_clear() already clears the sleep histograms, so the
	 * former explicit debug_sleep_histograms_clear() call here
	 * was redundant and has been removed.
	 */
	stats_clear();
}
| |
/*
 * sleep_for_sec(seconds)
 *
 * Sleep for 'seconds' (fractional) seconds via nanosleep().
 * Returns early, without error, if interrupted by a signal.
 * (When NOTYET is enabled, first adjusts timer slack.)
 */

static void sleep_for_sec(double seconds)
{
	struct timespec ts;

	debug_sleep_count++;
	debug_sleep_histogram_record(sleep_request_histogram, seconds);

#if NOTYET
	timer_slack_check(seconds);
#endif

	/* whole seconds */
	ts.tv_sec = (time_t)seconds;

	if (debug > 1)
		fprintf(dfp, "input seconds %f -> tv seconds %lld\n",
			seconds, (long long int)ts.tv_sec);

	/* fractional remainder, converted to nanoseconds */
	seconds = seconds - ts.tv_sec;
	ts.tv_nsec = (long)(seconds * 1000000000.0);

	if (debug > 1)
		fprintf(dfp, "input seconds * 1B %f -> tv nsec %ld\n",
			seconds * 1000000000, ts.tv_nsec);

	if (nanosleep(&ts, NULL) < 0) {
		if (errno == EINTR) {
			/* early return if signaled */
			return;
		}
		err(-1, "nanosleep");
	}
}
/*
 * usage(out)
 * print the version banner and usage to 'out', then exit --
 * failure status when out is stderr, success otherwise
 */
static void __attribute__((__noreturn__)) usage(FILE *out)
{
	fprintf(out, "shepherd 17.05.21a (C) 2017 Len Brown <len.brown@intel.com>\n");
	fprintf(out, "Usage: shepherd [options]\n");
	fprintf(out, "'man shepherd' for details\n");

	if (out == stderr)
		exit(EXIT_FAILURE);
	exit(EXIT_SUCCESS);
}
| |
| static void cmdline(int argc, char **argv) |
| { |
| int c; |
| |
| static const struct option longopts[] = { |
| { "debug", 0, NULL, 'd' }, |
| { "help", 0, NULL, 'h' }, |
| { "output", 0, NULL, 'o' }, |
| }; |
| |
| while ((c = getopt_long_only(argc, argv, "+dho:", longopts, NULL)) != -1) { |
| switch (c) { |
| case 'd': |
| debug++; |
| break; |
| case 'h': |
| usage(stdout); |
| break; |
| case 'o': |
| output_path = optarg; |
| break; |
| default: |
| usage(stderr); |
| break; |
| } |
| } |
| } |
| |
| static void init_default_sample_rates(void) |
| { |
| |
| double user_tick_sec; |
| |
| user_tick_per_sec = sysconf(_SC_CLK_TCK); |
| |
| if (user_tick_per_sec != 100) |
| warnx("User ticks %ld, expected 100", user_tick_per_sec); |
| |
| user_tick_sec = 1.0 / user_tick_per_sec; |
| |
| /* |
| * Nominal wakeup frequency is once per second |
| * 1.0 sec/sample = 1 sample/sec |
| */ |
| poll_norm_sec = 1; |
| |
| /* |
| * Slow wakeup frequency is used during profound idle |
| * and at max utilization of full set size |
| * 2.0 sec/sample = 0.5 sample/sec |
| */ |
| poll_slow_sec = 2; |
| |
| /* |
| * Fast wakeup frequency is used at high utilization |
| * Sample at 10 * user user tick rate, which means |
| * we will see at most 10 counter ticks/cpu/sample. |
| * Since user_tick_sec = 10ms, |
| * sample rate = 10 * 10ms = 100ms = 10 samples/sec |
| */ |
| poll_fast_sec = user_tick_sec * 10; |
| |
| } |
| |
/*
 * defaults_init()
 * establish built-in defaults before command-line parsing
 */
static void defaults_init(void)
{
	init_default_sample_rates();

	// set_size_ceiling = TBD;

}
/*
 * fopen_or_die(path, mode)
 * fopen() wrapper that exits with a diagnostic on failure;
 * always returns a valid FILE pointer
 */
static FILE *fopen_or_die(const char *path, const char *mode)
{
	FILE *filep;

	filep = fopen(path, mode);
	if (filep == NULL)
		err(1, "%s: open failed", path);

	return filep;
}
/*
 * run func(cpu) on every cpu in /proc/stat
 * return sum of return values of func(cpu)
 * else, if func() returns negative value, return that.
 *
 * Skips the aggregate "cpu" summary line, then invokes
 * func(arg, cpu#, user, nice, system, idle) for each "cpuN" line,
 * accumulating the return values.
 *
 * NOTE(review): the return type is unsigned long long, so a negative
 * retval from func() would be returned converted to a huge unsigned
 * value; none of the current callbacks return negative, but a new
 * caller relying on the documented negative-return path would be
 * surprised.
 */
static unsigned long long sum_for_all_proc_cpus(int (func)(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int), unsigned int arg)
{
	FILE *fp;
	int cpu_num, user, nice, system, idle;
	int retval;
	unsigned long long sum = 0;

	fp = fopen_or_die(proc_stat, "r");

	/* consume the summary "cpu ..." line; this fscanf() returns 0 on success */
	retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
	if (retval != 0)
		err(1, "%s: failed to parse format", proc_stat);

	while (1) {
		retval = fscanf(fp, "cpu%u %d %d %d %d %*d %*d %*d %*d %*d %*d\n",
				&cpu_num, &user, &nice, &system, &idle);
		if (retval != 5)
			break;

		retval = func(arg, cpu_num, user, nice, system, idle);
		if (retval < 0) {
			fclose(fp);
			return(retval);
		}
		sum += retval;
	}
	fclose(fp);
	return sum;
}
| |
| static unsigned int get_num_runnable_threads(void) |
| { |
| FILE *fp; |
| int retval; |
| unsigned int num_threads; |
| |
| fp = fopen_or_die(proc_loadavg, "r"); |
| |
| retval = fscanf(fp, "%*f %*f %*f %ud/%*d %*d\n", &num_threads); |
| if (retval != 1) |
| err(1, "%s", proc_stat); |
| |
| if (debug > 2) |
| fprintf(dfp, "%d threads\n", num_threads); |
| fclose(fp); |
| |
| return num_threads; |
| } |
#if NOTYET
/*
 * for_all_cpuset_cpus(set, func)
 * invoke func(cpu) for every cpu present in 'set';
 * stop and return func's value on the first non-zero return,
 * otherwise return 0
 */
static int for_all_cpuset_cpus(cpu_set_t *set, int (func)(unsigned int))
{
	unsigned int cpu;
	int retval;

	for (cpu = 0; cpu <= max_cpu_num; ++cpu) {
		if (!CPU_ISSET_S(cpu, system_cpuset_size, set))
			continue;
		retval = func(cpu);
		if (retval)
			return retval;
	}
	return 0;
}
#endif
/*
 * cpu_count()
 * update global max_cpu_num, return 1 (per cpu)
 *
 * Used as a sum_for_all_proc_cpus() callback: the sum of the 1s
 * yields the total number of present CPUs.
 */
static int cpu_count(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
{
	if (cpu > max_cpu_num)
		max_cpu_num = cpu;

	if (debug > 1)
		fprintf(dfp, "cpu%d: %d\n", cpu, present_num_cpus);
	return 1;
}
| |
/*
 * cpu_mark_present()
 * sum_for_all_proc_cpus() callback: set this cpu's bit in the
 * global cpuset_present; always returns 0
 */
static int cpu_mark_present(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
{
	CPU_SET_S(cpu, system_cpuset_size, cpuset_present);
	return 0;
}
| |
| /* |
| * for given CPU, starting with INDEX 0 |
| * Find first /sys/devices/system/cpu/$CPU/cache/$INDEX/shared_cpu_list |
| * and return it in cpuset_cache_topology arg |
| * |
| * return 0 on success, non-0 on failure |
| */ |
| static int get_cache_shared_cpu_list(unsigned int cpu_num, cpu_set_t *cpuset_cache_topology) |
| { |
| FILE *fp; |
| int index; |
| char buf[256]; |
| |
| int retval; |
| |
| for (index = 0; index < 10; ++index) { |
| char *s; |
| |
| sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d", cpu_num, index); |
| if (access(buf, R_OK) != 0) { |
| if (debug) |
| perror(buf); |
| return -1; |
| } |
| sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_list", cpu_num, index); |
| fp = fopen(buf, "r"); |
| if (fp == NULL) { |
| if (debug) perror(buf); |
| continue; |
| } |
| |
| if (fgets(buf, sizeof(buf), fp) == NULL) |
| err(1, "%s", buf); |
| |
| s = strchr(buf, '\n'); |
| *s = '\0'; |
| |
| if (debug > 1) |
| fprintf(dfp, "get_cache_shared_cpu_list(%d): read '%s'\n", cpu_num, buf); |
| retval = cpulist_parse(buf, cpuset_cache_topology, system_cpuset_size, 1); |
| return retval; |
| } |
| return -1; |
| } |
| /* |
| * cpus[i] is the order that CPUs are added to the set. |
| * This mapping is used to allocate cpus to the set in "topology order". |
| * But even for a linear mapping it is needed to deal with sparse numbering. |
| */ |
| unsigned int *cpus; |
| |
/*
 * topology_init_cpu_order()
 * Fill the global cpus[] ordering table: for each not-yet-placed
 * active CPU, look up the set of CPUs sharing its (first) cache and
 * append the whole group to cpus[], so cache-sharing siblings are
 * adjacent in allocation order. If any cache lookup fails, fall
 * back to simple linear (numeric) ordering.
 */
static void topology_init_cpu_order(void)
{
	int index;
	unsigned int cpu_num;
	cpu_set_t *cpuset_cache_topology, *cpuset_unprocessed;

	cpuset_cache_topology = CPU_ALLOC((max_cpu_num + 1));
	if (cpuset_cache_topology == NULL)
		err(3, "CPU_ALLOC");

	cpuset_unprocessed = CPU_ALLOC((max_cpu_num + 1));
	if (cpuset_unprocessed == NULL)
		err(3, "CPU_ALLOC");

	CPU_ZERO_S(system_cpuset_size, cpuset_unprocessed);

	/* start with every active CPU awaiting placement */
	CPU_OR_S(system_cpuset_size, cpuset_unprocessed, cpuset_unprocessed, cpuset_active);

	for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) {
		unsigned int cache_cpu_num;

		if (!CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_unprocessed))
			continue;

		CPU_ZERO_S(system_cpuset_size, cpuset_cache_topology);
		if (get_cache_shared_cpu_list(cpu_num, cpuset_cache_topology)) {
			if (debug) fprintf(dfp, "BAILING to LINEAR\n");
			break;
		}
		/* append every CPU sharing this cache, marking each as placed */
		for (cache_cpu_num = 0; cache_cpu_num <= max_cpu_num; ++cache_cpu_num)
		{
			if (!CPU_ISSET_S(cache_cpu_num, system_cpuset_size, cpuset_cache_topology))
				continue;
			cpus[index] = cache_cpu_num;
			if (debug)
				fprintf(dfp, "%d: cpus[%d] = cpu%d\n", cpu_num, index, cache_cpu_num);
			CPU_CLR_S(cache_cpu_num, system_cpuset_size, cpuset_unprocessed);
			index++;
		}
	}

	CPU_FREE(cpuset_cache_topology);
	CPU_FREE(cpuset_unprocessed);

	/* the loop ran to completion iff topology ordering succeeded */
	if (cpu_num == max_cpu_num + 1)
		return;

	fprintf(dfp, "BACKUP plan: linear %d\n", cpu_num);
	/* linear */
	for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) {
		if (CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_active)) {
			cpus[index] = cpu_num;
			if (debug > 1)
				fprintf(dfp, "cpus[%d] = cpu%d\n", index, cpu_num);
			index++;
		}
	}
}
| |
/*
 * topology_probe()
 * Count CPUs from /proc/stat, allocate and populate the global
 * cpuset_present / cpuset_active sets (initially equal), and build
 * the topology-ordered cpus[] allocation table.
 */
static void topology_probe(void)
{
	present_num_cpus = 0;
	max_cpu_num = 0;
	present_num_cpus = sum_for_all_proc_cpus(cpu_count, 0);
	cpuset_size_ceiling = present_num_cpus;

	if (debug > 1)
		fprintf(dfp, "%d cpus, cpu%d is highest numbered\n",
			present_num_cpus, max_cpu_num);

	/*
	 * Allocate the ordered CPU topology list
	 * indexed by size of active_set
	 * each entry contains cpu# which is last
	 * onlined to complete a set of that size
	 */
	cpus = calloc(present_num_cpus, sizeof(unsigned int));
	if (cpus == NULL)
		err(1, "calloc(%d, %zd)", present_num_cpus, sizeof(unsigned int));

	/*
	 * Allocate cpuset_present, initialize to all CPUs in system
	 */
	cpuset_present = CPU_ALLOC((max_cpu_num + 1));
	if (cpuset_present == NULL)
		err(3, "CPU_ALLOC");
	system_cpuset_size = CPU_ALLOC_SIZE((max_cpu_num + 1));
	CPU_ZERO_S(system_cpuset_size, cpuset_present);
	sum_for_all_proc_cpus(cpu_mark_present, 0);

	/*
	 * Allocate cpuset_active, initialize = cpuset_present
	 */
	cpuset_active = CPU_ALLOC((max_cpu_num + 1));
	if (cpuset_active == NULL)
		err(3, "CPU_ALLOC");
	CPU_ZERO_S(system_cpuset_size, cpuset_active);
	CPU_OR_S(system_cpuset_size, cpuset_active, cpuset_present, cpuset_active);

	/*
	 * TODO: make this 2 on HT boxes
	 */
	cpuset_size_floor = 1;

	topology_init_cpu_order();
}
| |
/*
 * cpu_record_utilization()
 * sum_for_all_proc_cpus() callback: contribute this cpu's idle
 * tick count to the system-wide sum
 */
static int cpu_record_utilization(UNUSED(index), unsigned int cpu,
	UNUSED(a), UNUSED(b), UNUSED(c), unsigned int idle_count)
{
	if (debug > 1)
		fprintf(dfp, "cpu%d idle %d\n", cpu, idle_count);

	/* idle ticks are summed for ALL cpus, not just the active set */
	// if (CPU_ISSET_S(cpu, system_cpuset_size, cpuset_active))
	return idle_count;
	// else
	// return 0;
}
| |
/*
 * system_snapshot_utilization(index)
 * record into history[index] the current timestamp and the
 * system-wide idle tick sum read from /proc/stat
 */
static void system_snapshot_utilization(unsigned int index)
{

	history[index].seconds = timestamp_sec();
	if(debug > 1)
		fprintf(dfp, "util timestamp %f\n", history[index].seconds);

	history[index].system_idle_tick_sum =
		sum_for_all_proc_cpus(cpu_record_utilization, 0);
	if (debug > 2)
		debug_dump_history();
}
| |
#if NOTYET
/*
 * per_cpu_stub(cpu)
 * placeholder callback for for_all_cpuset_cpus(); always returns 0
 */
static int per_cpu_stub(unsigned int cpu)
{
	fprintf(dfp, "hello world %d\n", cpu);
	return 0;
}
#endif
/*
 * cpuset_calculate_utilization()
 *
 * return average_utilization of set during previous interval from 0.0 to 100.0 %
 *
 * Compares two history[] snapshots: derives the elapsed seconds and
 * the idle ticks accumulated in between, then computes the busy
 * percentage normalized to the active set's capacity. Also accounts
 * the interval into the per-set-size stats and the actual-sleep
 * histogram. Returns -1 when the two indexes are equal.
 */

/* EMA accumulator for the NOTYET exponential-moving-average experiment */
double ema;

static double cpuset_calculate_utilization(unsigned int util_index_prev, unsigned int util_index, unsigned int active_num_cpus)
{
	double idle_ticks;
	double ticks_per_interval_per_cpu;
	double delta_sec;
	double average_utilization;
	double total_possible_ticks;
	double busy_ticks;

/* NOTE(review): "NOYET" looks like a typo for "NOTYET" -- both are
 * undefined, so behavior is unchanged either way */
#if NOYET
	double ema_period_sec = 2.0;
	double ema_weight;
#endif

	if (util_index == util_index_prev)
		return -1;

	delta_sec = history[util_index].seconds - history[util_index_prev].seconds;
	debug_sleep_histogram_record(sleep_actual_histogram, delta_sec);

	/* guard against clock anomalies; pretend 10ms elapsed */
	if (delta_sec <= 0) {
		fprintf(dfp, "BUG: delta_sec %f\n", delta_sec);
		delta_sec = 0.01;
	}
	set_size_choosen[active_num_cpus]++;
	set_size_seconds[active_num_cpus] += delta_sec;

	if(debug > 1)
		fprintf(dfp, "delta_sec %f = [%d] %f - [%d] %f\n",
			delta_sec, util_index, history[util_index].seconds,
			util_index_prev, history[util_index_prev].seconds);

	ticks_per_interval_per_cpu = user_tick_per_sec * delta_sec;
	if(debug > 1)
		fprintf(dfp, "ticks_per_interval_per_cpu %f = %ld * %f\n",
			ticks_per_interval_per_cpu, user_tick_per_sec, delta_sec);

	idle_ticks = history[util_index].system_idle_tick_sum -
		history[util_index_prev].system_idle_tick_sum;
	if (debug > 1)
		fprintf(dfp, "idle_ticks %f = %lld - %lld\n",
			idle_ticks, history[util_index].system_idle_tick_sum,
			history[util_index_prev].system_idle_tick_sum);

	/* busy = capacity of ALL present cpus minus system-wide idle */
	total_possible_ticks = ticks_per_interval_per_cpu * present_num_cpus;
	busy_ticks = total_possible_ticks - idle_ticks;

	if (debug > 1)
		fprintf(dfp, "total_possible_ticks %f busy_ticks %f\n",
			total_possible_ticks, busy_ticks);


	/* normalize busy ticks to a percentage of the ACTIVE set's capacity */
	average_utilization = 100 * busy_ticks / ticks_per_interval_per_cpu / active_num_cpus;

	if (average_utilization < 0)
		average_utilization = 0;


#if NOTYET
	/*
	 * Exponential Moving Average (EMA)
	 *
	 * EMA(t) = EMA(t-1) + EMA_WEIGHT * (SAMPLE(t) - EMA(t-1))
	 *
	 * Larger EMA_WEIGHT favors SAMPLE(t)
	 * at the expense of EMA history, EMA(t-1)
	 *
	 * An N-period EMA will forget 86% of history
	 * after N samples, and EMA_WEIGHT = 1/(N + 1).
	 *
	 * We make the policy decision of selecting ema_period_sec = 2.0.
	 * We use a different EMA_WEIGHT depending on the sample rate.
	 *
	 * EMA_WEIGHT(2.0 sec, 100 samples/sec) = 2/(200+1) = 0.00995
	 * EMA_WEIGHT(2.0 sec, 10 samples/sec) = 2/(20+1) = 0.0952
	 * EMA_WEIGHT(2.0 sec, 1 samples/sec) = 2/(2+1) = 0.666
	 * EMA_WEIGHT(2.0 sec, .5 samples/sec) = 2/(1+1) = 1.0
	 */
	ema_weight = 2 / (ema_period_sec * 1/delta_sec + 1);
	ema = ema + ema_weight * average_utilization - ema_weight * ema;
#endif

	if(debug) {
		char cpulist[256];

		cpulist_create(cpulist, 256, cpuset_active, system_cpuset_size);

		fprintf(dfp, "%d cpus util%%: %.2f%% TGT: %.2f sec: %.4f set: %s\n",
			active_num_cpus, average_utilization,
			active_num_cpus * average_utilization / 100,
			delta_sec, cpulist);
	}

	return average_utilization;
}
| /* |
| * cpuset_change_size(target_num, num_active) |
| * cpu[num_active] is the next available cpu |
| */ |
| static void cpuset_change_size(unsigned int target_num, unsigned int num_active) |
| { |
| unsigned int i; |
| if (debug > 1) |
| fprintf(dfp, "ACTIVE %d TARGET: %d ", |
| num_active, target_num); |
| |
| if (target_num > num_active) { |
| if (debug > 1) fprintf(dfp, "ADD:"); |
| for (i = num_active; i < target_num; ++i) { |
| CPU_SET_S(cpus[i], system_cpuset_size, cpuset_active); |
| if(debug > 1) fprintf(dfp, " cpu%d", cpus[i]); |
| } |
| if (debug > 1) fprintf(dfp, "\n"); |
| } |
| if (target_num < num_active) { |
| if (debug > 1) fprintf(dfp, "SUB:"); |
| for (i = num_active - 1; i > target_num - 1; --i) { |
| CPU_CLR_S(cpus[i], system_cpuset_size, cpuset_active); |
| if(debug > 1) fprintf(dfp, " cpu%d", cpus[i]); |
| } |
| if (debug > 1) fprintf(dfp, "\n"); |
| } |
| } |
| static void do_binding(cpu_set_t *set) |
| { |
| int retval; |
| char cpulist[256]; |
| FILE *fp; |
| |
| cpulist_create(cpulist, 256, set, system_cpuset_size); |
| fp = fopen_or_die("balloon/cpuset.cpus", "w"); |
| retval = fputs(cpulist, fp); |
| if (retval == EOF) |
| err(1, "failed to write %s to balloon/cpuset.cpus\n", cpulist); |
| |
| if (debug) fprintf(dfp, "balloon/cpuset.cpus %s\n", cpulist); |
| retval = fclose(fp); |
| if (retval) |
| err(1, "fcose balloon/cpuset.cpus"); |
| #if NOT_ANYMORE |
| { |
| char command[256]; |
| snprintf(command, 256, "taskset -cp %s all", cpulist); |
| retval = system(command); |
| if (debug > 1) |
| fprintf(dfp, "\t\t\t\t\tSYSTEM \"%s\" = %d\n", command, retval); |
| } |
| #endif |
| } |
| |
/*
 * cpuset_update()
 *
 * The "ideal" target set size is the size that could retire
 * the previous load with all CPUs exactly 100% utilized.
 * But the math rarely works out evenly, and we need to
 * decide on an integer target number of CPUs in the set.
 *
 * Say utilization * active_num_cpus = 3.2?
 * We can't allocate 0.2 of a CPU...
 *
 * Returns the new active set size, after resizing cpuset_active
 * and re-binding if the size changed.
 */
static unsigned int cpuset_update(double avg_util, unsigned int active_num_cpus)
{
	double ideal_target_num_cpus;
	double target_num_cpus;
	double roundup;
	int truncated_int;

	/* near saturation with threads queued: just add one CPU */
	if (avg_util > 95) {
		if (get_num_runnable_threads() > active_num_cpus + 2) {
			target_num_cpus = active_num_cpus + 1;
			goto done;
		}
	}
	ideal_target_num_cpus = (avg_util * active_num_cpus)/100;
	truncated_int = (unsigned int)ideal_target_num_cpus;
	/* round the fractional CPU up when it exceeds 0.5, or 10% of the ideal */
	roundup = ideal_target_num_cpus - truncated_int;
	if (roundup > 0.5)
		roundup = 1;
	else if (roundup > ideal_target_num_cpus/10)
		roundup = 1;

	target_num_cpus = ideal_target_num_cpus + roundup;

	if (debug > 1)
		fprintf(dfp, "ideal %f = %f * %d; target %d\n",
			ideal_target_num_cpus, avg_util, active_num_cpus,
			(unsigned int)target_num_cpus);

#if 0
	ideal_set_size = util_sum / 100.0;

	if (ideal_set_size > (up_threshold * active_num_cpus)) {
		/*
		 * Utilization is above up_threshold
		 * Increase set size
		 */

		delta_set_size = active_num_cpus * up_slope;
		if (delta_set_size < 1 )
			delta_set_size == 1;

		target_set_size = active_num_cpus + increase;
	} else {
		/*
		 * Utilization is below up_threshold
		 * Possibly decrease set size
		 */
		delta_set_size = active_num_cpus - ideal_set_size;
		delta_set_size *= down_rate;
		target_set_size = active_num_cpus - delta_set_size;
	}
#endif

done:
	/* clamp to [cpuset_size_floor, cpuset_size_ceiling] */
	if (target_num_cpus > cpuset_size_ceiling)
		target_num_cpus = cpuset_size_ceiling;
	else if (target_num_cpus < cpuset_size_floor)
		target_num_cpus = cpuset_size_floor;

	if ((unsigned int) target_num_cpus != active_num_cpus) {
		cpuset_change_size(target_num_cpus, active_num_cpus);
		do_binding(cpuset_active);
	}

	return (unsigned int) target_num_cpus;
}
/*
 * main_loop()
 * Until exit_main_loop is set: snapshot utilization into the
 * circular history[] buffer, compute the previous interval's
 * average utilization, resize the active set accordingly, and
 * sleep for a policy-chosen interval.
 */
static void main_loop(void)
{
	int i;
	unsigned int util_index_prev = 0;
	unsigned int util_index = 1;
	unsigned int active_num_cpus = present_num_cpus; // TBD

	syslog(LOG_NOTICE, "shepherd started.");
	/*
	 * Initial snapshot
	 */
	system_snapshot_utilization(util_index_prev);
	history[util_index_prev].cpuset_num_cpus = active_num_cpus;
	sleep_for_sec(poll_fast_sec);

	/*
	 * run for(ever)
	 */
	for (i = 0; ; ++i) {
		double avg_util;

		system_snapshot_utilization(util_index);
		history[util_index].cpuset_num_cpus = active_num_cpus;
		avg_util = cpuset_calculate_utilization(util_index_prev, util_index, active_num_cpus);

		if (exit_main_loop)
			break;

		active_num_cpus = cpuset_update(avg_util, active_num_cpus);

		sleep_for_sec(get_next_polling_interval_sec(avg_util, active_num_cpus));

		/* advance the circular history indexes */
		util_index_prev = util_index;
		util_index += 1;
		if (util_index >= HISTORY_LENGTH)
			util_index = 0;

	}
	syslog(LOG_NOTICE, "shepherd terminated.");
}
| pid_t child_pid; |
| |
| /* |
| * nanny - babysit the child |
| * leave a note for the parrent main loop |
| * if the child exits |
| */ |
| static void nanny(__attribute__((unused))int signal) |
| { |
| if (waitpid(child_pid, NULL, WNOHANG) == child_pid) |
| exit_main_loop = 1; |
| } |
| /* |
| * fork_it() |
| */ |
| static void fork_it(char **argv) |
| { |
| |
| child_pid = fork(); |
| |
| if (!child_pid) { /* child */ |
| |
| if (!child_pid) { /* command child */ |
| execvp(argv[0], argv); |
| err(1, "%s", argv[0]); |
| } |
| } else { /* parent */ |
| signal(SIGCHLD, nanny); |
| } |
| } |
/*
 * signal_handler_stats()
 * SIGALRM/SIGUSR1 handler: dump and reset statistics on demand
 *
 * NOTE(review): fprintf() and the stdio work inside
 * dump_statistics() are not async-signal-safe; calling them from a
 * handler is a latent hazard if the signal interrupts another stdio
 * call -- confirm whether this is acceptable for a debug facility.
 */
static void signal_handler_stats(int signal)
{
	if (debug)
		fprintf(dfp, "signal %d received\n", signal);

	dump_statistics();
}
| |
/*
 * signal_handler_end_it()
 * SIGINT handler: ask the main loop to exit cleanly
 */
static void signal_handler_end_it(int signal)
{
	if (debug)
		fprintf(dfp, "caught signal %d\n", signal);
	exit_main_loop = 1;
}
| |
| static void signals_init(void) |
| { |
| signal(SIGALRM, signal_handler_stats); |
| signal(SIGUSR1, signal_handler_stats); |
| signal(SIGINT, signal_handler_end_it); |
| } |
| |
/*
 * copy_file_by_line(from_path, to_path)
 * copy from_path into to_path one line at a time, exiting on any error
 *
 * NOTE(review): rewind(to_fp) after every line means each fputs()
 * lands at offset 0 and is flushed as its own write -- presumably
 * deliberate for cgroup control files (e.g. "tasks"), where each
 * value must be written in a separate operation. A regular
 * destination file would end up holding only the last line; confirm
 * before reusing this helper for ordinary files.
 */
void copy_file_by_line(char *from_path, char *to_path)
{
	FILE *from_fp, *to_fp;
	char line_buf[1024];
	char *s;
	int retval;

	from_fp = fopen_or_die(from_path, "r");
	to_fp = fopen_or_die(to_path, "w");

	while (fgets(line_buf, sizeof(line_buf), from_fp) != NULL) {

		if(debug > 1) fprintf(dfp, "copying: %s", line_buf);

		retval = fputs(line_buf, to_fp);
		if (retval == EOF)
			err(1, "fputs cpuset.mems");

		rewind(to_fp);
	}

	retval = fclose(from_fp);
	if (retval)
		err(1, "close %s", from_path);

	retval = fclose(to_fp);
	if (retval)
		err(1, "close %s", to_path);


}
| |
| char cpuset_base[] = "/shepherd"; |
| char cpuset_balloon[] = "balloon"; |
| |
/*
 * cpuset_init()
 * Mount (if needed) the cpuset cgroup hierarchy at /shepherd,
 * chdir into it, create the "balloon" child cpuset, and seed the
 * balloon's mems/cpus/tasks from the root cpuset's files --
 * which migrates every task into the balloon.
 */
static void cpuset_init(void)
{
	int retval;

	retval = access(cpuset_base, F_OK);
	if (retval) {
		if (debug)
			fprintf(dfp, "making %s\n", cpuset_base);
		retval = mkdir(cpuset_base, 0755);
		if (retval) {
			/* racing another instance is benign */
			if (errno == EEXIST)
				fprintf(dfp, "okay, %s is already there\n", cpuset_base);
			else
				err(1, "could not create '%s'", cpuset_base);
		}
	}

	/* EBUSY means a cpuset hierarchy is already mounted here */
	retval = mount("dummy", cpuset_base, "cgroup", 0, "cpuset");
	if (retval) {
		if (errno != EBUSY)
			err(retval, "mount cpuset");
		if (debug)
			fprintf(dfp, "cpuset already mounted\n");
	}

	retval = chdir(cpuset_base);
	if (retval)
		err(retval, "%s", cpuset_base);

	retval = mkdir(cpuset_balloon, 0755);
	if (retval) {
		if (errno == EEXIST)
			fprintf(dfp, "okay, %s is already there\n", cpuset_balloon);
		else
			err(1, "could not create '%s'", cpuset_balloon);
	}

	/* a new cpuset needs mems and cpus populated before it can accept tasks */
	copy_file_by_line("cpuset.mems", "balloon/cpuset.mems");

	copy_file_by_line("cpuset.cpus", "balloon/cpuset.cpus");

	copy_file_by_line("tasks", "balloon/tasks");

}
| |
/*
 * daemonize()
 * Classic double-fork daemonization: detach from the controlling
 * terminal via setsid(), fork again so we are no longer a session
 * leader, reset umask, chdir to /, and close all inherited
 * descriptors.
 */
void daemonize(void)
{
	pid_t pid;
	int i;

	pid = fork();
	if (pid < 0)
		err(1, "fork");
	if (pid > 0)
		exit(0); /* parent exits */

	if (setsid() < 0)
		err(1, "setsid");
	signal(SIGCHLD, SIG_IGN);
	signal(SIGHUP, SIG_IGN);

	pid = fork();
	if (pid < 0)
		err(2, "Fork");
	if (pid > 0)
		exit(0); /* parent exits */

	umask(0);
	chdir("/");

	/* close every fd, including stdin/stdout/stderr */
	for (i = sysconf(_SC_OPEN_MAX); i >= 0; i--)
		close(i);
}
| |
| void debug_init(void) |
| { |
| if (output_path == NULL) { |
| dfp = stderr; |
| |
| //output_path = "/tmp/shepherd.log"; |
| } else { |
| dfp = fopen_or_die(output_path, "w"); |
| } |
| } |
| |
/*
 * main()
 * Initialize defaults, parse options, daemonize unless debugging,
 * probe topology, set up the cpuset balloon and statistics, fork
 * any command given on the command line, then run the control loop.
 * On exit, dump statistics and restore the balloon binding to all
 * present CPUs.
 */
int main(int argc, char **argv)
{

	srandom((unsigned int)time(NULL));

	defaults_init();

	cmdline(argc, argv);

	debug_init();

	if (!debug)
		daemonize();

	openlog("shepherd", LOG_PID, LOG_DAEMON);

	topology_probe();

	cpuset_init();

	stats_init();

	/* remaining args are a command to run under the shepherd */
	if (argc - optind)
		fork_it(argv + optind);

	signals_init();
	main_loop();

	dump_statistics();
	do_binding(cpuset_present); /* restore system binding */
	return 0;
}
| |