/*
* shepherd.c
* Scheduler Helper Daemon
*
* see shepherd.1 for details
*
* This program is free software, released under GPLv2
* Copyright (C) 2016 Len Brown, Intel Corp <len.brown@intel.com>
*/
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <signal.h>
#include <stddef.h>
#include <string.h>
#include <sys/prctl.h>
#include <time.h>
#include <err.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mount.h>
#include <syslog.h>
#include "cpuset.h"
#include "cpuset.c"
#define UNUSED(x) unsigned int UNUSED_ ## x __attribute__((__unused__))
unsigned int debug;
unsigned int max_cpu_num;
unsigned int present_num_cpus;
cpu_set_t *cpuset_present;
cpu_set_t *cpuset_active;
size_t system_cpuset_size;
char *proc_stat = "/proc/stat";
char *proc_loadavg = "/proc/loadavg";
long user_tick_per_sec;
double poll_slow_sec;
double poll_norm_sec;
double poll_fast_sec;
volatile int exit_main_loop;	/* set from signal handlers */
FILE *dfp;
char *output_path;
/*
* cpuset_size_floor
* the CPU set will not shrink smaller than cpuset_size_floor
*/
int cpuset_size_floor = 1;
/*
* cpuset_size_ceiling
* the CPU set will not grow larger than cpuset_size_ceiling
*/
int cpuset_size_ceiling;
#define HISTORY_LENGTH 10
struct cpuset_history {
double seconds;
unsigned cpuset_num_cpus;
unsigned long long system_idle_tick_sum;
} history[HISTORY_LENGTH];
unsigned int *set_size_chosen;
double *set_size_seconds;
static void debug_dump_history(void)
{
int i;
for (i = 0; i < HISTORY_LENGTH; ++i)
{
fprintf(dfp, "history[%d] %d cpus %f sec %lld sum\n",
i,
history[i].cpuset_num_cpus,
history[i].seconds,
history[i].system_idle_tick_sum);
}
}
#ifdef NOTYET
/*
* up_slope_max -- how fast a set can grow per interval
* 1.00 means it can increase by up to 100% of its current size
* 0.50 means it can increase by up to 50% of its current size
* etc.
*/
double up_slope_max = 0.50;
#endif
#ifdef NOTYET
/*
* up_cpus_max
* maximum number of CPUs to increase set per interval
* default = cpuset_size_ceiling/4;
*/
int up_cpus_max;
#endif
#ifdef NOTYET
/*
* down_rate
* 0.50 means shrink at 50% of the way to ideal_set size on each interval
*/
double down_rate = 0.5;
#endif
/*
* get_next_polling_interval_sec()
*
* Policy decision:
* return seconds for the next polling interval
*/
static double get_next_polling_interval_sec(double average_utilization, int set_size)
{
/*
* If max set size, then poll normally,
* no matter the utilization. We can't grow more,
* and we are not in a big rush to shrink.
*/
if (set_size == cpuset_size_ceiling)
return poll_norm_sec;
/*
* If min set and very low utilization, poll slowly,
* to minimally disturb profoundly idle system.
*/
if ((set_size == cpuset_size_floor) && (average_utilization < 5))
return poll_slow_sec;
/*
* If high utilization, poll quickly
* to promptly expand capacity with demand
*/
if (average_utilization > 80)
return poll_fast_sec;
/*
* poll at moderate rate
*/
return poll_norm_sec;
}
#ifdef NOTYET
/*
* Timer slack gives the kernel permission to coalesce our timeout
* with nearby system timers. System default is +/- 0.050 ms,
* which is much tighter than we need. Increase timer slack
* to 1/16th of the requested timeout duration, e.g.
*
* 10 ms / 16 = +/- 0.625 ms
* 100 ms / 16 = +/- 6.25 ms
* 1,000 ms / 16 = +/- 62.5 ms
* 2,000 ms / 16 = +/- 125.0 ms
*/
unsigned int debug_timer_slack_updates;
unsigned long timer_slack_current_ns;
static void timer_slack_check(double sleep_sec)
{
unsigned long timer_slack_desired_ns;
timer_slack_desired_ns = (unsigned long)(sleep_sec * 1000000000.0 / 16);
if (timer_slack_desired_ns == timer_slack_current_ns)
return;
debug_timer_slack_updates++;
if (prctl(PR_SET_TIMERSLACK, timer_slack_desired_ns) < 0)
err(-1, "PR_SET_TIMERSLACK");
timer_slack_current_ns = timer_slack_desired_ns;
}
#endif
static double timestamp_sec(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
err(-1, "clock_gettime");
return ((double)ts.tv_sec + (double)ts.tv_nsec/1000000000);
}
struct timespec ts_begin, ts_end;
#define DEBUG_SLEEP_ARRAY_SIZE 12
unsigned int sleep_request_histogram[DEBUG_SLEEP_ARRAY_SIZE];
unsigned int sleep_actual_histogram[DEBUG_SLEEP_ARRAY_SIZE];
unsigned int debug_sleep_count;
double debug_sleep_stats_start_time;
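/*
* Log2 histogram of sleep intervals: bucket i counts intervals
* >= 2.0/2^i seconds, so bucket 0 is >= 2s, bucket 1 is >= 1s, etc.
* Intervals below the smallest bucket (~1 ms) are not recorded.
*/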
static void debug_sleep_histogram_record(unsigned int *a, double interval)
{
int i;
for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
if (interval >= 2.0) {
a[i]++;
return;
}
interval *= 2;
}
}
static void debug_sleep_histograms_clear(void)
{
int i;
for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
sleep_request_histogram[i] = 0;
sleep_actual_histogram[i] = 0;
}
debug_sleep_stats_start_time = timestamp_sec();
}
static void debug_sleep_histograms_dump(void)
{
int i;
double seconds = 2.0;
unsigned int request_count, actual_count;
double now, interval;
request_count = actual_count = 0;
fprintf(dfp, "Sleep Wake Seconds\n");
for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
request_count += sleep_request_histogram[i];
actual_count += sleep_actual_histogram[i];
fprintf(dfp, "%5d %4d >= %f\n",
sleep_request_histogram[i],
sleep_actual_histogram[i], seconds);
seconds /= 2;
}
now = timestamp_sec();
interval = now - debug_sleep_stats_start_time;
fprintf(dfp, "%5d %4d Total, %f per second over %.2f seconds\n",
request_count, actual_count,
(double)request_count/(interval), interval);
}
static void stats_clear(void)
{
unsigned int cpus;
for (cpus = 0; cpus <= present_num_cpus; ++cpus) {
set_size_chosen[cpus] = 0;
set_size_seconds[cpus] = 0.0;
}
debug_sleep_histograms_clear();
}
static void stats_init(void)
{
set_size_chosen = calloc(present_num_cpus + 1, sizeof(unsigned int));
if (set_size_chosen == NULL)
err(1, "calloc(%u, %zu)", present_num_cpus + 1, sizeof(unsigned int));
set_size_seconds = calloc(present_num_cpus + 1, sizeof(double));
if (set_size_seconds == NULL)
err(1, "calloc(%u, %zu)", present_num_cpus + 1, sizeof(double));
stats_clear();
}
static void stats_dump(void)
{
unsigned int cpus;
fprintf(dfp, "CPUs Used Duration[sec]\n");
for (cpus = 1; cpus <= present_num_cpus; ++cpus) {
fprintf(dfp, "%4d %4d %8.2f\n",
cpus, set_size_chosen[cpus], set_size_seconds[cpus]);
}
}
static void dump_statistics(void)
{
stats_dump();
debug_sleep_histograms_dump();
#if NOTYET
fprintf(dfp, "%u%% timer slack updates (%u)\n",
100 * debug_timer_slack_updates/debug_sleep_count, debug_timer_slack_updates);
debug_timer_slack_updates = 0;
debug_sleep_count = 0;
#endif
stats_clear();
debug_sleep_histograms_clear();
}
/*
* sleep_for_sec(seconds)
*
* Sleep for 'seconds' seconds.
* First allow adjustment of timer slack.
*/
static void sleep_for_sec(double seconds)
{
struct timespec ts;
debug_sleep_count++;
debug_sleep_histogram_record(sleep_request_histogram, seconds);
#if NOTYET
timer_slack_check(seconds);
#endif
ts.tv_sec = (time_t)seconds;
if (debug > 1)
fprintf(dfp, "input seconds %f -> tv seconds %lld\n",
seconds, (long long int)ts.tv_sec);
seconds = seconds - ts.tv_sec;
ts.tv_nsec = (long)(seconds * 1000000000.0);
if (debug > 1)
fprintf(dfp, "input seconds * 1B %f -> tv nsec %ld\n",
seconds * 1000000000, ts.tv_nsec);
if (nanosleep(&ts, NULL) < 0) {
if (errno == EINTR) {
/* early return if signaled */
return;
}
err(-1, "nanosleep");
}
}
static void __attribute__((__noreturn__)) usage(FILE *out)
{
fprintf(out, "shepherd 17.05.21a (C) 2017 Len Brown <len.brown@intel.com>\n");
fprintf(out, "Usage: shepherd [options]\n");
fprintf(out, "'man shepherd' for details\n");
exit(out == stderr ? EXIT_FAILURE : EXIT_SUCCESS);
}
static void cmdline(int argc, char **argv)
{
int c;
static const struct option longopts[] = {
{ "debug", 0, NULL, 'd' },
{ "help", 0, NULL, 'h' },
{ "output", 1, NULL, 'o' },
{ NULL, 0, NULL, 0 },
};
while ((c = getopt_long_only(argc, argv, "+dho:", longopts, NULL)) != -1) {
switch (c) {
case 'd':
debug++;
break;
case 'h':
usage(stdout);
break;
case 'o':
output_path = optarg;
break;
default:
usage(stderr);
break;
}
}
}
static void init_default_sample_rates(void)
{
double user_tick_sec;
user_tick_per_sec = sysconf(_SC_CLK_TCK);
if (user_tick_per_sec != 100)
warnx("User ticks %ld, expected 100", user_tick_per_sec);
user_tick_sec = 1.0 / user_tick_per_sec;
/*
* Nominal wakeup frequency is once per second
* 1.0 sec/sample = 1 sample/sec
*/
poll_norm_sec = 1;
/*
* Slow wakeup frequency is used during profound idle
* and at max utilization of full set size
* 2.0 sec/sample = 0.5 sample/sec
*/
poll_slow_sec = 2;
/*
* Fast wakeup frequency is used at high utilization
* Sample at 10x the user tick rate, which means
* we will see at most 10 counter ticks/cpu/sample.
* Since user_tick_sec = 10ms,
* sample interval = 10 * 10ms = 100ms, i.e. 10 samples/sec
*/
poll_fast_sec = user_tick_sec * 10;
}
static void defaults_init(void)
{
init_default_sample_rates();
// set_size_ceiling = TBD;
}
/*
* Open a file, and exit on failure
*/
static FILE *fopen_or_die(const char *path, const char *mode)
{
FILE *filep = fopen(path, mode);
if (!filep)
err(1, "%s: open failed", path);
return filep;
}
/*
* Run func() on every cpu line in /proc/stat.
* Return the sum of the return values of func(),
* unless func() returns a negative value, in which case
* return that value immediately.
*/
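/*
* The /proc/stat lines parsed below look like:
*
* cpu  100 0 200 30000 40 0 5 0 0 0
* cpu0 25 0 50 7500 10 0 1 0 0 0
*
* The summary "cpu" line is consumed and discarded; each "cpuN"
* line supplies user, nice, system, and idle tick counts to func().
*/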
static unsigned long long sum_for_all_proc_cpus(int (func)(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int), unsigned int arg)
{
FILE *fp;
unsigned int cpu_num;
int user, nice, system, idle;
int retval;
unsigned long long sum = 0;
fp = fopen_or_die(proc_stat, "r");
retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
if (retval != 0)
err(1, "%s: failed to parse format", proc_stat);
while (1) {
retval = fscanf(fp, "cpu%u %d %d %d %d %*d %*d %*d %*d %*d %*d\n",
&cpu_num, &user, &nice, &system, &idle);
if (retval != 5)
break;
retval = func(arg, cpu_num, user, nice, system, idle);
if (retval < 0) {
fclose(fp);
return(retval);
}
sum += retval;
}
fclose(fp);
return sum;
}
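/*
* Parse the 4th field of /proc/loadavg, e.g.
*
* 0.20 0.18 0.12 1/80 11206
*
* where "1/80" is runnable threads / total threads.
*/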
static unsigned int get_num_runnable_threads(void)
{
FILE *fp;
int retval;
unsigned int num_threads;
fp = fopen_or_die(proc_loadavg, "r");
retval = fscanf(fp, "%*f %*f %*f %u/%*d %*d\n", &num_threads);
if (retval != 1)
err(1, "%s", proc_loadavg);
if (debug > 2)
fprintf(dfp, "%u threads\n", num_threads);
fclose(fp);
return num_threads;
}
#if NOTYET
static int for_all_cpuset_cpus(cpu_set_t *set, int (func)(unsigned int))
{
unsigned int cpu;
int retval;
for (cpu = 0; cpu <= max_cpu_num; ++cpu) {
if (!CPU_ISSET_S(cpu, system_cpuset_size, set))
continue;
retval = func(cpu);
if (retval)
return retval;
}
return 0;
}
#endif
/*
* cpu_count()
* update global max_cpu_num, return 1 (per cpu)
*/
static int cpu_count(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
{
if (cpu > max_cpu_num)
max_cpu_num = cpu;
if (debug > 1)
fprintf(dfp, "cpu%d: %d\n", cpu, present_num_cpus);
return 1;
}
static int cpu_mark_present(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
{
CPU_SET_S(cpu, system_cpuset_size, cpuset_present);
return 0;
}
/*
* For a given CPU, starting with index 0,
* find the first /sys/devices/system/cpu/$CPU/cache/$INDEX/shared_cpu_list
* (a cpulist such as "0-3" or "0,4") and return it in the
* cpuset_cache_topology arg.
*
* Return 0 on success, non-0 on failure.
*/
static int get_cache_shared_cpu_list(unsigned int cpu_num, cpu_set_t *cpuset_cache_topology)
{
FILE *fp;
int index;
char buf[256];
int retval;
for (index = 0; index < 10; ++index) {
char *s;
sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d", cpu_num, index);
if (access(buf, R_OK) != 0) {
if (debug)
perror(buf);
return -1;
}
sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_list", cpu_num, index);
fp = fopen(buf, "r");
if (fp == NULL) {
if (debug) perror(buf);
continue;
}
if (fgets(buf, sizeof(buf), fp) == NULL)
err(1, "%s", buf);
fclose(fp);
s = strchr(buf, '\n');
if (s)
*s = '\0';
if (debug > 1)
fprintf(dfp, "get_cache_shared_cpu_list(%d): read '%s'\n", cpu_num, buf);
retval = cpulist_parse(buf, cpuset_cache_topology, system_cpuset_size, 1);
return retval;
}
return -1;
}
/*
* cpus[i] gives the order in which CPUs are added to the set.
* This mapping is used to allocate CPUs to the set in "topology order".
* Even for a linear mapping, it is needed to cope with sparse CPU numbering.
*/
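/*
* For example, on a hypothetical 2-core/4-thread system where cpu0
* shares an index0 cache with cpu2, and cpu1 with cpu3, the cache
* walk below would yield cpus[] = {0, 2, 1, 3}, keeping hardware
* siblings adjacent so they are added to the set together.
*/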
unsigned int *cpus;
static void topology_init_cpu_order(void)
{
int index;
unsigned int cpu_num;
cpu_set_t *cpuset_cache_topology, *cpuset_unprocessed;
cpuset_cache_topology = CPU_ALLOC((max_cpu_num + 1));
if (cpuset_cache_topology == NULL)
err(3, "CPU_ALLOC");
cpuset_unprocessed = CPU_ALLOC((max_cpu_num + 1));
if (cpuset_unprocessed == NULL)
err(3, "CPU_ALLOC");
CPU_ZERO_S(system_cpuset_size, cpuset_unprocessed);
CPU_OR_S(system_cpuset_size, cpuset_unprocessed, cpuset_unprocessed, cpuset_active);
for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) {
unsigned int cache_cpu_num;
if (!CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_unprocessed))
continue;
CPU_ZERO_S(system_cpuset_size, cpuset_cache_topology);
if (get_cache_shared_cpu_list(cpu_num, cpuset_cache_topology)) {
if (debug) fprintf(dfp, "BAILING to LINEAR\n");
break;
}
for (cache_cpu_num = 0; cache_cpu_num <= max_cpu_num; ++cache_cpu_num)
{
if (!CPU_ISSET_S(cache_cpu_num, system_cpuset_size, cpuset_cache_topology))
continue;
cpus[index] = cache_cpu_num;
if (debug)
fprintf(dfp, "%d: cpus[%d] = cpu%d\n", cpu_num, index, cache_cpu_num);
CPU_CLR_S(cache_cpu_num, system_cpuset_size, cpuset_unprocessed);
index++;
}
}
CPU_FREE(cpuset_cache_topology);
CPU_FREE(cpuset_unprocessed);
if (cpu_num == max_cpu_num + 1)
return;
fprintf(dfp, "BACKUP plan: linear %d\n", cpu_num);
/* linear */
for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) {
if (CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_active)) {
cpus[index] = cpu_num;
if (debug > 1)
fprintf(dfp, "cpus[%d] = cpu%d\n", index, cpu_num);
index++;
}
}
}
static void topology_probe(void)
{
present_num_cpus = 0;
max_cpu_num = 0;
present_num_cpus = sum_for_all_proc_cpus(cpu_count, 0);
cpuset_size_ceiling = present_num_cpus;
if (debug > 1)
fprintf(dfp, "%d cpus, cpu%d is highest numbered\n",
present_num_cpus, max_cpu_num);
/*
* Allocate the ordered CPU topology list
* indexed by size of active_set
* each entry contains the cpu# that is last
* added to complete a set of that size
*/
cpus = calloc(present_num_cpus, sizeof(unsigned int));
if (cpus == NULL)
err(1, "calloc(%d, %zd)", present_num_cpus, sizeof(unsigned int));
/*
* Allocate cpuset_present, initialize to all CPUs in system
*/
cpuset_present = CPU_ALLOC((max_cpu_num + 1));
if (cpuset_present == NULL)
err(3, "CPU_ALLOC");
system_cpuset_size = CPU_ALLOC_SIZE((max_cpu_num + 1));
CPU_ZERO_S(system_cpuset_size, cpuset_present);
sum_for_all_proc_cpus(cpu_mark_present, 0);
/*
* Allocate cpuset_active, initialize = cpuset_present
*/
cpuset_active = CPU_ALLOC((max_cpu_num + 1));
if (cpuset_active == NULL)
err(3, "CPU_ALLOC");
CPU_ZERO_S(system_cpuset_size, cpuset_active);
CPU_OR_S(system_cpuset_size, cpuset_active, cpuset_present, cpuset_active);
/*
* TODO: make this 2 on HT boxes
*/
cpuset_size_floor = 1;
topology_init_cpu_order();
}
static int cpu_record_utilization(UNUSED(index), unsigned int cpu,
UNUSED(a), UNUSED(b), UNUSED(c), unsigned int idle_count)
{
if (debug > 1)
fprintf(dfp, "cpu%d idle %d\n", cpu, idle_count);
// if (CPU_ISSET_S(cpu, system_cpuset_size, cpuset_active))
return idle_count;
// else
// return 0;
}
static void system_snapshot_utilization(unsigned int index)
{
history[index].seconds = timestamp_sec();
if(debug > 1)
fprintf(dfp, "util timestamp %f\n", history[index].seconds);
history[index].system_idle_tick_sum =
sum_for_all_proc_cpus(cpu_record_utilization, 0);
if (debug > 2)
debug_dump_history();
}
#if NOTYET
static int per_cpu_stub(unsigned int cpu)
{
fprintf(dfp, "hello world %d\n", cpu);
return 0;
}
#endif
/*
* cpuset_calculate_utilization()
*
* return the average utilization of the set during the previous interval, from 0.0 to 100.0%
*/
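/*
* Worked example: on an 8-CPU system with 4 CPUs in the active set,
* a 1.0 sec interval at 100 ticks/sec gives
* total_possible_ticks = 100 * 8 = 800. If 600 of those were idle,
* busy_ticks = 200 and average_utilization = 100 * 200 / 100 / 4 = 50%.
*/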
double ema;
static double cpuset_calculate_utilization(unsigned int util_index_prev, unsigned int util_index, unsigned int active_num_cpus)
{
double idle_ticks;
double ticks_per_interval_per_cpu;
double delta_sec;
double average_utilization;
double total_possible_ticks;
double busy_ticks;
#if NOTYET
double ema_period_sec = 2.0;
double ema_weight;
#endif
if (util_index == util_index_prev)
return -1;
delta_sec = history[util_index].seconds - history[util_index_prev].seconds;
debug_sleep_histogram_record(sleep_actual_histogram, delta_sec);
if (delta_sec <= 0) {
fprintf(dfp, "BUG: delta_sec %f\n", delta_sec);
delta_sec = 0.01;
}
set_size_chosen[active_num_cpus]++;
set_size_seconds[active_num_cpus] += delta_sec;
if(debug > 1)
fprintf(dfp, "delta_sec %f = [%d] %f - [%d] %f\n",
delta_sec, util_index, history[util_index].seconds,
util_index_prev, history[util_index_prev].seconds);
ticks_per_interval_per_cpu = user_tick_per_sec * delta_sec;
if(debug > 1)
fprintf(dfp, "ticks_per_interval_per_cpu %f = %ld * %f\n",
ticks_per_interval_per_cpu, user_tick_per_sec, delta_sec);
idle_ticks = history[util_index].system_idle_tick_sum -
history[util_index_prev].system_idle_tick_sum;
if (debug > 1)
fprintf(dfp, "idle_ticks %f = %lld - %lld\n",
idle_ticks, history[util_index].system_idle_tick_sum,
history[util_index_prev].system_idle_tick_sum);
total_possible_ticks = ticks_per_interval_per_cpu * present_num_cpus;
busy_ticks = total_possible_ticks - idle_ticks;
if (debug > 1)
fprintf(dfp, "total_possible_ticks %f busy_ticks %f\n",
total_possible_ticks, busy_ticks);
average_utilization = 100 * busy_ticks / ticks_per_interval_per_cpu / active_num_cpus;
if (average_utilization < 0)
average_utilization = 0;
#if NOTYET
/*
* Exponential Moving Average (EMA)
*
* EMA(t) = EMA(t-1) + EMA_WEIGHT * (SAMPLE(t) - EMA(t-1))
*
* Larger EMA_WEIGHT favors SAMPLE(t)
* at the expense of EMA history, EMA(t-1)
*
* An N-period EMA will forget 86% of history
* after N samples, when EMA_WEIGHT = 2/(N + 1).
*
* We make the policy decision of selecting ema_period_sec = 2.0.
* We use a different EMA_WEIGHT depending on the sample rate.
*
* EMA_WEIGHT(2.0 sec, 100 samples/sec) = 2/(200+1) = 0.00995
* EMA_WEIGHT(2.0 sec, 10 samples/sec) = 2/(20+1) = 0.0952
* EMA_WEIGHT(2.0 sec, 1 samples/sec) = 2/(2+1) = 0.666
* EMA_WEIGHT(2.0 sec, .5 samples/sec) = 2/(1+1) = 1.0
*/
ema_weight = 2 / (ema_period_sec * 1/delta_sec + 1);
ema += ema_weight * (average_utilization - ema);
#endif
if(debug) {
char cpulist[256];
cpulist_create(cpulist, 256, cpuset_active, system_cpuset_size);
fprintf(dfp, "%d cpus util%%: %.2f%% TGT: %.2f sec: %.4f set: %s\n",
active_num_cpus, average_utilization,
active_num_cpus * average_utilization / 100,
delta_sec, cpulist);
}
return average_utilization;
}
/*
* cpuset_change_size(target_num, num_active)
* cpus[num_active] is the next available cpu
*/
static void cpuset_change_size(unsigned int target_num, unsigned int num_active)
{
unsigned int i;
if (debug > 1)
fprintf(dfp, "ACTIVE %d TARGET: %d ",
num_active, target_num);
if (target_num > num_active) {
if (debug > 1) fprintf(dfp, "ADD:");
for (i = num_active; i < target_num; ++i) {
CPU_SET_S(cpus[i], system_cpuset_size, cpuset_active);
if(debug > 1) fprintf(dfp, " cpu%d", cpus[i]);
}
if (debug > 1) fprintf(dfp, "\n");
}
if (target_num < num_active) {
if (debug > 1) fprintf(dfp, "SUB:");
for (i = num_active - 1; i > target_num - 1; --i) {
CPU_CLR_S(cpus[i], system_cpuset_size, cpuset_active);
if(debug > 1) fprintf(dfp, " cpu%d", cpus[i]);
}
if (debug > 1) fprintf(dfp, "\n");
}
}
static void do_binding(cpu_set_t *set)
{
int retval;
char cpulist[256];
FILE *fp;
cpulist_create(cpulist, 256, set, system_cpuset_size);
fp = fopen_or_die("balloon/cpuset.cpus", "w");
retval = fputs(cpulist, fp);
if (retval == EOF)
err(1, "failed to write %s to balloon/cpuset.cpus", cpulist);
if (debug) fprintf(dfp, "balloon/cpuset.cpus %s\n", cpulist);
retval = fclose(fp);
if (retval)
err(1, "fclose balloon/cpuset.cpus");
#if NOT_ANYMORE
{
char command[256];
snprintf(command, 256, "taskset -cp %s all", cpulist);
retval = system(command);
if (debug > 1)
fprintf(dfp, "\t\t\t\t\tSYSTEM \"%s\" = %d\n", command, retval);
}
#endif
}
/*
* cpuset_update()
*
* The "ideal" target set size is the size that could retire
* the previous load with all CPUs exactly 100% utilized.
* But the math rarely works out evenly, and we need to
* decide on an integer target number of CPUs in the set.
*
* Say utilization * active_num_cpus = 3.2:
* we can't allocate 0.2 of a CPU, so we must round.
*
*/
static unsigned int cpuset_update(double avg_util, unsigned int active_num_cpus)
{
double ideal_target_num_cpus;
double target_num_cpus;
double roundup;
int truncated_int;
if (avg_util > 95) {
if (get_num_runnable_threads() > active_num_cpus + 2) {
target_num_cpus = active_num_cpus + 1;
goto done;
}
}
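/*
* Round the fractional ideal size up when the remainder exceeds
* 0.5 CPU, or exceeds 10% of the ideal size. Otherwise the
* fractional remainder is carried along and effectively dropped
* by the final integer cast below.
*/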
ideal_target_num_cpus = (avg_util * active_num_cpus)/100;
truncated_int = (unsigned int)ideal_target_num_cpus;
roundup = ideal_target_num_cpus - truncated_int;
if (roundup > 0.5)
roundup = 1;
else if (roundup > ideal_target_num_cpus/10)
roundup = 1;
target_num_cpus = ideal_target_num_cpus + roundup;
if (debug > 1)
fprintf(dfp, "ideal %f = %f * %d; target %d\n",
ideal_target_num_cpus, avg_util, active_num_cpus,
(unsigned int)target_num_cpus);
#if 0
ideal_set_size = util_sum / 100.0;
if (ideal_set_size > (up_threshold * active_num_cpus)) {
/*
* Utilization is above up_threshold
* Increase set size
*/
delta_set_size = active_num_cpus * up_slope;
if (delta_set_size < 1)
delta_set_size = 1;
target_set_size = active_num_cpus + delta_set_size;
} else {
/*
* Utilization is below up_threshold
* Possibly decrease set size
*/
delta_set_size = active_num_cpus - ideal_set_size;
delta_set_size *= down_rate;
target_set_size = active_num_cpus - delta_set_size;
}
#endif
done:
if (target_num_cpus > cpuset_size_ceiling)
target_num_cpus = cpuset_size_ceiling;
else if (target_num_cpus < cpuset_size_floor)
target_num_cpus = cpuset_size_floor;
if ((unsigned int) target_num_cpus != active_num_cpus) {
cpuset_change_size(target_num_cpus, active_num_cpus);
do_binding(cpuset_active);
}
return (unsigned int) target_num_cpus;
}
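/*
* Main control loop: snapshot /proc/stat into the history ring
* buffer, compute utilization over the last interval, resize the
* balloon cpuset accordingly, then sleep for a policy-chosen
* interval and repeat until signaled to exit.
*/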
static void main_loop(void)
{
int i;
unsigned int util_index_prev = 0;
unsigned int util_index = 1;
unsigned int active_num_cpus = present_num_cpus; // TBD
syslog(LOG_NOTICE, "shepherd started.");
/*
* Initial snapshot
*/
system_snapshot_utilization(util_index_prev);
history[util_index_prev].cpuset_num_cpus = active_num_cpus;
sleep_for_sec(poll_fast_sec);
/*
* run for(ever)
*/
for (i = 0; ; ++i) {
double avg_util;
system_snapshot_utilization(util_index);
history[util_index].cpuset_num_cpus = active_num_cpus;
avg_util = cpuset_calculate_utilization(util_index_prev, util_index, active_num_cpus);
if (exit_main_loop)
break;
active_num_cpus = cpuset_update(avg_util, active_num_cpus);
sleep_for_sec(get_next_polling_interval_sec(avg_util, active_num_cpus));
util_index_prev = util_index;
util_index += 1;
if (util_index >= HISTORY_LENGTH)
util_index = 0;
}
syslog(LOG_NOTICE, "shepherd terminated.");
}
pid_t child_pid;
/*
* nanny - babysit the child
* leave a note for the parent main loop
* if the child exits
*/
static void nanny(__attribute__((unused))int signal)
{
if (waitpid(child_pid, NULL, WNOHANG) == child_pid)
exit_main_loop = 1;
}
/*
* fork_it()
*/
static void fork_it(char **argv)
{
child_pid = fork();
if (child_pid < 0)
err(1, "fork");
if (child_pid == 0) { /* child: run the command */
execvp(argv[0], argv);
err(1, "%s", argv[0]);
} else { /* parent */
signal(SIGCHLD, nanny);
}
}
static void signal_handler_stats(int signal)
{
if (debug)
fprintf(dfp, "signal %d received\n", signal);
dump_statistics();
}
static void signal_handler_end_it(int signal)
{
if (debug)
fprintf(dfp, "caught signal %d\n", signal);
exit_main_loop = 1;
}
static void signals_init(void)
{
signal(SIGALRM, signal_handler_stats);
signal(SIGUSR1, signal_handler_stats);
signal(SIGINT, signal_handler_end_it);
}
void copy_file_by_line(char *from_path, char *to_path)
{
FILE *from_fp, *to_fp;
char line_buf[1024];
char *s;
int retval;
from_fp = fopen_or_die(from_path, "r");
to_fp = fopen_or_die(to_path, "w");
while (fgets(line_buf, sizeof(line_buf), from_fp) != NULL) {
if(debug > 1) fprintf(dfp, "copying: %s", line_buf);
retval = fputs(line_buf, to_fp);
if (retval == EOF)
err(1, "fputs cpuset.mems");
rewind(to_fp);
}
retval = fclose(from_fp);
if (retval)
err(1, "close %s", from_path);
retval = fclose(to_fp);
if (retval)
err(1, "close %s", to_path);
}
char cpuset_base[] = "/shepherd";
char cpuset_balloon[] = "balloon";
static void cpuset_init(void)
{
int retval;
retval = access(cpuset_base, F_OK);
if (retval) {
if (debug)
fprintf(dfp, "making %s\n", cpuset_base);
retval = mkdir(cpuset_base, 0755);
if (retval) {
if (errno == EEXIST)
fprintf(dfp, "okay, %s is already there\n", cpuset_base);
else
err(1, "could not create '%s'", cpuset_base);
}
}
retval = mount("dummy", cpuset_base, "cgroup", 0, "cpuset");
if (retval) {
if (errno != EBUSY)
err(retval, "mount cpuset");
if (debug)
fprintf(dfp, "cpuset already mounted\n");
}
retval = chdir(cpuset_base);
if (retval)
err(retval, "%s", cpuset_base);
retval = mkdir(cpuset_balloon, 0755);
if (retval) {
if (errno == EEXIST)
fprintf(dfp, "okay, %s is already there\n", cpuset_balloon);
else
err(1, "could not create '%s'", cpuset_balloon);
}
copy_file_by_line("cpuset.mems", "balloon/cpuset.mems");
copy_file_by_line("cpuset.cpus", "balloon/cpuset.cpus");
copy_file_by_line("tasks", "balloon/tasks");
}
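/*
* Classic double-fork daemonization: fork and setsid() to detach
* from the controlling terminal, fork again so the daemon can
* never reacquire one, then reset umask, chdir to /, and close
* all inherited file descriptors.
*/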
void daemonize(void)
{
pid_t pid;
int i;
pid = fork();
if (pid < 0)
err(1, "fork");
if (pid > 0)
exit(0); /* parent exits */
if (setsid() < 0)
err(1, "setsid");
signal(SIGCHLD, SIG_IGN);
signal(SIGHUP, SIG_IGN);
pid = fork();
if (pid < 0)
err(2, "Fork");
if (pid > 0)
exit(0); /* parent exits */
umask(0);
chdir("/");
for (i = sysconf(_SC_OPEN_MAX); i >= 0; i--)
close(i);
}
void debug_init(void)
{
if (output_path == NULL) {
dfp = stderr;
//output_path = "/tmp/shepherd.log";
} else {
dfp = fopen_or_die(output_path, "w");
}
}
int main(int argc, char **argv)
{
srandom((unsigned int)time(NULL));
defaults_init();
cmdline(argc, argv);
debug_init();
if (!debug)
daemonize();
openlog("shepherd", LOG_PID, LOG_DAEMON);
topology_probe();
cpuset_init();
stats_init();
if (argc - optind)
fork_it(argv + optind);
signals_init();
main_loop();
dump_statistics();
do_binding(cpuset_present); /* restore system binding */
return 0;
}