/*
* stalld: starvation detection and avoidance (with bounds).
*
 * This program was born after Daniel and Juri started, once again, debugging
 * problems caused by kernel threads starving due to busy-loop SCHED_FIFO threads.
*
 * The idea is simple: after detecting a thread starving on a given CPU for a
 * given period, this thread will receive a "bounded" chance to run, using
 * SCHED_DEADLINE. In this way, the starving thread is able to make progress
 * while causing only a bounded amount of Operating System noise (OS Noise).
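 *
 * For example, with the default parameters below (20 us of runtime for
 * every 1 s period), a boosted thread consumes at most 0.002% of a CPU.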
*
* SPDX-License-Identifier: GPL-2.0
*
* Copyright (C) 2020 Red Hat Inc, Daniel Bristot de Oliveira <bristot@redhat.com>
*
*/
#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <linux/sched.h>
#include "stalld.h"
/*
* logging.
*/
int config_verbose = 0;
int config_write_kmesg = 0;
int config_log_syslog = 1;
int config_log_only = 0;
int config_foreground = 0;
/*
* boost parameters (time in nanoseconds).
*/
unsigned long config_dl_period = 1000000000;
unsigned long config_dl_runtime = 20000;
/*
* fifo boost parameters
*/
unsigned long config_fifo_priority = 98;
unsigned long config_force_fifo = 0;
/*
* control loop (time in seconds)
*/
long config_starving_threshold = 60;
long config_boost_duration = 3;
long config_aggressive = 0;
/*
* XXX: Make it a cpu mask, lazy Daniel!
*/
int config_monitor_all_cpus = 1;
char *config_monitored_cpus;
/*
 * Initial sched_debug buffer size, known to be enough in most cases. It
 * grows if the file gets larger.
*/
int config_buffer_size = BUFFER_SIZE;
/*
* boolean to choose between deadline and fifo
*/
int boost_policy;
/*
* variable to indicate if stalld is running or
* shutting down
*/
int running = 1;
/*
* read the contents of /proc/sched_debug into
* the input buffer
*/
int read_sched_debug(char *buffer, int size)
{
int position = 0;
int retval;
int fd;
fd = open("/proc/sched_debug", O_RDONLY);
	if (fd < 0)
goto out_error;
do {
retval = read(fd, &buffer[position], size - position);
		if (retval < 0)
goto out_close_fd;
position += retval;
} while (retval > 0 && position < size);
	if (position > 0)
		buffer[position-1] = '\0';
if (position + 100 > config_buffer_size) {
config_buffer_size = config_buffer_size * 2;
log_msg("sched_debug is getting larger, increasing the buffer to %d\n", config_buffer_size);
}
close(fd);
return position;
out_close_fd:
close(fd);
out_error:
return 0;
}
/*
* find the start of a cpu information block in the
* input buffer
*/
char *get_cpu_info_start(char *buffer, int cpu)
{
/* 'cpu#9999,\0' */
char cpu_header[10];
	snprintf(cpu_header, sizeof(cpu_header), "cpu#%d,", cpu);
return strstr(buffer, cpu_header);
}
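/*
 * find the start of the next cpu information block, given the start
 * of the current one.
 */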
char *get_next_cpu_info_start(char *start)
{
const char *next_cpu = "cpu#";
	/* Skip the current cpu definition */
start += 10;
return strstr(start, next_cpu);
}
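/*
 * copy a single cpu's information block from the sched_debug buffer
 * into its own allocated buffer.
 */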
char *alloc_and_fill_cpu_buffer(int cpu, char *sched_dbg, int sched_dbg_size)
{
char *next_cpu_start;
char *cpu_buffer;
char *cpu_start;
int size = 0;
cpu_start = get_cpu_info_start(sched_dbg, cpu);
/*
* The CPU might be offline.
*/
if (!cpu_start)
return NULL;
next_cpu_start = get_next_cpu_info_start(cpu_start);
/*
	 * If the next CPU was not found, this should be the last one, so
	 * use the end of the file.
*/
if (!next_cpu_start)
next_cpu_start = sched_dbg + sched_dbg_size;
size = next_cpu_start - cpu_start;
if (size <= 0)
return NULL;
cpu_buffer = malloc(size);
if (!cpu_buffer)
return NULL;
strncpy(cpu_buffer, cpu_start, size);
cpu_buffer[size-1] = '\0';
return cpu_buffer;
}
/*
* Example:
* ' S task PID tree-key switches prio wait-time sum-exec sum-sleep'
* '-----------------------------------------------------------------------------------------------------------'
* ' I rcu_gp 3 13.973264 2 100 0.000000 0.004469 0.000000 0 0 /
*/
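/*
 * parse the runnable tasks in a cpu buffer, filling up to nr_entries
 * task_info entries with comm, pid, context switches, and priority.
 */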
int fill_waiting_task(char *buffer, struct task_info *task_info, int nr_entries)
{
struct task_info *task;
char *start = buffer;
int tasks = 0;
int comm_size;
char *end;
while (tasks < nr_entries) {
task = &task_info[tasks];
/*
		 * We only care about tasks in the runnable (R) state.
		 * Note: the currently scheduled task shows up as
		 * "\n>R", so it is naturally skipped.
*
*/
start = strstr(start, "\n R");
/*
* if no match then there are no more Runnable tasks
*/
if (!start)
break;
/*
* Skip '\n R'
*/
start = &start[3];
/*
* skip the spaces.
*/
while(start[0] == ' ')
start++;
end = start;
while(end[0] != ' ')
end++;
comm_size = end - start;
if (comm_size > 15) {
warn("comm_size is too large: %d\n", comm_size);
comm_size = 15;
}
strncpy(task->comm, start, comm_size);
task->comm[comm_size] = 0;
		/*
		 * go to the end of the task comm.
		 */
		start = end;
		task->pid = strtol(start, &end, 10);
		/*
		 * go to the end of the pid.
		 */
		start = end;
/*
* skip the tree-key
*/
while(start[0] == ' ')
start++;
while(start[0] != ' ')
start++;
task->ctxsw = strtol(start, &end, 10);
start = end;
task->prio = strtol(start, &end, 10);
task->since = time(NULL);
/*
		 * go to the end and try to find the next occurrence.
*/
start = end;
tasks++;
}
return tasks;
}
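/*
 * print the list of waiting tasks of a cpu, used in verbose mode.
 */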
void print_waiting_tasks(struct cpu_info *cpu_info)
{
struct task_info *task;
time_t now = time(NULL);
int i;
printf("CPU %d has %d waiting tasks\n", cpu_info->id, cpu_info->nr_waiting_tasks);
if (!cpu_info->nr_waiting_tasks)
return;
for (i = 0; i < cpu_info->nr_waiting_tasks; i++) {
task = &cpu_info->starving[i];
printf("%15s %9d %9d %9d %9ld\n", task->comm, task->pid, task->prio, task->ctxsw, (now - task->since));
}
return;
}
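/*
 * merge the old and new task lists: if a task appears in both with the
 * same context switch count, it has not run in the meantime, so keep
 * its original "since" timestamp.
 */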
void merge_task_info(struct task_info *old_tasks, int nr_old, struct task_info *new_tasks, int nr_new)
{
struct task_info *old_task;
struct task_info *new_task;
int i;
int j;
for (i = 0; i < nr_old; i++) {
old_task = &old_tasks[i];
for (j = 0; j < nr_new; j++) {
new_task = &new_tasks[j];
if (old_task->pid == new_task->pid) {
if (old_task->ctxsw == new_task->ctxsw)
new_task->since = old_task->since;
break;
}
}
}
}
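/*
 * parse the sched_debug information of a single cpu, updating the run
 * queue counters and the list of waiting tasks.
 */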
int parse_cpu_info(struct cpu_info *cpu_info, char *buffer, int buffer_size)
{
struct task_info *old_tasks = cpu_info->starving;
int nr_old_tasks = cpu_info->nr_waiting_tasks;
long nr_running, nr_rt_running;
int cpu = cpu_info->id;
char *cpu_buffer;
int retval = 0;
cpu_buffer = alloc_and_fill_cpu_buffer(cpu, buffer, buffer_size);
/*
	 * It is not necessarily a problem: the CPU might be offline. Clean
	 * up and leave.
*/
if (!cpu_buffer) {
if (old_tasks)
free(old_tasks);
cpu_info->nr_waiting_tasks = 0;
cpu_info->nr_running = 0;
cpu_info->nr_rt_running = 0;
		cpu_info->starving = NULL;
goto out;
}
nr_running = get_variable_long_value(cpu_buffer, ".nr_running");
nr_rt_running = get_variable_long_value(cpu_buffer, ".rt_nr_running");
if ((nr_running == -1) || (nr_rt_running == -1)) {
retval = -EINVAL;
goto out_free;
}
cpu_info->nr_running = nr_running;
cpu_info->nr_rt_running = nr_rt_running;
	cpu_info->starving = malloc(sizeof(struct task_info) * cpu_info->nr_running);
	if (!cpu_info->starving && cpu_info->nr_running > 0) {
		if (old_tasks)
			free(old_tasks);
		cpu_info->nr_waiting_tasks = 0;
		retval = -ENOMEM;
		goto out_free;
	}
cpu_info->nr_waiting_tasks = fill_waiting_task(cpu_buffer, cpu_info->starving, cpu_info->nr_running);
if (old_tasks) {
		merge_task_info(old_tasks, nr_old_tasks, cpu_info->starving, cpu_info->nr_waiting_tasks);
free(old_tasks);
}
out_free:
free(cpu_buffer);
out:
return retval;
}
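/*
 * save the current scheduling attributes of a pid, so they can be
 * restored after the boost.
 */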
int get_current_policy(int pid, struct sched_attr *attr)
{
int ret;
ret = sched_getattr(pid, attr, sizeof(*attr), 0);
if (ret == -1)
log_msg("get_current_policy: failed with error %s\n", strerror(errno));
return ret;
}
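/*
 * boost a task to SCHED_DEADLINE, using the configured runtime and
 * period: by default, 20 us of CPU time at every 1 s period.
 */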
int boost_with_deadline(int pid)
{
int ret;
int flags = 0;
struct sched_attr attr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.sched_policy = SCHED_DEADLINE;
attr.sched_runtime = config_dl_runtime;
attr.sched_deadline = config_dl_period;
attr.sched_period = config_dl_period;
ret = sched_setattr(pid, &attr, flags);
if (ret < 0) {
log_msg("boost_with_deadline failed to boost pid %d: %s\n", pid, strerror(errno));
return ret;
}
log_msg("boosted pid %d using SCHED_DEADLINE\n", pid);
return ret;
}
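/*
 * boost a task to SCHED_FIFO at the configured priority (98 by
 * default), used when SCHED_DEADLINE is not available.
 */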
int boost_with_fifo(int pid)
{
int ret;
int flags = 0;
struct sched_attr attr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.sched_policy = SCHED_FIFO;
attr.sched_priority = config_fifo_priority;
ret = sched_setattr(pid, &attr, flags);
if (ret < 0) {
log_msg("boost_with_fifo failed to boost pid %d: %s\n", pid, strerror(errno));
return ret;
}
log_msg("boosted pid %d using SCHED_FIFO\n", pid);
return ret;
}
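/*
 * restore the scheduling attributes a task had before being boosted.
 */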
int restore_policy(int pid, struct sched_attr *attr)
{
int ret;
int flags = 0;
ret = sched_setattr(pid, attr, flags);
if (ret < 0)
log_msg("restore_policy: failed to restore sched policy for pid %d: %s\n",
pid, strerror(errno));
return ret;
}
/*
* this function emulates the behavior of SCHED_DEADLINE but
* using SCHED_FIFO by boosting the thread, sleeping for runtime,
* changing the pid policy back to its old policy, then sleeping
* for the remainder of the period, repeating until all the
* periods are done.
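 *
 * For example, with the defaults (3 s boost duration, 1 s period, and
 * 20 us runtime), this results in three cycles of 20 us at SCHED_FIFO,
 * each followed by the rest of the second at the original policy.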
*/
void do_fifo_boost(int pid, struct sched_attr *old_attr)
{
int i;
	/*
	 * config_boost_duration is in seconds, while the deadline
	 * parameters are in nanoseconds: convert before dividing.
	 */
	int nr_periods = (config_boost_duration * 1000000000UL) / config_dl_period;
struct timespec runtime_ts;
struct timespec remainder_ts;
struct timespec ts;
/*
* setup the runtime sleep
*/
memset(&runtime_ts, 0, sizeof(runtime_ts));
runtime_ts.tv_nsec = config_dl_runtime;
normalize_timespec(&runtime_ts);
/*
* setup the remainder of the period sleep
*/
memset(&remainder_ts, 0, sizeof(remainder_ts));
remainder_ts.tv_nsec = config_dl_period - config_dl_runtime;
normalize_timespec(&remainder_ts);
	for (i = 0; i < nr_periods; i++) {
boost_with_fifo(pid);
ts = runtime_ts;
clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, 0);
restore_policy(pid, old_attr);
ts = remainder_ts;
clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, 0);
}
}
int boost_starving_task(int pid)
{
int ret;
struct sched_attr attr;
/*
	 * Save the current scheduling attributes, to be restored at the
	 * end of the boosting period.
*/
ret = get_current_policy(pid, &attr);
if (ret < 0)
return ret;
/*
* Boost.
*/
if (boost_policy == SCHED_DEADLINE) {
ret = boost_with_deadline(pid);
if (ret < 0)
return ret;
sleep(config_boost_duration);
ret = restore_policy(pid, &attr);
if (ret < 0)
return ret;
	} else {
		do_fifo_boost(pid, &attr);
	}
/*
	 * XXX: If the process dies, we get an error. Deal with that
	 * later.
* if (ret < 0)
* die("sched_setattr failed to set the normal priorities");
*/
return 0;
}
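/*
 * check if any waiting task on the cpu crossed the starving threshold
 * and, unless in log-only mode, boost it.
 */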
int check_starving_tasks(struct cpu_info *cpu)
{
struct task_info *tasks = cpu->starving;
struct task_info *task;
int starving = 0;
int i;
for (i = 0; i < cpu->nr_waiting_tasks; i++) {
task = &tasks[i];
		if ((time(NULL) - task->since) >= config_starving_threshold) {
			log_msg("%s-%d starved on CPU %d for %ld seconds\n",
				task->comm, task->pid, cpu->id,
				(time(NULL) - task->since));
			starving += 1;
			/*
			 * If we are only logging, just reset the time counter
			 * after logging.
			 */
			if (config_log_only) {
				task->since = time(NULL);
				continue;
			}
boost_starving_task(task->pid);
}
}
return starving;
}
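/*
 * check if any waiting task crossed half of the starving threshold,
 * which is the conservative mode cue to start a per-cpu monitor.
 */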
int check_might_starve_tasks(struct cpu_info *cpu)
{
struct task_info *tasks = cpu->starving;
struct task_info *task;
int starving = 0;
int i;
if (cpu->thread_running)
warn("checking a running thread!!!???");
for (i = 0; i < cpu->nr_waiting_tasks; i++) {
task = &tasks[i];
		if ((time(NULL) - task->since) >= config_starving_threshold/2) {
			log_msg("%s-%d might starve on CPU %d (waiting for %ld seconds)\n",
				task->comm, task->pid, cpu->id,
				(time(NULL) - task->since));
starving = 1;
}
}
return starving;
}
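/*
 * per-cpu monitoring thread: read sched_debug, parse this cpu's block,
 * and boost any starving task, sleeping one second per iteration.
 */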
void *cpu_main(void *data)
{
struct cpu_info *cpu = data;
int nothing_to_do = 0;
int retval;
while (cpu->thread_running && running) {
/*
		 * The buffer size may have increased. See read_sched_debug().
*/
if (config_buffer_size != cpu->buffer_size) {
char *old_buffer = cpu->buffer;
cpu->buffer = realloc(cpu->buffer, config_buffer_size);
			if (!cpu->buffer) {
				warn("failed to increase the buffer... continuing");
				cpu->buffer = old_buffer;
} else {
cpu->buffer_size = config_buffer_size;
}
}
retval = read_sched_debug(cpu->buffer, cpu->buffer_size);
		if (!retval) {
			warn("failed to read the sched_debug file");
warn("Dazed and confused, but trying to continue");
continue;
}
retval = parse_cpu_info(cpu, cpu->buffer, cpu->buffer_size);
if (retval) {
warn("error parsing CPU info");
warn("Dazed and confused, but trying to continue");
continue;
}
if (config_verbose)
print_waiting_tasks(cpu);
if (cpu->nr_rt_running && cpu->nr_waiting_tasks) {
nothing_to_do = 0;
check_starving_tasks(cpu);
} else {
nothing_to_do++;
}
		/*
		 * If not in aggressive mode, give up after 10 cycles with
		 * nothing to do.
		 */
		if (!config_aggressive && nothing_to_do == 10) {
			cpu->thread_running = 0;
			pthread_exit(NULL);
		}
sleep(1);
}
return NULL;
}
static const char *join_thread(pthread_t *thread)
{
void *result;
pthread_join(*thread, &result);
return result;
}
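/*
 * aggressive mode: start a monitoring thread on every monitored cpu
 * and keep them running from the start.
 */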
void aggressive_main(struct cpu_info *cpus, int nr_cpus)
{
int i;
for (i = 0; i < nr_cpus; i++) {
if (!should_monitor(i))
continue;
cpus[i].id = i;
cpus[i].thread_running = 1;
pthread_create(&cpus[i].thread, NULL, cpu_main, &cpus[i]);
}
for (i = 0; i < nr_cpus; i++) {
if (!should_monitor(i))
continue;
join_thread(&cpus[i].thread);
}
}
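/*
 * conservative mode: a single thread scans all cpus and only spawns a
 * detached per-cpu thread when a task might be starving there.
 */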
void conservative_main(struct cpu_info *cpus, int nr_cpus)
{
	pthread_attr_t detached;
struct cpu_info *cpu;
char *buffer = NULL;
int buffer_size = 0;
int retval;
int i;
buffer = malloc(config_buffer_size);
if (!buffer)
die("cannot allocate buffer");
buffer_size = config_buffer_size;
	pthread_attr_init(&detached);
	pthread_attr_setdetachstate(&detached, PTHREAD_CREATE_DETACHED);
for (i = 0; i < nr_cpus; i++) {
cpus[i].id = i;
cpus[i].thread_running = 0;
}
while (running) {
/*
		 * The buffer size may have increased. See read_sched_debug().
*/
if (config_buffer_size != buffer_size) {
char *old_buffer = buffer;
buffer = realloc(buffer, config_buffer_size);
			if (!buffer) {
				warn("failed to increase the buffer... continuing");
				buffer = old_buffer;
} else {
buffer_size = config_buffer_size;
}
}
retval = read_sched_debug(buffer, buffer_size);
		if (!retval) {
warn("Dazed and confused, but trying to continue");
continue;
}
for (i = 0; i < nr_cpus; i++) {
if (!should_monitor(i))
continue;
cpu = &cpus[i];
if (cpu->thread_running)
continue;
retval = parse_cpu_info(cpu, buffer, buffer_size);
if (retval) {
warn("error parsing CPU info");
warn("Dazed and confused, but trying to continue");
continue;
}
if (config_verbose)
printf("\tchecking cpu %d - rt: %d - starving: %d\n",
i, cpu->nr_rt_running, cpu->nr_waiting_tasks);
if (check_might_starve_tasks(cpu)) {
cpus[i].id = i;
cpus[i].thread_running = 1;
			pthread_create(&cpus[i].thread, &detached, cpu_main, &cpus[i]);
}
}
		sleep(MAX(config_starving_threshold / 20, 1));
}
}
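/*
 * check which boosting policy can be used: try SCHED_DEADLINE on
 * ourselves, fall back to SCHED_FIFO, and if neither works, fall back
 * to log-only mode.
 */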
int check_policies(void)
{
int ret;
int saved_runtime = config_dl_runtime;
int boosted = SCHED_DEADLINE;
struct sched_attr attr;
/*
	 * if fifo was forced on the command line, don't bother checking:
	 * just use SCHED_FIFO
*/
if (config_force_fifo) {
log_msg("forcing SCHED_FIFO for boosting\n");
return SCHED_FIFO;
}
// set runtime to half of period
config_dl_runtime = config_dl_period / 2;
// save off the current policy
if (get_current_policy(0, &attr))
die("check_policies: unable to get scheduling policy!");
// try boosting to SCHED_DEADLINE
ret = boost_with_deadline(0);
if (ret < 0) {
// try boosting with fifo to see if we have permission
ret = boost_with_fifo(0);
if (ret < 0) {
log_msg("check_policies: unable to change policy to either deadline or fifo,"
"defaulting to logging only\n");
config_log_only = 1;
boosted = 0;
}
else
boosted = SCHED_FIFO;
}
// if we successfully boosted to something, restore the old policy
if (boosted) {
ret = restore_policy(0, &attr);
// if we can't restore the policy then quit now
if (ret < 0)
die("check_policies: unable to restore policy: %s\n", strerror(errno));
}
// restore the actual runtime value
config_dl_runtime = saved_runtime;
if (boosted == SCHED_DEADLINE)
log_msg("using SCHED_DEADLINE for boosting\n");
else if (boosted == SCHED_FIFO)
log_msg("using SCHED_FIFO for boosting\n");
return boosted;
}
int main(int argc, char **argv)
{
struct cpu_info *cpus;
int nr_cpus;
int i;
parse_args(argc, argv);
/*
* see if deadline scheduler is available
*/
boost_policy = check_policies();
nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
if (nr_cpus < 1)
die("Can not calculate number of CPUS\n");
cpus = malloc(sizeof(struct cpu_info) * nr_cpus);
if (!cpus)
die("Cannot allocate memory");
memset(cpus, 0, sizeof(struct cpu_info) * nr_cpus);
for (i = 0; i < nr_cpus; i++) {
cpus[i].buffer = malloc(config_buffer_size);
if (!cpus[i].buffer)
die("Cannot allocate memory");
cpus[i].buffer_size = config_buffer_size;
}
if (config_log_syslog)
openlog("stalld", 0, LOG_DAEMON);
setup_signal_handling();
turn_off_rt_throttling();
if (!config_foreground)
deamonize();
write_pidfile();
if (config_aggressive)
aggressive_main(cpus, nr_cpus);
else
conservative_main(cpus, nr_cpus);
if (config_log_syslog)
closelog();
exit(0);
}