/*
* stalld: starvation detection and avoidance (with bounds).
*
 * This program was born after Daniel and Juri started, once again, debugging
 * problems caused by kernel threads starving due to busy-loop SCHED_FIFO threads.
*
 * The idea is simple: after detecting a thread starving on a given CPU for a
 * given period, this thread will receive a "bounded" chance to run, using
 * SCHED_DEADLINE. In this way, the starving thread is able to make progress
 * while causing only a bounded amount of Operating System noise (OS Noise).
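 *
 * For example, with the default parameters below (20 us of runtime for
 * every 1 s period), a boosted thread consumes at most 0.002% of a CPU.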
*
* SPDX-License-Identifier: GPL-2.0
*
* Copyright (C) 2020 Red Hat Inc, Daniel Bristot de Oliveira <bristot@redhat.com>
*
*/
#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <linux/sched.h>
#include "stalld.h"
/*
* logging.
*/
int config_verbose = 0;
int config_write_kmesg = 0;
int config_log_syslog = 1;
int config_log_only = 0;
int config_foreground = 0;
/*
* boost parameters (time in nanoseconds).
*/
unsigned long config_dl_period = 1000000000;
unsigned long config_dl_runtime = 20000;
/*
* fifo boost parameters
*/
unsigned long config_fifo_priority = 98;
unsigned long config_force_fifo = 0;
/*
* control loop (time in seconds)
*/
long config_starving_threshold = 60;
long config_boost_duration = 3;
long config_aggressive = 0;
/*
* XXX: Make it a cpu mask, lazy Daniel!
*/
int config_monitor_all_cpus = 1;
char *config_monitored_cpus;
/*
 * Initial sched_debug buffer size, known to be enough in most cases. It
 * grows if the file gets larger.
*/
int config_buffer_size = BUFFER_SIZE;
/*
* boolean to choose between deadline and fifo
*/
int boost_policy;
/*
* variable to indicate if stalld is running or
* shutting down
*/
int running = 1;
/*
* read the contents of /proc/sched_debug into
* the input buffer
*/
int read_sched_debug(char *buffer, int size)
{
int position = 0;
int retval;
int fd;
fd = open("/proc/sched_debug", O_RDONLY);
	if (fd < 0)
goto out_error;
do {
retval = read(fd, &buffer[position], size - position);
		if (retval < 0)
goto out_close_fd;
position += retval;
} while (retval > 0 && position < size);
	if (position > 0)
		buffer[position-1] = '\0';
if (position + 100 > config_buffer_size) {
config_buffer_size = config_buffer_size * 2;
log_msg("sched_debug is getting larger, increasing the buffer to %d\n", config_buffer_size);
}
close(fd);
return position;
out_close_fd:
close(fd);
out_error:
return 0;
}
/*
* find the start of a cpu information block in the
* input buffer
*/
char *get_cpu_info_start(char *buffer, int cpu)
{
/* 'cpu#9999,\0' */
char cpu_header[10];
	snprintf(cpu_header, sizeof(cpu_header), "cpu#%d,", cpu);
return strstr(buffer, cpu_header);
}
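/*
 * find the start of the next cpu information block, given the start
 * of the current one.
 */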
char *get_next_cpu_info_start(char *start)
{
const char *next_cpu = "cpu#";
	/* Skip the current cpu definition */
start += 10;
return strstr(start, next_cpu);
}
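/*
 * copy a single cpu's information block from the sched_debug buffer
 * into its own allocated buffer.
 */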
char *alloc_and_fill_cpu_buffer(int cpu, char *sched_dbg, int sched_dbg_size)
{
char *next_cpu_start;
char *cpu_buffer;
char *cpu_start;
int size = 0;
cpu_start = get_cpu_info_start(sched_dbg, cpu);
/*
* The CPU might be offline.
*/
if (!cpu_start)
return NULL;
next_cpu_start = get_next_cpu_info_start(cpu_start);
/*
	 * If the next CPU was not found, this should be the last one, so
	 * use the end of the file.
*/
if (!next_cpu_start)
next_cpu_start = sched_dbg + sched_dbg_size;
size = next_cpu_start - cpu_start;
if (size <= 0)
return NULL;
cpu_buffer = malloc(size);
if (!cpu_buffer)
return NULL;
strncpy(cpu_buffer, cpu_start, size);
cpu_buffer[size-1] = '\0';
return cpu_buffer;
}
/*
* Example:
* ' S task PID tree-key switches prio wait-time sum-exec sum-sleep'
* '-----------------------------------------------------------------------------------------------------------'
* ' I rcu_gp 3 13.973264 2 100 0.000000 0.004469 0.000000 0 0 /
*/
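/*
 * parse the runnable tasks in a cpu buffer, filling up to nr_entries
 * task_info entries with comm, pid, context switches, and priority.
 */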
int fill_waiting_task(char *buffer, struct task_info *task_info, int nr_entries)
{
struct task_info *task;
char *start = buffer;
int tasks = 0;
int comm_size;
char *end;
while (tasks < nr_entries) {
task = &task_info[tasks];
/*
		 * We only care about tasks in the runnable (R) state.
		 * Note: the currently scheduled task shows up as
		 * "\n>R", so it is naturally skipped.
*
*/
start = strstr(start, "\n R");
/*
* if no match then there are no more Runnable tasks
*/
if (!start)
break;
/*
* Skip '\n R'
*/
start = &start[3];
/*
* skip the spaces.
*/
while(start[0] == ' ')
start++;
end = start;
while(end[0] != ' ')
end++;
comm_size = end - start;
if (comm_size > 15) {
warn("comm_size is too large: %d\n", comm_size);
comm_size = 15;
}
strncpy(task->comm, start, comm_size);
task->comm[comm_size] = 0;
		/*
		 * go to the end of the task comm.
		 */
		start = end;
		task->pid = strtol(start, &end, 10);
		/*
		 * go to the end of the pid.
		 */
		start = end;
/*
* skip the tree-key
*/
while(start[0] == ' ')
start++;
while(start[0] != ' ')
start++;
task->ctxsw = strtol(start, &end, 10);
start = end;
task->prio = strtol(start, &end, 10);
task->since = time(NULL);
/*
		 * go to the end and try to find the next occurrence.
*/
start = end;
tasks++;
}
return tasks;
}
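/*
 * print the list of waiting tasks of a cpu, used in verbose mode.
 */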
void print_waiting_tasks(struct cpu_info *cpu_info)
{
struct task_info *task;
time_t now = time(NULL);
int i;
printf("CPU %d has %d waiting tasks\n", cpu_info->id, cpu_info->nr_waiting_tasks);
if (!cpu_info->nr_waiting_tasks)
return;
for (i = 0; i < cpu_info->nr_waiting_tasks; i++) {
task = &cpu_info->starving[i];
printf("%15s %9d %9d %9d %9ld\n", task->comm, task->pid, task->prio, task->ctxsw, (now - task->since));
}
return;
}
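/*
 * merge the old and new task lists: if a task appears in both with the
 * same context switch count, it has not run in the meantime, so keep
 * its original "since" timestamp.
 */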
void merge_task_info(struct task_info *old_tasks, int nr_old, struct task_info *new_tasks, int nr_new)
{
struct task_info *old_task;
struct task_info *new_task;
int i;
int j;
for (i = 0; i < nr_old; i++) {
old_task = &old_tasks[i];
for (j = 0; j < nr_new; j++) {
new_task = &new_tasks[j];
if (old_task->pid == new_task->pid) {
if (old_task->ctxsw == new_task->ctxsw)
new_task->since = old_task->since;
break;
}
}
}
}
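/*
 * parse the sched_debug information of a single cpu, updating the run
 * queue counters and the list of waiting tasks.
 */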
int parse_cpu_info(struct cpu_info *cpu_info, char *buffer, int buffer_size)
{
struct task_info *old_tasks = cpu_info->starving;
int nr_old_tasks = cpu_info->nr_waiting_tasks;
long nr_running, nr_rt_running;
int cpu = cpu_info->id;
char *cpu_buffer;
int retval = 0;
cpu_buffer = alloc_and_fill_cpu_buffer(cpu, buffer, buffer_size);
/*
	 * It is not necessarily a problem: the CPU might be offline. Clean
	 * up and leave.
*/
if (!cpu_buffer) {
if (old_tasks)
free(old_tasks);
cpu_info->nr_waiting_tasks = 0;
cpu_info->nr_running = 0;
cpu_info->nr_rt_running = 0;
		cpu_info->starving = NULL;
goto out;
}
nr_running = get_variable_long_value(cpu_buffer, ".nr_running");
nr_rt_running = get_variable_long_value(cpu_buffer, ".rt_nr_running");
if ((nr_running == -1) || (nr_rt_running == -1)) {
retval = -EINVAL;
goto out_free;
}
cpu_info->nr_running = nr_running;
cpu_info->nr_rt_running = nr_rt_running;
	cpu_info->starving = malloc(sizeof(struct task_info) * cpu_info->nr_running);
	if (!cpu_info->starving && cpu_info->nr_running > 0) {
		if (old_tasks)
			free(old_tasks);
		cpu_info->nr_waiting_tasks = 0;
		retval = -ENOMEM;
		goto out_free;
	}
cpu_info->nr_waiting_tasks = fill_waiting_task(cpu_buffer, cpu_info->starving, cpu_info->nr_running);
if (old_tasks) {
		merge_task_info(old_tasks, nr_old_tasks, cpu_info->starving, cpu_info->nr_waiting_tasks);
free(old_tasks);
}
out_free:
free(cpu_buffer);
out:
return retval;
}
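/*
 * save the current scheduling attributes of a pid, so they can be
 * restored after the boost.
 */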
int get_current_policy(int pid, struct sched_attr *attr)
{
int ret;
ret = sched_getattr(pid, attr, sizeof(*attr), 0);
if (ret == -1)
log_msg("get_current_policy: failed with error %s\n", strerror(errno));
return ret;
}
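/*
 * boost a task to SCHED_DEADLINE, using the configured runtime and
 * period: by default, 20 us of CPU time at every 1 s period.
 */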
int boost_with_deadline(int pid)
{
int ret;
int flags = 0;
struct sched_attr attr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.sched_policy = SCHED_DEADLINE;
attr.sched_runtime = config_dl_runtime;
attr.sched_deadline = config_dl_period;
attr.sched_period = config_dl_period;
ret = sched_setattr(pid, &attr, flags);
if (ret < 0) {
log_msg("boost_with_deadline failed to boost pid %d: %s\n", pid, strerror(errno));
return ret;
}
log_msg("boosted pid %d using SCHED_DEADLINE\n", pid);
return ret;
}
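/*
 * boost a task to SCHED_FIFO at the configured priority (98 by
 * default), used when SCHED_DEADLINE is not available.
 */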
int boost_with_fifo(int pid)
{
int ret;
int flags = 0;
struct sched_attr attr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.sched_policy = SCHED_FIFO;
attr.sched_priority = config_fifo_priority;
ret = sched_setattr(pid, &attr, flags);
if (ret < 0) {
log_msg("boost_with_fifo failed to boost pid %d: %s\n", pid, strerror(errno));
return ret;
}
log_msg("boosted pid %d using SCHED_FIFO\n", pid);
return ret;
}
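/*
 * restore the scheduling attributes a task had before being boosted.
 */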
int restore_policy(int pid, struct sched_attr *attr)
{
int ret;
int flags = 0;
ret = sched_setattr(pid, attr, flags);
if (ret < 0)
log_msg("restore_policy: failed to restore sched policy for pid %d: %s\n",
pid, strerror(errno));
return ret;
}
/*
* this function emulates the behavior of SCHED_DEADLINE but
* using SCHED_FIFO by boosting the thread, sleeping for runtime,
* changing the pid policy back to its old policy, then sleeping
* for the remainder of the period, repeating until all the
* periods are done.
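 *
 * For example, with the defaults (3 s boost duration, 1 s period, and
 * 20 us runtime), this results in three cycles of 20 us at SCHED_FIFO,
 * each followed by the rest of the second at the original policy.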
*/
void do_fifo_boost(int pid, struct sched_attr *old_attr)
{
int i;
	/*
	 * config_boost_duration is in seconds, while the deadline
	 * parameters are in nanoseconds: convert before dividing.
	 */
	int nr_periods = (config_boost_duration * 1000000000UL) / config_dl_period;
struct timespec runtime_ts;
struct timespec remainder_ts;
struct timespec ts;
/*
* setup the runtime sleep
*/
memset(&runtime_ts, 0, sizeof(runtime_ts));
runtime_ts.tv_nsec = config_dl_runtime;
normalize_timespec(&runtime_ts);
/*
* setup the remainder of the period sleep
*/
memset(&remainder_ts, 0, sizeof(remainder_ts));
remainder_ts.tv_nsec = config_dl_period - config_dl_runtime;
normalize_timespec(&remainder_ts);
	for (i = 0; i < nr_periods; i++) {
boost_with_fifo(pid);
ts = runtime_ts;
clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, 0);
restore_policy(pid, old_attr);
ts = remainder_ts;
clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, 0);
}
}
int boost_starving_task(int pid)
{
int ret;
struct sched_attr attr;
/*
	 * Save the current scheduling attributes, to be restored at the
	 * end of the boosting period.
*/
ret = get_current_policy(pid, &attr);
if (ret < 0)
return ret;
/*
* Boost.
*/
if (boost_policy == SCHED_DEADLINE) {
ret = boost_with_deadline(pid);
if (ret < 0)
return ret;
sleep(config_boost_duration);
ret = restore_policy(pid, &attr);
if (ret < 0)
return ret;
	} else {
		do_fifo_boost(pid, &attr);
	}
/*
	 * XXX: If the process dies, we get an error. Deal with that
	 * later.
* if (ret < 0)
* die("sched_setattr failed to set the normal priorities");
*/
return 0;
}
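/*
 * check if any waiting task on the cpu crossed the starving threshold
 * and, unless in log-only mode, boost it.
 */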
int check_starving_tasks(struct cpu_info *cpu)
{
struct task_info *tasks = cpu->starving;
struct task_info *task;
int starving = 0;
int i;
for (i = 0; i < cpu->nr_waiting_tasks; i++) {
task = &tasks[i];
		if ((time(NULL) - task->since) >= config_starving_threshold) {
			log_msg("%s-%d starved on CPU %d for %ld seconds\n",
				task->comm, task->pid, cpu->id,
				(time(NULL) - task->since));
			starving += 1;
			/*
			 * If we are only logging, just reset the time counter
			 * after logging.
			 */
			if (config_log_only) {
				task->since = time(NULL);
				continue;
			}
boost_starving_task(task->pid);
}
}
return starving;
}
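/*
 * check if any waiting task crossed half of the starving threshold,
 * which is the conservative mode cue to start a per-cpu monitor.
 */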
int check_might_starve_tasks(struct cpu_info *cpu)
{
struct task_info *tasks = cpu->starving;
struct task_info *task;
int starving = 0;
int i;
if (cpu->thread_running)
warn("checking a running thread!!!???");
for (i = 0; i < cpu->nr_waiting_tasks; i++) {
task = &tasks[i];
		if ((time(NULL) - task->since) >= config_starving_threshold/2) {
			log_msg("%s-%d might starve on CPU %d (waiting for %ld seconds)\n",
				task->comm, task->pid, cpu->id,
				(time(NULL) - task->since));
starving = 1;
}
}
return starving;
}
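/*
 * per-cpu monitoring thread: read sched_debug, parse this cpu's block,
 * and boost any starving task, sleeping one second per iteration.
 */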
void *cpu_main(void *data)
{
struct cpu_info *cpu = data;
int nothing_to_do = 0;
int retval;
while (cpu->thread_running && running) {
/*
		 * The buffer size may have increased. See read_sched_debug().
*/
if (config_buffer_size != cpu->buffer_size) {
char *old_buffer = cpu->buffer;
cpu->buffer = realloc(cpu->buffer, config_buffer_size);
			if (!cpu->buffer) {
				warn("failed to increase the buffer... continuing");
				cpu->buffer = old_buffer;
} else {
cpu->buffer_size = config_buffer_size;
}
}
retval = read_sched_debug(cpu->buffer, cpu->buffer_size);
		if (!retval) {
			warn("failed to read the sched_debug file");
warn("Dazed and confused, but trying to continue");
continue;
}
retval = parse_cpu_info(cpu, cpu->buffer, cpu->buffer_size);
if (retval) {
warn("error parsing CPU info");
warn("Dazed and confused, but trying to continue");
continue;
}
if (config_verbose)
print_waiting_tasks(cpu);
if (cpu->nr_rt_running && cpu->nr_waiting_tasks) {
nothing_to_do = 0;
check_starving_tasks(cpu);
} else {
nothing_to_do++;
}
		/*
		 * If not in aggressive mode, give up after 10 cycles with
		 * nothing to do.
		 */
		if (!config_aggressive && nothing_to_do == 10) {
			cpu->thread_running = 0;
			pthread_exit(NULL);
		}
sleep(1);
}
return NULL;
}
static const char *join_thread(pthread_t *thread)
{
void *result;
pthread_join(*thread, &result);
return result;
}
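/*
 * aggressive mode: start a monitoring thread on every monitored cpu
 * and keep them running from the start.
 */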
void aggressive_main(struct cpu_info *cpus, int nr_cpus)
{
int i;
for (i = 0; i < nr_cpus; i++) {
if (!should_monitor(i))
continue;
cpus[i].id = i;
cpus[i].thread_running = 1;
pthread_create(&cpus[i].thread, NULL, cpu_main, &cpus[i]);
}
for (i = 0; i < nr_cpus; i++) {
if (!should_monitor(i))
continue;
join_thread(&cpus[i].thread);
}
}
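/*
 * conservative mode: a single thread scans all cpus and only spawns a
 * detached per-cpu thread when a task might be starving there.
 */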
void conservative_main(struct cpu_info *cpus, int nr_cpus)
{
	pthread_attr_t detached;
struct cpu_info *cpu;
char *buffer = NULL;
int buffer_size = 0;
int retval;
int i;
buffer = malloc(config_buffer_size);
if (!buffer)
die("cannot allocate buffer");
buffer_size = config_buffer_size;
	pthread_attr_init(&detached);
	pthread_attr_setdetachstate(&detached, PTHREAD_CREATE_DETACHED);
for (i = 0; i < nr_cpus; i++) {
cpus[i].id = i;
cpus[i].thread_running = 0;
}
while (running) {
/*
		 * The buffer size may have increased. See read_sched_debug().
*/
if (config_buffer_size != buffer_size) {
char *old_buffer = buffer;
buffer = realloc(buffer, config_buffer_size);
			if (!buffer) {
				warn("failed to increase the buffer... continuing");
				buffer = old_buffer;
} else {
buffer_size = config_buffer_size;
}
}
retval = read_sched_debug(buffer, buffer_size);
		if (!retval) {
warn("Dazed and confused, but trying to continue");
continue;
}
for (i = 0; i < nr_cpus; i++) {
if (!should_monitor(i))
continue;
cpu = &cpus[i];
if (cpu->thread_running)
continue;
retval = parse_cpu_info(cpu, buffer, buffer_size);
if (retval) {
warn("error parsing CPU info");
warn("Dazed and confused, but trying to continue");
continue;
}
if (config_verbose)
printf("\tchecking cpu %d - rt: %d - starving: %d\n",
i, cpu->nr_rt_running, cpu->nr_waiting_tasks);
if (check_might_starve_tasks(cpu)) {
cpus[i].id = i;
cpus[i].thread_running = 1;
			pthread_create(&cpus[i].thread, &detached, cpu_main, &cpus[i]);
}
}
		sleep(MAX(config_starving_threshold / 20, 1));
}
}
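/*
 * check which boosting policy can be used: try SCHED_DEADLINE on
 * ourselves, fall back to SCHED_FIFO, and if neither works, fall back
 * to log-only mode.
 */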
int check_policies(void)
{
int ret;
int saved_runtime = config_dl_runtime;
int boosted = SCHED_DEADLINE;
struct sched_attr attr;
/*
	 * if fifo was forced on the command line, don't bother checking:
	 * just use SCHED_FIFO
*/
if (config_force_fifo) {
log_msg("forcing SCHED_FIFO for boosting\n");
return SCHED_FIFO;
}
// set runtime to half of period
config_dl_runtime = config_dl_period / 2;
// save off the current policy
if (get_current_policy(0, &attr))
die("check_policies: unable to get scheduling policy!");
// try boosting to SCHED_DEADLINE
ret = boost_with_deadline(0);
if (ret < 0) {
// try boosting with fifo to see if we have permission
ret = boost_with_fifo(0);
if (ret < 0) {
log_msg("check_policies: unable to change policy to either deadline or fifo,"
"defaulting to logging only\n");
config_log_only = 1;
boosted = 0;
}
else
boosted = SCHED_FIFO;
}
// if we successfully boosted to something, restore the old policy
if (boosted) {
ret = restore_policy(0, &attr);
// if we can't restore the policy then quit now
if (ret < 0)
die("check_policies: unable to restore policy: %s\n", strerror(errno));
}
// restore the actual runtime value
config_dl_runtime = saved_runtime;
if (boosted == SCHED_DEADLINE)
log_msg("using SCHED_DEADLINE for boosting\n");
else if (boosted == SCHED_FIFO)
log_msg("using SCHED_FIFO for boosting\n");
return boosted;
}
int main(int argc, char **argv)
{
struct cpu_info *cpus;
int nr_cpus;
int i;
parse_args(argc, argv);
/*
* see if deadline scheduler is available
*/
boost_policy = check_policies();
nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
if (nr_cpus < 1)
die("Can not calculate number of CPUS\n");
cpus = malloc(sizeof(struct cpu_info) * nr_cpus);
if (!cpus)
die("Cannot allocate memory");
memset(cpus, 0, sizeof(struct cpu_info) * nr_cpus);
for (i = 0; i < nr_cpus; i++) {
cpus[i].buffer = malloc(config_buffer_size);
if (!cpus[i].buffer)
die("Cannot allocate memory");
cpus[i].buffer_size = config_buffer_size;
}
if (config_log_syslog)
openlog("stalld", 0, LOG_DAEMON);
setup_signal_handling();
turn_off_rt_throttling();
if (!config_foreground)
deamonize();
write_pidfile();
if (config_aggressive)
aggressive_main(cpus, nr_cpus);
else
conservative_main(cpus, nr_cpus);
if (config_log_syslog)
closelog();
exit(0);
}