blob: 4a55e8a7366ea7f8cff9503517d93820aeb5557e [file] [log] [blame]
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Interactivity improvements by Mike Galbraith
* (C) 2007 Mike Galbraith <efault@gmx.de>
*
* Various enhancements by Dmitry Adamushko.
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation, 2007
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
*
* Scaled math optimizations by Thomas Gleixner
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*/
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
#include <trace/events/sched.h>
#include "sched.h"
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*
* Options are:
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
*/
enum sched_tunable_scaling sysctl_sched_tunable_scaling
= SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
/*
* After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first.
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
/*
* The exponential sliding window over which load is averaged for shares
* distribution.
* (default: 10msec)
*/
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
#ifdef CONFIG_CFS_BANDWIDTH
/*
* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
* each time a cfs_rq requests quota.
*
* Note: in the case that the slice exceeds the runtime remaining (either due
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
* default: 5 msec, units: microseconds
*/
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
* to users decreases. But the relationship is not linear,
* so pick a second-best guess by going with the log2 of the
* number of CPUs.
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
static int get_update_sysctl_factor(void)
{
unsigned int cpus = min_t(int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
case SCHED_TUNABLESCALING_NONE:
factor = 1;
break;
case SCHED_TUNABLESCALING_LINEAR:
factor = cpus;
break;
case SCHED_TUNABLESCALING_LOG:
default:
factor = 1 + ilog2(cpus);
break;
}
return factor;
}
static void update_sysctl(void)
{
unsigned int factor = get_update_sysctl_factor();
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}
void sched_init_granularity(void)
{
update_sysctl();
}
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif
#define WMULT_SHIFT 32
/*
* Shift right and round:
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
/*
* delta *= weight / lw
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
/*
* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
* entities since MIN_SHARES = 2. Treat weight as 1 if less than
* 2^SCHED_LOAD_RESOLUTION.
*/
if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
tmp = (u64)delta_exec * scale_load_down(weight);
else
tmp = (u64)delta_exec;
if (!lw->inv_weight) {
unsigned long w = scale_load_down(lw->weight);
if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
else if (unlikely(!w))
lw->inv_weight = WMULT_CONST;
else
lw->inv_weight = WMULT_CONST / w;
}
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
if (unlikely(tmp > WMULT_CONST))
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
WMULT_SHIFT/2);
else
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return cfs_rq->rq;
}
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!entity_is_task(se));
#endif
return container_of(se, struct task_struct, se);
}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
return p->se.cfs_rq;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
return se->cfs_rq;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return grp->my_q;
}
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update);
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
* enqueued. The fact that we always enqueue bottom-up
* reduces this to two cases.
*/
if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq_of(cfs_rq)->leaf_cfs_rq_list);
} else {
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq_of(cfs_rq)->leaf_cfs_rq_list);
}
cfs_rq->on_list = 1;
/* We should have no load, but we need to update last_decay. */
update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
return 1;
return 0;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
int depth = 0;
for_each_sched_entity(se)
depth++;
return depth;
}
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
int se_depth, pse_depth;
/*
* preemption test can be made between sibling entities who are in the
* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
* both tasks until we find their ancestors who are siblings of common
* parent.
*/
/* First walk up until both entities are at same depth */
se_depth = depth_se(*se);
pse_depth = depth_se(*pse);
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
while (!is_same_group(*se, *pse)) {
*se = parent_entity(*se);
*pse = parent_entity(*pse);
}
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
{
return container_of(se, struct task_struct, se);
}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return container_of(cfs_rq, struct rq, cfs);
}
#define entity_is_task(se) 1
#define for_each_sched_entity(se) \
for (; se; se = NULL)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
return &task_rq(p)->cfs;
}
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
return 1;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
}
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta > 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline int entity_before(struct sched_entity *a,
struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
if (cfs_rq->curr)
vruntime = cfs_rq->curr->vruntime;
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
if (!cfs_rq->curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
int leftmost = 1;
/*
* Find the right place in the rbtree:
*/
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
/*
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
}
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
if (!left)
return NULL;
return rb_entry(left, struct sched_entity, run_node);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return rb_entry(next, struct sched_entity, run_node);
}
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
if (!last)
return NULL;
return rb_entry(last, struct sched_entity, run_node);
}
/**************************************************************
* Scheduling class statistics methods:
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL
return 0;
}
#endif
/*
* delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
*/
static u64 __sched_period(unsigned long nr_running)
{
u64 period = sysctl_sched_latency;
unsigned long nr_latency = sched_nr_latency;
if (unlikely(nr_running > nr_latency)) {
period = sysctl_sched_min_granularity;
period *= nr_running;
}
return period;
}
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
struct load_weight lw;
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
}
slice = calc_delta_mine(slice, se->load.weight, load);
}
return slice;
}
/*
* We calculate the vruntime slice of a to be inserted task
*
* vs = s/w
*/
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
schedstat_set(curr->statistics.exec_max,
max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
unsigned long delta_exec;
if (unlikely(!curr))
return;
/*
* Get the amount of time the current task was running
* since the last time we changed load (this cannot
* overflow on 32 bits):
*/
delta_exec = (unsigned long)(now - curr->exec_start);
if (!delta_exec)
return;
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
/*
* Task is being enqueued - update stats:
*/
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
}
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
}
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* We are starting a new run period:
*/
se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
* Scheduling class queueing methods:
*/
#ifdef CONFIG_NUMA_BALANCING
/*
* numa task sample period in ms
*/
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
static void task_numa_placement(struct task_struct *p)
{
int seq;
if (!p->mm) /* for example, ksmd faulting in a user's mm */
return;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
/* FIXME: Scheduling placement policy hints go here */
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
void task_numa_fault(int node, int pages, bool migrated)
{
struct task_struct *p = current;
if (!sched_feat_numa(NUMA))
return;
/* FIXME: Allocate task-specific structure for placement policy here */
/*
* If pages are properly placed (did not migrate) then scan slower.
* This is reset periodically in case of phase changes
*/
if (!migrated)
p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
p->numa_scan_period + jiffies_to_msecs(10));
task_numa_placement(p);
}
static void reset_ptenuma_scan(struct task_struct *p)
{
ACCESS_ONCE(p->mm->numa_scan_seq)++;
p->mm->numa_scan_offset = 0;
}
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
struct vm_area_struct *vma;
unsigned long start, end;
long pages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
* Who cares about NUMA placement when they're dying.
*
* NOTE: make sure not to dereference p->mm before this check,
* exit_task_work() happens _after_ exit_mm() so we could be called
* without p->mm even though we still had it when we enqueued this
* work.
*/
if (p->flags & PF_EXITING)
return;
/*
* We do not care about task placement until a task runs on a node
* other than the first one used by the address space. This is
* largely because migrations are driven by what CPU the task
* is running on. If it's never scheduled on another node, it'll
* not migrate so why bother trapping the fault.
*/
if (mm->first_nid == NUMA_PTE_SCAN_INIT)
mm->first_nid = numa_node_id();
if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
/* Are we running on a new node yet? */
if (numa_node_id() == mm->first_nid &&
!sched_feat_numa(NUMA_FORCE))
return;
mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
}
/*
* Reset the scan period if enough time has gone by. Objective is that
* scanning will be reduced if pages are properly placed. As tasks
* can enter different phases this needs to be re-examined. Lacking
* proper tracking of reference behaviour, this blunt hammer is used.
*/
migrate = mm->numa_next_reset;
if (time_after(now, migrate)) {
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
xchg(&mm->numa_next_reset, next_scan);
}
/*
* Enforce maximal scan/migration frequency..
*/
migrate = mm->numa_next_scan;
if (time_before(now, migrate))
return;
if (p->numa_scan_period == 0)
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
/*
* Do not set pte_numa if the current running node is rate-limited.
* This loses statistics on the fault but if we are unwilling to
* migrate to this node, it is less likely we can do useful work
*/
if (migrate_ratelimited(numa_node_id()))
return;
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
if (!pages)
return;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
if (!vma_migratable(vma))
continue;
/* Skip small VMAs. They are not likely to be of relevance */
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
continue;
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
pages -= change_prot_numa(vma, start, end);
start = end;
if (pages <= 0)
goto out;
} while (end != vma->vm_end);
}
out:
/*
* It is possible to reach the end of the VMA list but the last few VMAs are
* not guaranteed to the vma_migratable. If they are not, we would find the
* !migratable VMA on the next scan but not reset the scanner to the start
* so check it now.
*/
if (vma)
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
}
/*
* Drive the periodic memory faults..
*/
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
/*
* We don't care about NUMA placement if we don't have memory.
*/
if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
return;
/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
* task needs to have done some actual work before we bother with
* NUMA placement.
*/
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
if (now - curr->node_stamp > period) {
if (!curr->node_stamp)
curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
curr->node_stamp = now;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
task_work_add(curr, work, true);
}
}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se))
list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se))
list_del_init(&se->group_node);
cfs_rq->nr_running--;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
/*
* Use this CPU's actual weight instead of the last load_contribution
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
tg_weight = atomic64_read(&tg->load_avg);
tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
}
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight;
shares = (tg->shares * load);
if (tg_weight)
shares /= tg_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
shares = tg->shares;
return shares;
}
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
}
update_load_set(&se->load, weight);
if (se->on_rq)
account_entity_enqueue(cfs_rq, se);
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
struct sched_entity *se;
long shares;
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
if (!se || throttled_hierarchy(cfs_rq))
return;
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
#endif
shares = calc_cfs_shares(cfs_rq, tg);
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/*
* We choose a half-life close to 1 scheduling period.
* Note: The tables below are dependent on this value.
*/
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
0x85aac367, 0x82cd8698,
};
/*
* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
* over-estimates when re-combining.
*/
static const u32 runnable_avg_yN_sum[] = {
0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
static __always_inline u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
if (!n)
return val;
else if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
local_n = n;
/*
* As y^PERIOD = 1/2, we can combine
* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
* With a look-up table which covers k^n (n<PERIOD)
*
* To achieve constant time decay_load.
*/
if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
val >>= local_n / LOAD_AVG_PERIOD;
local_n %= LOAD_AVG_PERIOD;
}
val *= runnable_avg_yN_inv[local_n];
/* We don't use SRR here since we always want to round down. */
return val >> 32;
}
/*
* For updates fully spanning n periods, the contribution to runnable
* average will be: \Sum 1024*y^n
*
* We can compute this reasonably efficiently by combining:
* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
*/
static u32 __compute_runnable_contrib(u64 n)
{
u32 contrib = 0;
if (likely(n <= LOAD_AVG_PERIOD))
return runnable_avg_yN_sum[n];
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
do {
contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
n -= LOAD_AVG_PERIOD;
} while (n > LOAD_AVG_PERIOD);
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
}
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
*
* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
* p0 p1 p2
* (now) (~1ms ago) (~2ms ago)
*
* Let u_i denote the fraction of p_i that the entity was runnable.
*
* We then designate the fractions u_i as our co-efficients, yielding the
* following representation of historical load:
* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
*
* We choose y based on the with of a reasonably scheduling period, fixing:
* y^32 = 0.5
*
* This means that the contribution to load ~32ms ago (u_32) will be weighted
* approximately half as much as the contribution to load within the last ms
* (u_0).
*
* When a period "rolls over" and we have new u_0`, multiplying the previous
* sum again by y is sufficient to update:
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int __update_entity_runnable_avg(u64 now,
struct sched_avg *sa,
int runnable)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
delta = now - sa->last_runnable_update;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
sa->last_runnable_update = now;
return 0;
}
/*
* Use 1024ns as the unit of measurement since it's a reasonable
* approximation of 1us and fast to compute.
*/
delta >>= 10;
if (!delta)
return 0;
sa->last_runnable_update = now;
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->runnable_avg_period % 1024;
if (delta + delta_w >= 1024) {
/* period roll-over */
decayed = 1;
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
if (runnable)
sa->runnable_avg_sum += delta_w;
sa->runnable_avg_period += delta_w;
delta -= delta_w;
/* Figure out how many additional periods this update spans */
periods = delta / 1024;
delta %= 1024;
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
sa->runnable_avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
if (runnable)
sa->runnable_avg_sum += delta;
sa->runnable_avg_period += delta;
return decayed;
}
/* Synchronize an entity's decay with its parenting cfs_rq.*/
static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
if (!decays)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
se->avg.decay_count = 0;
return decays;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update)
{
struct task_group *tg = cfs_rq->tg;
s64 tg_contrib;
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
atomic64_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
}
}
/*
* Aggregate cfs_rq runnable averages into an equivalent task_group
* representation for computing load contributions.
*/
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
long contrib;
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
atomic_add(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
}
}
static inline void __update_group_entity_contrib(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg = cfs_rq->tg;
int runnable_avg;
u64 contrib;
contrib = cfs_rq->tg_load_contrib * tg->shares;
se->avg.load_avg_contrib = div64_u64(contrib,
atomic64_read(&tg->load_avg) + 1);
/*
* For group entities we need to compute a correction term in the case
* that they are consuming <1 cpu so that we would contribute the same
* load as a task of equal weight.
*
* Explicitly co-ordinating this measurement would be expensive, but
* fortunately the sum of each cpus contribution forms a usable
* lower-bound on the true value.
*
* Consider the aggregate of 2 contributions. Either they are disjoint
* (and the sum represents true value) or they are disjoint and we are
* understating by the aggregate of their overlap.
*
* Extending this to N cpus, for a given overlap, the maximum amount we
* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
* cpus that overlap for this interval and w_i is the interval width.
*
* On a small machine; the first term is well-bounded which bounds the
* total error since w_i is a subset of the period. Whereas on a
* larger machine, while this first term can be larger, if w_i is the
* of consequential size guaranteed to see n_i*w_i quickly converge to
* our upper bound of 1-cpu.
*/
runnable_avg = atomic_read(&tg->runnable_avg);
if (runnable_avg < NICE_0_LOAD) {
se->avg.load_avg_contrib *= runnable_avg;
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
}
}
#else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
u32 contrib;
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
}
/* Compute the current contribution to load_avg by se, return any delta */
static long __update_entity_load_avg_contrib(struct sched_entity *se)
{
long old_contrib = se->avg.load_avg_contrib;
if (entity_is_task(se)) {
__update_task_entity_contrib(se);
} else {
__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
__update_group_entity_contrib(se);
}
return se->avg.load_avg_contrib - old_contrib;
}
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
long load_contrib)
{
if (likely(load_contrib < cfs_rq->blocked_load_avg))
cfs_rq->blocked_load_avg -= load_contrib;
else
cfs_rq->blocked_load_avg = 0;
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/* Update a sched_entity's runnable average */
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
long contrib_delta;
u64 now;
/*
* For a group entity we need to use their owned cfs_rq_clock_task() in
* case they are the parent of a throttled hierarchy.
*/
if (entity_is_task(se))
now = cfs_rq_clock_task(cfs_rq);
else
now = cfs_rq_clock_task(group_cfs_rq(se));
if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
if (!update_cfs_rq)
return;
if (se->on_rq)
cfs_rq->runnable_load_avg += contrib_delta;
else
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
}
/*
* Decay the load contributed by all blocked children and account this so that
* their contribution may appropriately discounted when they wake up.
*/
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
u64 decays;
decays = now - cfs_rq->last_decay;
if (!decays && !force_update)
return;
if (atomic64_read(&cfs_rq->removed_load)) {
u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
subtract_blocked_load_contrib(cfs_rq, removed_load);
}
if (decays) {
cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
decays);
atomic64_add(decays, &cfs_rq->decay_counter);
cfs_rq->last_decay = now;
}
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int wakeup)
{
/*
* We track migrations using entity decay_count <= 0, on a wake-up
* migration we use a negative decay count to track the remote decays
* accumulated while sleeping.
*/
if (unlikely(se->avg.decay_count <= 0)) {
se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
if (se->avg.decay_count) {
/*
* In a wake-up migration we have to approximate the
* time sleeping. This is because we can't synchronize
* clock_task between the two cpus, and it is not
* guaranteed to be read-safe. Instead, we can
* approximate this using our carried decays, which are
* explicitly atomically readable.
*/
se->avg.last_runnable_update -= (-se->avg.decay_count)
<< 20;
update_entity_load_avg(se, 0);
/* Indicate that we're now synchronized and on-rq */
se->avg.decay_count = 0;
}
wakeup = 0;
} else {
__synchronize_entity_decay(se);
}
/* migrated tasks did not contribute to our blocked load */
if (wakeup) {
subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
update_entity_load_avg(se, 0);
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
/*
* Remove se's load from this cfs_rq child load-average, if the entity is
* transitioning to a blocked state we track its projected decay using
* blocked_load_avg.
*/
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int sleep)
{
update_entity_load_avg(se, 1);
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update) {}
#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
struct task_struct *tsk = NULL;
if (entity_is_task(se))
tsk = task_of(se);
if (se->statistics.sleep_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.sleep_max))
se->statistics.sleep_max = delta;
se->statistics.sleep_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (se->statistics.block_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.block_max))
se->statistics.block_max = delta;
se->statistics.block_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
se->statistics.iowait_sum += delta;
se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
trace_sched_stat_blocked(tsk, delta);
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
#endif
}
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
s64 d = se->vruntime - cfs_rq->min_vruntime;
if (d < 0)
d = -d;
if (d > 3*sysctl_sched_latency)
schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
* however the extra weight of the new task will slow them down a
* little, place the new task so that it fits in the slot that
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS))
thresh >>= 1;
vruntime -= thresh;
}
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
se->vruntime = vruntime;
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
list_add_leaf_cfs_rq(cfs_rq);
check_enqueue_throttle(cfs_rq);
}
}
static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->last == se)
cfs_rq->last = NULL;
else
break;
}
}
static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->next == se)
cfs_rq->next = NULL;
else
break;
}
}
static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->skip == se)
cfs_rq->skip = NULL;
else
break;
}
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
__clear_buddies_last(se);
if (cfs_rq->next == se)
__clear_buddies_next(se);
if (cfs_rq->skip == se)
__clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
/*
* Normalize the entity after updating the min_vruntime because the
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity)
return;
se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
* 2) pick the "next" process, since someone really wants that to run
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
struct sched_entity *second = __pick_next_entity(se);
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;
clear_buddies(cfs_rq, se);
return se;
}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr);
}
/**************************************************
* CFS bandwidth control machinery
*/
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef HAVE_JUMP_LABEL
static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
{
return static_key_false(&__cfs_bandwidth_used);
}
void account_cfs_bandwidth_used(int enabled, int was_enabled)
{
/* only need to count groups transitioning between enabled/!enabled */
if (enabled && !was_enabled)
static_key_slow_inc(&__cfs_bandwidth_used);
else if (!enabled && was_enabled)
static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
{
return true;
}
void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
#endif /* HAVE_JUMP_LABEL */
/*
* default period for cfs group bandwidth.
* default: 0.1s, units: nanoseconds
*/
static inline u64 default_cfs_period(void)
{
return 100000000ULL;
}
static inline u64 sched_cfs_bandwidth_slice(void)
{
return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
/*
* Replenish runtime according to assigned quota and update expiration time.
* We use sched_clock_cpu directly instead of rq->clock to avoid adding
* additional synchronization around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
u64 now;
if (cfs_b->quota == RUNTIME_INF)
return;
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
return &tg->cfs_bandwidth;
}
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
return cfs_rq->throttled_clock_task;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
/*
* If the bandwidth pool has become inactive, then at least one
* period must have elapsed since the last consumption.
* Refresh the global state and ensure bandwidth timer becomes
* active.
*/
if (!cfs_b->timer_active) {
__refill_cfs_bandwidth_runtime(cfs_b);
__start_cfs_bandwidth(cfs_b);
}
if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
cfs_b->runtime -= amount;
cfs_b->idle = 0;
}
}
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
/*
* we may have advanced our local expiration to account for allowed
* spread between our sched_clock and the one on which runtime was
* issued.
*/
if ((s64)(expires - cfs_rq->runtime_expires) > 0)
cfs_rq->runtime_expires = expires;
return cfs_rq->runtime_remaining > 0;
}
/*
* Note: This depends on the synchronization provided by sched_clock and the
* fact that rq->clock snapshots this value.
*/
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct rq *rq = rq_of(cfs_rq);
/* if the deadline is ahead of our clock, nothing to do */
if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
return;
if (cfs_rq->runtime_remaining < 0)
return;
/*
* If the local deadline has passed we have to consider the
* possibility that our sched_clock is 'fast' and the global deadline
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
* whether the global deadline has advanced.
*/
if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
/* global deadline is ahead, expiration has passed */
cfs_rq->runtime_remaining = 0;
}
}
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
resched_task(rq_of(cfs_rq)->curr);
}
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttled;
}
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/*
* Ensure that neither of the group entities corresponding to src_cpu or
* dest_cpu are members of a throttled hierarchy when performing group
* load-balance operations.
*/
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
{
struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
src_cfs_rq = tg->cfs_rq[src_cpu];
dest_cfs_rq = tg->cfs_rq[dest_cpu];
return throttled_hierarchy(src_cfs_rq) ||
throttled_hierarchy(dest_cfs_rq);
}
/* updated child weight may affect parent so we have to do this bottom up */
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
#endif
return 0;
}
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
cfs_rq->throttled_clock_task = rq_clock_task(rq);
cfs_rq->throttle_count++;
return 0;
}
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
}
if (!se)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
long task_delta;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
cfs_rq->throttled = 0;
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
if (!cfs_rq->load.weight)
return;
task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
enqueue = 0;
cfs_rq = cfs_rq_of(se);
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
if (cfs_rq_throttled(cfs_rq))
break;
}
if (!se)
rq->nr_running += task_delta;
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_task(rq->curr);
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
u64 runtime = remaining;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock(&rq->lock);
if (!cfs_rq_throttled(cfs_rq))
goto next;
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
unthrottle_cfs_rq(cfs_rq);
next:
raw_spin_unlock(&rq->lock);
if (!remaining)
break;
}
rcu_read_unlock();
return remaining;
}
/*
* Responsible for refilling a task_group's bandwidth and unthrottling its
* cfs_rqs as appropriate. If there has been no activity within the last
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
int idle = 1, throttled;
raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
goto out_unlock;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
/* idle depends on !throttled (for the case of a large deficit) */
idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
/* if we're going inactive then everything else can be deferred */
if (idle)
goto out_unlock;
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
goto out_unlock;
}
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
/*
* There are throttled entities so we must first use the new bandwidth
* to unthrottle them before making it generally available. This
* ensures that all existing debts will be paid before a new cfs_rq is
* allowed to run.
*/
runtime = cfs_b->runtime;
runtime_expires = cfs_b->runtime_expires;
cfs_b->runtime = 0;
/*
* This check is repeated as we are holding onto the new bandwidth
* while we unthrottle. This can potentially race with an unthrottled
* group trying to acquire new bandwidth from the global pool.
*/
while (throttled && runtime > 0) {
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/* return (any) remaining runtime */
cfs_b->runtime = runtime;
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
* insufficient to cover the existing bandwidth deficit. (Forcing the
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
out_unlock:
if (idle)
cfs_b->timer_active = 0;
raw_spin_unlock(&cfs_b->lock);
return idle;
}
/* a cfs_rq won't donate quota below this amount */
static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
/* are we near the end of the current quota period? */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
u64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
return 1;
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
if (remaining < min_expire)
return 1;
return 0;
}
static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
{
u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
/* if there's a quota refresh soon don't bother with slack */
if (runtime_refresh_within(cfs_b, min_left))
return;
start_bandwidth_timer(&cfs_b->slack_timer,
ns_to_ktime(cfs_bandwidth_slack_period));
}
/* we know any runtime found here is valid as update_curr() precedes return */
static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
if (slack_runtime <= 0)
return;
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF &&
cfs_rq->runtime_expires == cfs_b->runtime_expires) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
!list_empty(&cfs_b->throttled_cfs_rq))
start_cfs_slack_bandwidth(cfs_b);
}
raw_spin_unlock(&cfs_b->lock);
/* even if it's not valid for return we don't want to try again */
cfs_rq->runtime_remaining -= slack_runtime;
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
return;
if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
}
/*
* This is done with a timer (instead of inline with bandwidth return) since
* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
*/
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
u64 expires;
/* confirm we're still not at a refresh boundary */
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
return;
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
}
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime = runtime;
raw_spin_unlock(&cfs_b->lock);
}
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
* runtime as update_curr() throttling can not not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
return;
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
/* ensure the group is not already throttled */
if (cfs_rq_throttled(cfs_rq))
return;
/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
if (cfs_rq->runtime_remaining <= 0)
throttle_cfs_rq(cfs_rq);
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
return;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
return;
throttle_cfs_rq(cfs_rq);
}
static inline u64 default_cfs_period(void);
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
do_sched_cfs_slack_timer(cfs_b);
return HRTIMER_NORESTART;
}
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
ktime_t now;
int overrun;
int idle = 0;
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
if (!overrun)
break;
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
/* requires cfs_b->lock, may release to reprogram timer */
void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
/*
* The timer may be active because we're trying to set a new bandwidth
* period or because we're racing with the tear-down path
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
raw_spin_unlock(&cfs_b->lock);
/* ensure cfs_b->lock is available while we wait */
hrtimer_cancel(&cfs_b->period_timer);
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
if (cfs_b->timer_active)
return;
}
cfs_b->timer_active = 1;
start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
static void unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
if (!cfs_rq->runtime_enabled)
continue;
/*
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = cfs_b->quota;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
}
#else /* CONFIG_CFS_BANDWIDTH */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
{
return 0;
}
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
/**************************************************
* CFS operations on tasks:
*/
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
WARN_ON(task_rq(p) != rq);
if (cfs_rq->nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
if (delta < 0) {
if (rq->curr == p)
resched_task(p);
return;
}
/*
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense. Rely on vruntime for fairness.
*/
if (rq->curr != p)
delta = max_t(s64, 10000LL, delta);
hrtick_start(rq, delta);
}
}
/*
* called from enqueue/dequeue and updates the hrtick when the
* current task is from our class and nr_running is low enough
* to matter.
*/
static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
static inline void hrtick_update(struct rq *rq)
{
}
#endif
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
*/
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(cfs_rq);
update_entity_load_avg(se, 1);
}
if (!se) {
update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
}
hrtick_update(rq);
}
static void set_next_buddy(struct sched_entity *se);
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running decrement below.
*/
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
if (task_sleep && parent_entity(se))
set_next_buddy(parent_entity(se));
/* avoid re-evaluating load for this entity */
se = parent_entity(se);
break;
}
flags |= DEQUEUE_SLEEP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(cfs_rq);
update_entity_load_avg(se, 1);
}
if (!se) {
dec_nr_running(rq);
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
}
#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->load.weight;
}
static inline int sched_lb_bias(void)
{
#ifndef CONFIG_NO_HZ_FULL
return sched_feat(LB_BIAS);
#else
return 0;
#endif
}
/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
if (type == 0 || !sched_lb_bias())
return total;