blob: 00ebd7686676bd87a6e5b3be513d27028e518796 [file] [log] [blame]
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Interactivity improvements by Mike Galbraith
* (C) 2007 Mike Galbraith <efault@gmx.de>
*
* Various enhancements by Dmitry Adamushko.
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation, 2007
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
*
* Scaled math optimizations by Thomas Gleixner
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*/
#include <linux/latencytop.h>
#include <linux/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*
* Options are:
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
*/
enum sched_tunable_scaling sysctl_sched_tunable_scaling
= SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
/*
* After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first.
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* sys_sched_yield() compat mode
*
* This option switches the agressive yield implementation of the
* old scheduler back on.
*/
unsigned int __read_mostly sysctl_sched_compat_yield;
/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
static const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return cfs_rq->rq;
}
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!entity_is_task(se));
#endif
return container_of(se, struct task_struct, se);
}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
return p->se.cfs_rq;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
return se->cfs_rq;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return grp->my_q;
}
/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
* another cpu ('this_cpu')
*/
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
return cfs_rq->tg->cfs_rq[this_cpu];
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
return 1;
return 0;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
int depth = 0;
for_each_sched_entity(se)
depth++;
return depth;
}
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
int se_depth, pse_depth;
/*
* preemption test can be made between sibling entities who are in the
* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
* both tasks until we find their ancestors who are siblings of common
* parent.
*/
/* First walk up until both entities are at same depth */
se_depth = depth_se(*se);
pse_depth = depth_se(*pse);
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
while (!is_same_group(*se, *pse)) {
*se = parent_entity(*se);
*pse = parent_entity(*pse);
}
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
{
return container_of(se, struct task_struct, se);
}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return container_of(cfs_rq, struct rq, cfs);
}
#define entity_is_task(se) 1
#define for_each_sched_entity(se) \
for (; se; se = NULL)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
return &task_rq(p)->cfs;
}
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
return &cpu_rq(this_cpu)->cfs;
}
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
return 1;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
}
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta > 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline int entity_before(struct sched_entity *a,
struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return se->vruntime - cfs_rq->min_vruntime;
}
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
if (cfs_rq->curr)
vruntime = cfs_rq->curr->vruntime;
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
if (!cfs_rq->curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
s64 key = entity_key(cfs_rq, se);
int leftmost = 1;
/*
* Find the right place in the rbtree:
*/
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (key < entity_key(cfs_rq, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
/*
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
}
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
if (!left)
return NULL;
return rb_entry(left, struct sched_entity, run_node);
}
static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
if (!last)
return NULL;
return rb_entry(last, struct sched_entity, run_node);
}
/**************************************************************
* Scheduling class statistics methods:
*/
#ifdef CONFIG_SCHED_DEBUG
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL
return 0;
}
#endif
/*
* delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
*/
static u64 __sched_period(unsigned long nr_running)
{
u64 period = sysctl_sched_latency;
unsigned long nr_latency = sched_nr_latency;
if (unlikely(nr_running > nr_latency)) {
period = sysctl_sched_min_granularity;
period *= nr_running;
}
return period;
}
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
struct load_weight lw;
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
}
slice = calc_delta_mine(slice, se->load.weight, load);
}
return slice;
}
/*
* We calculate the vruntime slice of a to be inserted task
*
* vs = s/w
*/
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
schedstat_set(curr->statistics.exec_max,
max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
return;
/*
* Get the amount of time the current task was running
* since the last time we changed load (this cannot
* overflow on 32 bits):
*/
delta_exec = (unsigned long)(now - curr->exec_start);
if (!delta_exec)
return;
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
* Task is being enqueued - update stats:
*/
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
}
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
rq_of(cfs_rq)->clock - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
}
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* We are starting a new run period:
*/
se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
* Scheduling class queueing methods:
*/
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
cfs_rq->task_weight += weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
#endif
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, se->load.weight);
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
se->on_rq = 1;
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
se->on_rq = 0;
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
struct task_struct *tsk = NULL;
if (entity_is_task(se))
tsk = task_of(se);
if (se->statistics.sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.sleep_max))
se->statistics.sleep_max = delta;
se->statistics.sleep_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (se->statistics.block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.block_max))
se->statistics.block_max = delta;
se->statistics.block_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
se->statistics.iowait_sum += delta;
se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
#endif
}
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
s64 d = se->vruntime - cfs_rq->min_vruntime;
if (d < 0)
d = -d;
if (d > 3*sysctl_sched_latency)
schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
* however the extra weight of the new task will slow them down a
* little, place the new task so that it fits in the slot that
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS))
thresh >>= 1;
vruntime -= thresh;
}
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
se->vruntime = vruntime;
}
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
}
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!se || cfs_rq->last == se)
cfs_rq->last = NULL;
if (!se || cfs_rq->next == se)
cfs_rq->next = NULL;
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
for_each_sched_entity(se)
__clear_buddies(cfs_rq_of(se), se);
}
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (!sched_feat(WAKEUP_PREEMPT))
return;
if (delta_exec < sysctl_sched_min_granularity)
return;
if (cfs_rq->nr_running > 1) {
struct sched_entity *se = __pick_next_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_next_entity(cfs_rq);
struct sched_entity *left = se;
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
clear_buddies(cfs_rq, se);
return se;
}
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
}
cfs_rq->curr = NULL;
}
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
/**************************************************
* CFS operations on tasks:
*/
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
WARN_ON(task_rq(p) != rq);
if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
if (delta < 0) {
if (rq->curr == p)
resched_task(p);
return;
}
/*
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense. Rely on vruntime for fairness.
*/
if (rq->curr != p)
delta = max_t(s64, 10000LL, delta);
hrtick_start(rq, delta);
}
}
/*
* called from enqueue/dequeue and updates the hrtick when the
* current task is from our class and nr_running is low enough
* to matter.
*/
static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
if (curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
static inline void hrtick_update(struct rq *rq)
{
}
#endif
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
flags = ENQUEUE_WAKEUP;
}
hrtick_update(rq);
}
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}
hrtick_update(rq);
}
/*
* sched_yield() support is very simple - we dequeue and enqueue.
*
* If compat_yield is turned on then we requeue to the end of the tree.
*/
static void yield_task_fair(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *rightmost, *se = &curr->se;
/*
* Are we the only task in the tree?
*/
if (unlikely(cfs_rq->nr_running == 1))
return;
clear_buddies(cfs_rq, se);
if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
return;
}
/*
* Find the rightmost entry in the rbtree:
*/
rightmost = __pick_last_entity(cfs_rq);
/*
* Already in the rightmost position?
*/
if (unlikely(!rightmost || entity_before(rightmost, se)))
return;
/*
* Minimally necessary key value to be last in the tree:
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
*/
se->vruntime = rightmost->vruntime + 1;
}
#ifdef CONFIG_SMP
static void task_waking_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
se->vruntime -= cfs_rq->min_vruntime;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
*
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
*
* The problem is that perfectly aligning the shares is rather expensive, hence
* we try to avoid doing that too often - see update_shares(), which ratelimits
* this change.
*
* We compensate this by not only taking the current delta into account, but
* also considering the delta between when the shares were last adjusted and
* now.
*
* We still saw a performance dip, some tracing learned us that between
* cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
* significantly. Therefore try to bias the error in direction of failing
* the affine wakeup.
*
*/
static long effective_load(struct task_group *tg, int cpu,
long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent)
return wl;
/*
* By not taking the decrease of shares on the other cpu into
* account our error leans towards reducing the affine wakeups.
*/
if (!wl && sched_feat(ASYM_EFF_LOAD))
return wl;
for_each_sched_entity(se) {
long S, rw, s, a, b;
long more_w;
/*
* Instead of using this increment, also add the difference
* between when the shares were last updated and now.
*/
more_w = se->my_q->load.weight - se->my_q->rq_weight;
wl += more_w;
wg += more_w;
S = se->my_q->tg->shares;
s = se->my_q->shares;
rw = se->my_q->rq_weight;
a = S*(rw + wl);
b = S*rw + s*wg;
wl = s*(a-b);
if (likely(b))
wl /= b;
/*
* Assume the group is already running and will
* thus already be accounted for in the weight.
*
* That is, moving shares between CPUs, does not
* alter the group weight.
*/
wg = 0;
}
return wl;
}
#else
static inline unsigned long effective_load(struct task_group *tg, int cpu,
unsigned long wl, unsigned long wg)
{
return wl;
}
#endif
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
rcu_read_lock();
if (sync) {
tg = task_group(current);
weight = current->se.load.weight;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
weight = p->se.load.weight;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
* always have an imbalance, but there's really nothing you can do
* about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
if (this_load) {
unsigned long this_eff_load, prev_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= power_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
} else
balanced = true;
rcu_read_unlock();
/*
* If the currently running task will sleep within
* a reasonable amount of time then attract this newly
* woken task:
*/
if (sync && balanced)
return 1;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced ||
(this_load <= load &&
this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
schedstat_inc(sd, ttwu_move_affine);
schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
return 0;
}
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int load_idx)
{
struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
do {
unsigned long load, avg_load;
int local_group;
int i;
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
avg_load = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
avg_load += load;
}
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
} while (group = group->next, group != sd->groups);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
/*
* find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
idlest = i;
}
}
return idlest;
}
/*
* Try and locate an idle CPU in the sched_domain.
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
int i;
/*
* If the task is going to be woken-up on this cpu and if it is
* already idle, then it is the right target.
*/
if (target == cpu && idle_cpu(cpu))
return cpu;
/*
* If the task is going to be woken-up on the cpu where it previously
* ran and if it is currently idle, then it the right target.
*/
if (target == prev_cpu && idle_cpu(prev_cpu))
return prev_cpu;
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
for_each_domain(target, sd) {
if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
if (idle_cpu(i)) {
target = i;
break;
}
}
/*
* Lets stop looking for an idle sibling when we reached
* the domain that spans the current cpu and prev_cpu.
*/
if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
break;
}
return target;
}
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
* SD_BALANCE_EXEC.
*
* Balance, ie. select the least loaded group.
*
* Returns the target CPU number, or the same CPU if no balancing is needed.
*
* preempt must be disabled.
*/
static int
select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
/*
* If power savings logic is enabled for a domain, see if we
* are not overloaded, if so, don't balance wider.
*/
if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
unsigned long power = 0;
unsigned long nr_running = 0;
unsigned long capacity;
int i;
for_each_cpu(i, sched_domain_span(tmp)) {
power += power_of(i);
nr_running += cpu_rq(i)->cfs.nr_running;
}
capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
nr_running /= 2;
if (nr_running < capacity)
want_sd = 0;
}
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
want_affine = 0;
}
if (!want_sd && !want_affine)
break;
if (!(tmp->flags & sd_flag))
continue;
if (want_sd)
sd = tmp;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
*/
tmp = sd;
if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
tmp = affine_sd;
if (tmp) {
raw_spin_unlock(&rq->lock);
update_shares(tmp);
raw_spin_lock(&rq->lock);
}
}
#endif
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
else
return select_idle_sibling(p, prev_cpu);
}
while (sd) {
int load_idx = sd->forkexec_idx;
struct sched_group *group;
int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
group = find_idlest_group(sd, p, cpu, load_idx);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_cpu(group, p, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
return new_cpu;
}
#endif /* CONFIG_SMP */
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
*
* By using 'se' instead of 'curr' we penalize light tasks, so
* they get preempted easier. That is, if 'se' < 'curr' then
* the resulting gran will be larger, therefore penalizing the
* lighter, if otoh 'se' > 'curr' then the resulting gran will
* be smaller, again penalizing the lighter task.
*
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
if (unlikely(se->load.weight != NICE_0_LOAD))
gran = calc_delta_fair(gran, se);
return gran;
}
/*
* Should 'se' preempt 'curr'.
*
* |s1
* |s2
* |s3
* g
* |<--->|c
*
* w(c, s1) = -1
* w(c, s2) = 0
* w(c, s3) = 1
*
*/
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
if (vdiff <= 0)
return -1;
gran = wakeup_gran(curr, se);
if (vdiff > gran)
return 1;
return 0;
}
static void set_last_buddy(struct sched_entity *se)
{
if (likely(task_of(se)->policy != SCHED_IDLE)) {
for_each_sched_entity(se)
cfs_rq_of(se)->last = se;
}
}
static void set_next_buddy(struct sched_entity *se)
{
if (likely(task_of(se)->policy != SCHED_IDLE)) {
for_each_sched_entity(se)
cfs_rq_of(se)->next = se;
}
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
if (unlikely(se == pse))
return;
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
*/
if (test_tsk_need_resched(curr))
return;
/*
* Batch and idle tasks do not preempt (their preemption is driven by
* the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL))
return;
/* Idle tasks are by definition preempted by everybody. */
if (unlikely(curr->policy == SCHED_IDLE))
goto preempt;
if (!sched_feat(WAKEUP_PREEMPT))
return;
update_curr(cfs_rq);
find_matching_se(&se, &pse);
BUG_ON(!pse);
if (wakeup_preempt_entity(se, pse) == 1)
goto preempt;
return;
preempt:
resched_task(curr);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
* with schedule on the ->pre_schedule() or idle_balance()
* point, either of which can * drop the rq lock.
*
* Also, during early boot the idle thread is in the fair class,
* for obvious reasons its a bad idea to schedule back to it.
*/
if (unlikely(!se->on_rq || curr == rq->idle))
return;
if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
set_last_buddy(se);
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
if (!cfs_rq->nr_running)
return NULL;
do {
se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
hrtick_start_fair(rq, p);
return p;
}
/*
* Account for a descheduled task:
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
}
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods:
*/
/*
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
static void pull_task(struct rq *src_rq, struct task_struct *p,
struct rq *this_rq, int this_cpu)
{
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
}
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
}
/*
* move_one_task tries to move exactly one task from busiest to this_rq, as
* part of active balancing operations within "domain".
* Returns 1 if successful and 0 otherwise.
*
* Called with both runqueues locked.
*/
static int
move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct sched_domain *sd, enum cpu_idle_type idle)
{
struct task_struct *p, *n;
struct cfs_rq *cfs_rq;
int pinned = 0;
for_each_leaf_cfs_rq(busiest, cfs_rq) {
list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
if (!can_migrate_task(p, busiest, this_cpu,
sd, idle, &pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
/*
* Right now, this is only the second place pull_task()
* is called, so we can safely collect pull_task()
* stats here rather than inside pull_task().
*/
schedstat_inc(sd, lb_gained[idle]);
return 1;
}
}
return 0;
}
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned,
int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
int loops = 0, pulled = 0, pinned = 0;
long rem_load_move = max_load_move;
struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
pinned = 1;
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
if (loops++ > sysctl_sched_nr_migrate)
break;
if ((p->se.load.weight >> 1) > rem_load_move ||
!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
pulled++;
rem_load_move -= p->se.load.weight;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
if (idle == CPU_NEWLY_IDLE)
break;
#endif
/*
* We only want to steal up to the prescribed amount of
* weighted load.
*/
if (rem_load_move <= 0)
break;
if (p->prio < *this_best_prio)
*this_best_prio = p->prio;
}
out:
/*
* Right now, this is one of only two places pull_task() is called,
* so we can safely collect pull_task() stats here rather than
* inside pull_task().
*/
schedstat_add(sd, lb_gained[idle], pulled);
if (all_pinned)
*all_pinned = pinned;
return max_load_move - rem_load_move;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
long rem_load_move = max_load_move;
int busiest_cpu = cpu_of(busiest);
struct task_group *tg;
rcu_read_lock();
update_h_load(busiest_cpu);
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
u64 rem_load, moved_load;
/*
* empty group
*/
if (!busiest_cfs_rq->task_weight)
continue;
rem_load = (u64)rem_load_move * busiest_weight;
rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = balance_tasks(this_rq, this_cpu, busiest,
rem_load, sd, idle, all_pinned, this_best_prio,
busiest_cfs_rq);
if (!moved_load)
continue;
moved_load *= busiest_h_load;
moved_load = div_u64(moved_load, busiest_weight + 1);
rem_load_move -= moved_load;
if (rem_load_move < 0)
break;
}
rcu_read_unlock();
return max_load_move - rem_load_move;
}
#else
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
return balance_tasks(this_rq, this_cpu, busiest,
max_load_move, sd, idle, all_pinned,
this_best_prio, &busiest->cfs);
}
#endif
/*
* move_tasks tries to move up to max_load_move weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
* Returns 1 if successful and 0 otherwise.
*
* Called with both runqueues locked.
*/
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
unsigned long total_load_moved = 0, load_moved;
int this_best_prio = this_rq->curr->prio;
do {
load_moved = load_balance_fair(this_rq, this_cpu, busiest,
max_load_move - total_load_moved,
sd, idle, all_pinned, &this_best_prio);
total_load_moved += load_moved;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
if (raw_spin_is_contended(&this_rq->lock) ||
raw_spin_is_contended(&busiest->lock))
break;
#endif
} while (load_moved && max_load_move > total_load_moved);
return total_load_moved > 0;
}
/********** Helpers for find_busiest_group ************************/
/*
* sd_lb_stats - Structure to store the statistics of a sched_domain
* during load balancing.
*/
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *this; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_pwr; /* Total power of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
/** Statistics of this group */
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
unsigned long this_has_capacity;
unsigned int this_idle_cpus;
/* Statistics of the busiest group */
unsigned int busiest_idle_cpus;
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
unsigned long busiest_has_capacity;
unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance; /* Is powersave balance needed for this sd */
struct sched_group *group_min; /* Least loaded group in sd */
struct sched_group *group_leader; /* Group which relieves group_min */
unsigned long min_load_per_task; /* load_per_task in group_min */
unsigned long leader_nr_running; /* Nr running of group_leader */
unsigned long min_nr_running; /* Nr running of group_min */
#endif
};
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
unsigned long idle_cpus;
unsigned long group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
return cpumask_first(sched_group_cpus(group));
}
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
{
int load_idx;
switch (idle) {
case CPU_NOT_IDLE:
load_idx = sd->busy_idx;
break;
case CPU_NEWLY_IDLE:
load_idx = sd->newidle_idx;
break;
default:
load_idx = sd->idle_idx;
break;
}
return load_idx;
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
* init_sd_power_savings_stats - Initialize power savings statistics for
* the given sched_domain, during load balancing.
*
* @sd: Sched domain whose power-savings statistics are to be initialized.
* @sds: Variable containing the statistics for sd.
* @idle: Idle status of the CPU at which we're performing load-balancing.
*/
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
/*
* Busy processors will not participate in power savings
* balance.
*/
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
sds->power_savings_balance = 0;
else {
sds->power_savings_balance = 1;
sds->min_nr_running = ULONG_MAX;
sds->leader_nr_running = 0;
}
}
/**
* update_sd_power_savings_stats - Update the power saving stats for a
* sched_domain while performing load balancing.
*
* @group: sched_group belonging to the sched_domain under consideration.
* @sds: Variable containing the statistics of the sched_domain
* @local_group: Does group contain the CPU for which we're performing
* load balancing ?
* @sgs: Variable containing the statistics of the group.
*/
static inline void update_sd_power_savings_stats(struct sched_group *group,
struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
if (!sds->power_savings_balance)
return;
/*
* If the local group is idle or completely loaded
* no need to do power savings balance at this domain
*/
if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
!sds->this_nr_running))
sds->power_savings_balance = 0;
/*
* If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
*/
if (!sds->power_savings_balance ||
sgs->sum_nr_running >= sgs->group_capacity ||
!sgs->sum_nr_running)
return;
/*
* Calculate the group which has the least non-idle load.
* This is the group from where we need to pick up the load
* for saving power
*/
if ((sgs->sum_nr_running < sds->min_nr_running) ||
(sgs->sum_nr_running == sds->min_nr_running &&
group_first_cpu(group) > group_first_cpu(sds->group_min))) {
sds->group_min = group;
sds->min_nr_running = sgs->sum_nr_running;
sds->min_load_per_task = sgs->sum_weighted_load /
sgs->sum_nr_running;
}
/*
* Calculate the group which is almost near its
* capacity but still has some space to pick up some load
* from other group and save more power
*/
if (sgs->sum_nr_running + 1 > sgs->group_capacity)
return;
if (sgs->sum_nr_running > sds->leader_nr_running ||
(sgs->sum_nr_running == sds->leader_nr_running &&
group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
sds->group_leader = group;
sds->leader_nr_running = sgs->sum_nr_running;
}
}
/**
* check_power_save_busiest_group - see if there is potential for some power-savings balance
* @sds: Variable containing the statistics of the sched_domain
* under consideration.
* @this_cpu: Cpu at which we're currently performing load-balancing.
* @imbalance: Variable to store the imbalance.
*
* Description:
* Check if we have potential to perform some power-savings balance.
* If yes, set the busiest group to be the least loaded group in the
* sched_domain, so that it's CPUs can be put to idle.
*
* Returns 1 if there is potential to perform power-savings balance.
* Else returns 0.
*/
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
if (!sds->power_savings_balance)
return 0;
if (sds->this != sds->group_leader ||
sds->group_leader == sds->group_min)
return 0;
*imbalance = sds->min_load_per_task;
sds->busiest = sds->group_min;
return 1;
}
#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
return;
}
static inline void update_sd_power_savings_stats(struct sched_group *group,
struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
return;
}
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
return 0;
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_LOAD_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
return default_scale_freq_power(sd, cpu);
}
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
smt_gain /= weight;
return smt_gain;
}
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
return default_scale_smt_power(sd, cpu);
}
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available;
total = sched_avg_period() + (rq->clock - rq->age_stamp);
if (unlikely(total < rq->rt_avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
available = total - rq->rt_avg;
}
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
total >>= SCHED_LOAD_SHIFT;
return div_u64(available, total);
}
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
else
power *= default_scale_smt_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
}
sdg->cpu_power_orig = power;
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
else
power *= default_scale_freq_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
power *= scale_rt_power(cpu);
power >>= SCHED_LOAD_SHIFT;
if (!power)
power = 1;
cpu_rq(cpu)->cpu_power = power;
sdg->cpu_power = power;
}
static void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long power;
if (!child) {
update_cpu_power(sd, cpu);
return;
}
power = 0;
group = child->groups;
do {
power += group->cpu_power;
group = group->next;
} while (group != child->groups);
sdg->cpu_power = power;
}
/*
* Try and fix up capacity for tiny siblings, this is needed when
* things like SD_ASYM_PACKING need f_b_g to select another sibling
* which on its own isn't powerful enough.
*
* See update_sd_pick_busiest() and check_asym_packing().
*/
static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
* Only siblings can have significantly less than SCHED_LOAD_SCALE
*/
if (sd->level != SD_LV_SIBLING)
return 0;
/*
* If ~90% of the cpu_power is still there, we're good.
*/
if (group->cpu_power * 32 > group->cpu_power_orig * 29)
return 1;
return 0;
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @sd: The sched_domain whose statistics are to be updated.
* @group: sched_group whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @sd_idle: Idle status of the sched_domain containing group.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
static inline void update_sg_lb_stats(struct sched_domain *sd,
struct sched_group *group, int this_cpu,
enum cpu_idle_type idle, int load_idx, int *sd_idle,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
if (local_group)
balance_cpu = group_first_cpu(group);
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
if (local_group) {
if (idle_cpu(i) && !first_idle_cpu) {
first_idle_cpu = 1;
balance_cpu = i;
}
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
if (load > max_cpu_load) {
max_cpu_load = load;
max_nr_running = rq->nr_running;
}
if (min_cpu_load > load)
min_cpu_load = load;
}
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
}
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
if (idle != CPU_NEWLY_IDLE && local_group) {
if (balance_cpu != this_cpu) {
*balance = 0;
return;
}
update_group_power(sd, this_cpu);
}
/* Adjust by relative CPU power of the group */
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
/*
* Consider the group unbalanced when the imbalance is larger
* than the average weight of two tasks.
*
* APZ: with cgroup the avg task weight can vary wildly and
* might not be a suitable number - should we keep a
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
sgs->group_weight = group->group_weight;
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
}
/**
* update_sd_pick_busiest - return 1 on busiest group
* @sd: sched_domain whose statistics are to be checked
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
* @this_cpu: the current cpu
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
*/
static bool update_sd_pick_busiest(struct sched_domain *sd,
struct sd_lb_stats *sds,
struct sched_group *sg,
struct sg_lb_stats *sgs,
int this_cpu)
{
if (sgs->avg_load <= sds->max_load)
return false;
if (sgs->sum_nr_running > sgs->group_capacity)
return true;
if (sgs->group_imb)
return true;
/*
* ASYM_PACKING needs to move all the work to the lowest
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
this_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
return true;
}
return false;
}
/**
* update_sd_lb_stats - Update sched_group's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @sd_idle: Idle status of the sched_domain containing sg.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
enum cpu_idle_type idle, int *sd_idle,
const struct cpumask *cpus, int *balance,
struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
struct sched_group *sg = sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
init_sd_power_savings_stats(sd, sds, idle);
load_idx = get_sd_load_idx(sd, idle);
do {
int local_group;
local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
sds->total_pwr += sg->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks, i.e. nr_running < group_capacity. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this_load = sgs.avg_load;
sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
sds->this_idle_cpus = sgs.idle_cpus;
} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_idle_cpus = sgs.idle_cpus;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->busiest_has_capacity = sgs.group_has_capacity;
sds->busiest_group_weight = sgs.group_weight;
sds->group_imb = sgs.group_imb;
}
update_sd_power_savings_stats(sg, sds, local_group, &sgs);
sg = sg->next;
} while (sg != sd->groups);
}
int __weak arch_sd_sibling_asym_packing(void)
{
return 0*SD_ASYM_PACKING;
}
/**
* check_asym_packing - Check to see if the group is packed into the
* sched doman.
*
* This is primarily intended to used at the sibling level. Some
* cores like POWER7 prefer to use lower numbered SMT threads. In the
* case of POWER7, it can move to lower SMT modes only when higher
* threads are idle. When in lower SMT modes, the threads will
* perform better since they share less core resources. Hence when we
* have idle threads, we want them to be the higher ones.
*
* This packing function is run on idle threads. It checks to see if
* the busiest CPU in this domain (core in the P7 case) has a higher
* CPU number than the packing function is being run on. Here we are
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
* Returns 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
* @sd: The sched_domain whose packing is to be checked.
* @sds: Statistics of the sched_domain which is to be packed
* @this_cpu: The cpu at whose sched_domain we're performing load-balance.
* @imbalance: returns amount of imbalanced due to packing.
*/
static int check_asym_packing(struct sched_domain *sd,
struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
int busiest_cpu;
if (!(sd->flags & SD_ASYM_PACKING))
return 0;
if (!sds->busiest)
return 0;
busiest_cpu = group_first_cpu(sds->busiest);
if (this_cpu > busiest_cpu)
return 0;
*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
SCHED_LOAD_SCALE);
return 1;
}
/**
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
* @this_cpu: The cpu at whose sched_domain we're performing load-balance.
* @imbalance: Variable to store the imbalance.
*/
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
int this_cpu, unsigned long *imbalance)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
if (sds->this_nr_running) {
sds->this_load_per_task /= sds->this_nr_running;
if (sds->busiest_load_per_task >
sds->this_load_per_task)
imbn = 1;
} else
sds->this_load_per_task =
cpu_avg_load_per_task(this_cpu);
scaled_busy_load_per_task = sds->busiest_load_per_task
* SCHED_LOAD_SCALE;
scaled_busy_load_per_task /= sds->busiest->cpu_power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
*imbalance = sds->busiest_load_per_task;
return;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU power used by
* moving them.
*/
pwr_now += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
sds->busiest->cpu_power;
if (sds->max_load > tmp)
pwr_move += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
if (sds->max_load * sds->busiest->cpu_power <
sds->busiest_load_per_task * SCHED_LOAD_SCALE)
tmp = (sds->max_load * sds->busiest->cpu_power) /
sds->this->cpu_power;
else
tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
sds->this->cpu_power;
pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
*imbalance = sds->busiest_load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
* @this_cpu: Cpu for which currently load balance is being performed.
* @imbalance: The variable to store the imbalance.
*/
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
unsigned long *imbalance)
{
unsigned long max_pull, load_above_capacity = ~0UL;
sds->busiest_load_per_task /= sds->busiest_nr_running;
if (sds->group_imb) {
sds->busiest_load_per_task =
min(sds->busiest_load_per_task, sds->avg_load);
}
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
if (sds->max_load < sds->avg_load) {
*imbalance = 0;
return fix_small_imbalance(sds, this_cpu, imbalance);
}
if (!sds->group_imb) {
/*
* Don't want to pull so many tasks that a group would go idle.
*/
load_above_capacity = (sds->busiest_nr_running -
sds->busiest_group_capacity);
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
load_above_capacity /= sds->busiest->cpu_power;
}
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load. At the same time,
* we also don't want to reduce the group load below the group capacity
* (so that we can implement power-savings policies etc). Thus we look
* for the minimum possible imbalance.
* Be careful of negative numbers as they'll appear as very large values
* with unsigned longs.
*/
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->cpu_power,
(sds->avg_load - sds->this_load) * sds->this->cpu_power)
/ SCHED_LOAD_SCALE;
/*
* if *imbalance is less than the average load per runnable task
* there is no gaurantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (*imbalance < sds->busiest_load_per_task)
return fix_small_imbalance(sds, this_cpu, imbalance);
}
/******* find_busiest_group() helpers end here *********************/
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance. If there isn't an imbalance, and
* the user has opted for power-savings, it returns a group whose
* CPUs can be put to idle by rebalancing those tasks elsewhere, if
* such a group exists.
*
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
* @sd: The sched_domain whose busiest group is to be returned.
* @this_cpu: The cpu for which load balancing is currently being performed.
* @imbalance: Variable which stores amount of weighted load which should
* be moved to restore balance/put a group to idle.
* @idle: The idle status of this_cpu.
* @sd_idle: The idleness of sd
* @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
*
* Returns: - the busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
memset(&sds, 0, sizeof(sds));
/*
* Compute the various statistics relavent for load balancing at
* this level.
*/
update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
balance, &sds);
/* Cases where imbalance does not exist from POV of this_cpu */
/* 1) this_cpu is not the appropriate cpu to perform load balancing
* at this level.
* 2) There is no busy sibling group to pull from.
* 3) This group is the busiest group.
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
*
* Note: when doing newidle balance, if the local group has excess
* capacity (i.e. nr_running < group_capacity) and the busiest group
* does not have any capacity, we force a load balance to pull tasks
* to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
check_asym_packing(sd, &sds, this_cpu, imbalance))
return sds.busiest;
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
if (sds.this_load >= sds.max_load)
goto out_balanced;
sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
if (sds.this_load >= sds.avg_load)
goto out_balanced;
/*
* In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
* And to check for busy balance use !idle_cpu instead of
* CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
* even when they are idle.
*/
if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
} else {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
sds.busiest_nr_running <= sds.busiest_group_weight)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
out_balanced:
/*
* There is no obvious imbalance. But check if we can do some balancing
* to save power.
*/
if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
return sds.busiest;
ret:
*imbalance = 0;
return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
enum cpu_idle_type idle, unsigned long imbalance,
const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
int i;
for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE