From 9dd831da9bfeeff489dc3c21ae99a50e85dc5d46 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 10 Feb 2011 10:23:28 +0100
Subject: sched: Use group weight, idle cpu metrics to fix imbalances during idle

Commit: aae6d3ddd8b90f5b2c8d79a2b914d1706d124193 upstream

Currently we consider a sched domain to be well balanced when the imbalance
is less than the domain's imbalance_pct. As the number of cores and threads
increases, the current values of imbalance_pct (for example, 25% for a
NUMA domain) are not enough to detect imbalances like:

a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads),
24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on the other,
leading to an idle HT cpu.

b) On a hypothetical 2 socket NHM-EX system (each socket having 8 cores and
16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one
socket and 7 on the other, leaving one core in a socket idle while in the
other socket a core has both its HT siblings busy.

While this issue can be fixed by decreasing the domain's imbalance_pct
(by making it a function of the number of logical cpus in the domain), it
can potentially cause more task migrations across sched groups in an
overloaded case.

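To make case (a) concrete, here is the arithmetic (a sketch, assuming each
socket's load is roughly proportional to its task count): with the NUMA
domain's imbalance_pct of 125, the existing check

	100 * max_load <= imbalance_pct * this_load

compares 100 * 13 = 1300 against 125 * 11 = 1375, declares the 13/11 split
balanced, and the idle HT sibling never gets a task.
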
Fix this by using imbalance_pct only during newly_idle and busy load
balancing. During idle load balancing, instead check whether there is an
imbalance in the number of idle cpus between the busiest and this
sched_group, or whether the busiest group has more tasks than its weight
that the idle cpu in this group can pull.

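With the same assumption (one cpu-hogging task per busy logical cpu), the
new idle-balance check catches case (a): the busiest socket has
busiest_nr_running = 13 tasks but busiest_group_weight = 12 logical cpus,
so busiest_nr_running <= busiest_group_weight fails and the idle cpu forces
a balance even though the load ratio is still within imbalance_pct.
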
Reported-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sched.h |    1 +
 kernel/sched.c        |   36 +++++++++++++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 3 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -894,6 +894,7 @@ struct sched_group {
	 * single CPU.
	 */
	unsigned int cpu_power;
+	unsigned int group_weight;
 
	/*
	 * The CPUs this group covers.
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3534,13 +3534,16 @@ struct sd_lb_stats {
	unsigned long this_load_per_task;
	unsigned long this_nr_running;
	unsigned long this_has_capacity;
+	unsigned int this_idle_cpus;
 
	/* Statistics of the busiest group */
+	unsigned int busiest_idle_cpus;
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;
	unsigned long busiest_group_capacity;
	unsigned long busiest_has_capacity;
+	unsigned int busiest_group_weight;
 
	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3562,6 +3565,8 @@ struct sg_lb_stats {
	unsigned long sum_nr_running; /* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long group_capacity;
+	unsigned long idle_cpus;
+	unsigned long group_weight;
	int group_imb; /* Is there an imbalance in the group ? */
	int group_has_capacity; /* Is there extra capacity in the group? */
 };
@@ -3905,7 +3910,8 @@ static inline void update_sg_lb_stats(st
		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);
-
+		if (idle_cpu(i))
+			sgs->idle_cpus++;
	}
 
	/*
@@ -3939,6 +3945,7 @@ static inline void update_sg_lb_stats(st
		sgs->group_imb = 1;
 
	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_weight = group->group_weight;
 
	if (sgs->group_capacity > sgs->sum_nr_running)
		sgs->group_has_capacity = 1;
@@ -4004,13 +4011,16 @@ static inline void update_sd_lb_stats(st
		sds->this_nr_running = sgs.sum_nr_running;
		sds->this_load_per_task = sgs.sum_weighted_load;
		sds->this_has_capacity = sgs.group_has_capacity;
+		sds->this_idle_cpus = sgs.idle_cpus;
	} else if (sgs.avg_load > sds->max_load &&
		   (sgs.sum_nr_running > sgs.group_capacity ||
		    sgs.group_imb)) {
		sds->max_load = sgs.avg_load;
		sds->busiest = group;
		sds->busiest_nr_running = sgs.sum_nr_running;
+		sds->busiest_idle_cpus = sgs.idle_cpus;
		sds->busiest_group_capacity = sgs.group_capacity;
+		sds->busiest_group_weight = sgs.group_weight;
		sds->busiest_load_per_task = sgs.sum_weighted_load;
		sds->busiest_has_capacity = sgs.group_has_capacity;
		sds->group_imb = sgs.group_imb;
@@ -4235,8 +4245,26 @@ find_busiest_group(struct sched_domain *
	if (sds.this_load >= sds.avg_load)
		goto out_balanced;
 
-	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-		goto out_balanced;
+	/*
+	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+	 * And to check for busy balance use !idle_cpu instead of
+	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+	 * even when they are idle.
+	 */
+	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+			goto out_balanced;
+	} else {
+		/*
+		 * This cpu is idle. If the busiest group load doesn't
+		 * have more tasks than the number of available cpu's and
+		 * there is no imbalance between this and busiest group
+		 * wrt to idle cpu's, it is balanced.
+		 */
+		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+		    sds.busiest_nr_running <= sds.busiest_group_weight)
+			goto out_balanced;
+	}
 
 force_balance:
	/* Looks like there is an imbalance. Compute it */
@@ -8751,6 +8779,8 @@ static void init_sched_groups_power(int
	if (cpu != group_first_cpu(sd->groups))
		return;
 
+	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+
	child = sd->child;
 
	sd->groups->cpu_power = 0;
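
Below is a minimal standalone sketch (not part of the patch; idle_balanced()
is a hypothetical helper name) modeling the new decision from
find_busiest_group() with plain integers, so the changelog's scenarios can
be tried in userspace:

#include <stdio.h>

/*
 * Mirrors the patch's idle-balance test: the groups count as balanced
 * only if the idle-cpu counts differ by at most one AND the busiest
 * group is not running more tasks than it has logical cpus (its weight).
 */
static int idle_balanced(unsigned int this_idle_cpus,
			 unsigned int busiest_idle_cpus,
			 unsigned long busiest_nr_running,
			 unsigned int busiest_group_weight)
{
	return this_idle_cpus <= busiest_idle_cpus + 1 &&
	       busiest_nr_running <= busiest_group_weight;
}

int main(void)
{
	/* Case (a): busiest socket runs 13 tasks on 12 cpus; one cpu idle here. */
	printf("13/11 split: %s\n",
	       idle_balanced(1, 0, 13, 12) ? "balanced" : "force balance");

	/* Benign split: busiest runs 11 tasks on 12 cpus, one idle cpu per side. */
	printf("11/11 split: %s\n",
	       idle_balanced(1, 1, 11, 12) ? "balanced" : "force balance");
	return 0;
}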