| From d670ec13178d0fd8680e6742a2bc6e04f28f87d8 Mon Sep 17 00:00:00 2001 |
| From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| Date: Thu, 1 Sep 2011 12:42:04 +0200 |
| Subject: posix-cpu-timers: Cure SMP wobbles |
| |
| From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| |
| commit d670ec13178d0fd8680e6742a2bc6e04f28f87d8 upstream. |
| |
| David reported: |
| |
| Attached below is a watered-down version of rt/tst-cpuclock2.c from |
| GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or |
| similar. |
| |
| Run it several times, and you will see cases where the main thread |
| will measure a process clock difference before and after the nanosleep |
| which is smaller than the cpu-burner thread's individual thread clock |
| difference. This doesn't make any sense since the cpu-burner thread |
| is part of the top-level process's thread group. |
| |
| I've reproduced this on both x86-64 and sparc64 (using both 32-bit and |
| 64-bit binaries). |
| |
| For example: |
| |
| [davem@boricha build-x86_64-linux]$ ./test |
| process: before(0.001221967) after(0.498624371) diff(497402404) |
| thread: before(0.000081692) after(0.498316431) diff(498234739) |
| self: before(0.001223521) after(0.001240219) diff(16698) |
| [davem@boricha build-x86_64-linux]$ |
| |
| The diff of 'process' should always be >= the diff of 'thread'. |
| |
| I make sure to wrap the 'thread' clock measurements the most tightly |
| around the nanosleep() call, and that the 'process' clock measurements |
| are the outer-most ones. |
| |
| --- |
| #include <unistd.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <time.h> |
| #include <fcntl.h> |
| #include <string.h> |
| #include <errno.h> |
| #include <pthread.h> |
| |
| static pthread_barrier_t barrier; |
| |
| static void *chew_cpu(void *arg) |
| { |
| pthread_barrier_wait(&barrier); |
| while (1) |
| __asm__ __volatile__("" : : : "memory"); |
| return NULL; |
| } |
| |
| int main(void) |
| { |
| clockid_t process_clock, my_thread_clock, th_clock; |
| struct timespec process_before, process_after; |
| struct timespec me_before, me_after; |
| struct timespec th_before, th_after; |
| struct timespec sleeptime; |
| unsigned long diff; |
| pthread_t th; |
| int err; |
| |
| err = clock_getcpuclockid(0, &process_clock); |
| if (err) |
| return 1; |
| |
| err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); |
| if (err) |
| return 1; |
| |
| pthread_barrier_init(&barrier, NULL, 2); |
| err = pthread_create(&th, NULL, chew_cpu, NULL); |
| if (err) |
| return 1; |
| |
| err = pthread_getcpuclockid(th, &th_clock); |
| if (err) |
| return 1; |
| |
| pthread_barrier_wait(&barrier); |
| |
| err = clock_gettime(process_clock, &process_before); |
| if (err) |
| return 1; |
| |
| err = clock_gettime(my_thread_clock, &me_before); |
| if (err) |
| return 1; |
| |
| err = clock_gettime(th_clock, &th_before); |
| if (err) |
| return 1; |
| |
| sleeptime.tv_sec = 0; |
| sleeptime.tv_nsec = 500000000; |
| nanosleep(&sleeptime, NULL); |
| |
| err = clock_gettime(th_clock, &th_after); |
| if (err) |
| return 1; |
| |
| err = clock_gettime(my_thread_clock, &me_after); |
| if (err) |
| return 1; |
| |
| err = clock_gettime(process_clock, &process_after); |
| if (err) |
| return 1; |
| |
| diff = process_after.tv_nsec - process_before.tv_nsec; |
| printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", |
| process_before.tv_sec, process_before.tv_nsec, |
| process_after.tv_sec, process_after.tv_nsec, diff); |
| diff = th_after.tv_nsec - th_before.tv_nsec; |
| printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", |
| th_before.tv_sec, th_before.tv_nsec, |
| th_after.tv_sec, th_after.tv_nsec, diff); |
| diff = me_after.tv_nsec - me_before.tv_nsec; |
| printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", |
| me_before.tv_sec, me_before.tv_nsec, |
| me_after.tv_sec, me_after.tv_nsec, diff); |
| |
| return 0; |
| } |
| |
| This is due to us using p->se.sum_exec_runtime in |
| thread_group_cputime() where we iterate the thread group and sum all |
| data. This does not take time since the last schedule operation (tick |
| or otherwise) into account. We can cure this by using |
| task_sched_runtime() at the cost of having to take locks. |
| |
| This also means we can (and must) do away with |
| thread_group_sched_runtime() since the modified thread_group_cputime() |
| is now more accurate and would deadlock when called from |
| thread_group_sched_runtime(). |
| |
| Aside of that it makes the function safe on 32 bit systems. The old |
| code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a |
| 64bit value and could be changed on another cpu at the same time. |
| |
| Reported-by: David Miller <davem@davemloft.net> |
| Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins |
| Tested-by: David Miller <davem@davemloft.net> |
| Signed-off-by: Thomas Gleixner <tglx@linutronix.de> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> |
| |
| --- |
| include/linux/sched.h | 1 - |
| kernel/posix-cpu-timers.c | 5 +++-- |
| kernel/sched.c | 24 ------------------------ |
| 3 files changed, 3 insertions(+), 27 deletions(-) |
| |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -1937,7 +1937,6 @@ static inline void disable_sched_clock_i |
| |
| extern unsigned long long |
| task_sched_runtime(struct task_struct *task); |
| -extern unsigned long long thread_group_sched_runtime(struct task_struct *task); |
| |
| /* sched_exec is called by processes performing an exec */ |
| #ifdef CONFIG_SMP |
| --- a/kernel/posix-cpu-timers.c |
| +++ b/kernel/posix-cpu-timers.c |
| @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st |
| do { |
| times->utime = cputime_add(times->utime, t->utime); |
| times->stime = cputime_add(times->stime, t->stime); |
| - times->sum_exec_runtime += t->se.sum_exec_runtime; |
| + times->sum_exec_runtime += task_sched_runtime(t); |
| } while_each_thread(tsk, t); |
| out: |
| rcu_read_unlock(); |
| @@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const |
| cpu->cpu = cputime.utime; |
| break; |
| case CPUCLOCK_SCHED: |
| - cpu->sched = thread_group_sched_runtime(p); |
| + thread_group_cputime(p, &cputime); |
| + cpu->sched = cputime.sum_exec_runtime; |
| break; |
| } |
| return 0; |
| --- a/kernel/sched.c |
| +++ b/kernel/sched.c |
| @@ -3713,30 +3713,6 @@ unsigned long long task_sched_runtime(st |
| } |
| |
| /* |
| - * Return sum_exec_runtime for the thread group. |
| - * In case the task is currently running, return the sum plus current's |
| - * pending runtime that have not been accounted yet. |
| - * |
| - * Note that the thread group might have other running tasks as well, |
| - * so the return value not includes other pending runtime that other |
| - * running tasks might have. |
| - */ |
| -unsigned long long thread_group_sched_runtime(struct task_struct *p) |
| -{ |
| - struct task_cputime totals; |
| - unsigned long flags; |
| - struct rq *rq; |
| - u64 ns; |
| - |
| - rq = task_rq_lock(p, &flags); |
| - thread_group_cputime(p, &totals); |
| - ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
| - task_rq_unlock(rq, p, &flags); |
| - |
| - return ns; |
| -} |
| - |
| -/* |
| * Account user cpu time to a process. |
| * @p: the process that the cpu time gets accounted to |
| * @cputime: the cpu time spent in user space since the last update |