| From 833f32d763028c1bb371c64f457788b933773b3e Mon Sep 17 00:00:00 2001 |
| From: John Stultz <john.stultz@linaro.org> |
| Date: Thu, 11 Jun 2015 15:54:55 -0700 |
| Subject: time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap |
| second edge |
| |
| commit 833f32d763028c1bb371c64f457788b933773b3e upstream. |
| |
| Currently, leapsecond adjustments are done at tick time. As a result, |
| the leapsecond was applied at the first timer tick *after* the |
| leapsecond (~1-10ms late depending on HZ), rather then exactly on the |
| second edge. |
| |
| This was in part historical from back when we were always tick based, |
| but correcting this has since been avoided because it adds extra |
| conditional checks in the gettime fastpath, which has performance |
| overhead. |
| |
| However, it was recently pointed out that ABS_TIME CLOCK_REALTIME |
| timers set for right after the leapsecond could fire a second early, |
| since some timers may be expired before we trigger the timekeeping |
| timer, which then applies the leapsecond. |
| |
| This isn't quite as bad as it sounds, since behaviorally it is similar |
| to what is possible w/ ntpd-made leapsecond adjustments done w/o using |
| the kernel discipline, where, due to latencies, timers may fire just |
| prior to the settimeofday call. (Also, one should note that all |
| applications using CLOCK_REALTIME timers should always be careful, |
| since they are prone to quirks from settimeofday() disturbances.) |
| |
| However, the purpose of having the kernel do the leap adjustment is to |
| avoid such latencies, so I think this is worth fixing. |
| |
| So in order to properly keep those timers from firing a second early, |
| this patch modifies the ntp and timekeeping logic so that we keep |
| enough state so that the ktime_get_update_offsets accessor, which |
| provides the hrtimer core the current time, can check and apply the |
| leapsecond adjustment on the second edge. This prevents the hrtimer |
| core from expiring timers too early. |
| |
| This patch does not modify any other time read path, so no additional |
| overhead is incurred. However, this also means that the leap-second |
| continues to be applied at tick time for all other read-paths. |
| |
| Apologies to Richard Cochran, who pushed for similar changes years |
| ago, which I resisted due to the concerns about the performance |
| overhead. |
| |
| While I suspect this isn't extremely critical, folks who care about |
| strict leap-second correctness will likely want to watch |
| this. Potentially a -stable candidate eventually. |
| |
| Originally-suggested-by: Richard Cochran <richardcochran@gmail.com> |
| Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com> |
| Reported-by: Prarit Bhargava <prarit@redhat.com> |
| Signed-off-by: John Stultz <john.stultz@linaro.org> |
| Cc: Richard Cochran <richardcochran@gmail.com> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Jiri Bohac <jbohac@suse.cz> |
| Cc: Shuah Khan <shuahkh@osg.samsung.com> |
| Cc: Ingo Molnar <mingo@kernel.org> |
| Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org |
| Signed-off-by: Thomas Gleixner <tglx@linutronix.de> |
| [Yadi: Move do_adjtimex to timekeeping.c and solve context issues] |
| Signed-off-by: Hu <yadi.hu@windriver.com> |
| Signed-off-by: Zefan Li <lizefan@huawei.com> |
| --- |
| kernel/time/ntp.c | 45 ++++++++++++++++++++++++++++++++++++++------- |
| kernel/time/timekeeping.c | 37 +++++++++++++++++++++++++++++++++++-- |
| 2 files changed, 73 insertions(+), 9 deletions(-) |
| |
| --- a/kernel/time/ntp.c |
| +++ b/kernel/time/ntp.c |
| @@ -34,6 +34,7 @@ unsigned long tick_nsec; |
| static u64 tick_length; |
| static u64 tick_length_base; |
| |
| +#define SECS_PER_DAY 86400 |
| #define MAX_TICKADJ 500LL /* usecs */ |
| #define MAX_TICKADJ_SCALED \ |
| (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) |
| @@ -78,6 +79,9 @@ static long time_adjust; |
| /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ |
| static s64 ntp_tick_adj; |
| |
| +/* second value of the next pending leapsecond, or KTIME_MAX if no leap */ |
| +static s64 ntp_next_leap_sec = KTIME_MAX; |
| + |
| #ifdef CONFIG_NTP_PPS |
| |
| /* |
| @@ -354,6 +358,8 @@ void ntp_clear(void) |
| time_maxerror = NTP_PHASE_LIMIT; |
| time_esterror = NTP_PHASE_LIMIT; |
| |
| + ntp_next_leap_sec = KTIME_MAX; |
| + |
| ntp_update_frequency(); |
| |
| tick_length = tick_length_base; |
| @@ -377,6 +383,21 @@ u64 ntp_tick_length(void) |
| return ret; |
| } |
| |
| +/** |
| + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t |
| + * |
| + * Provides the time of the next leapsecond insertion (TIME_INS state with |
| + * STA_INS set) against CLOCK_REALTIME as a ktime_t. Returns KTIME_MAX otherwise. |
| + */ |
| +ktime_t ntp_get_next_leap(void) |
| +{ |
| + ktime_t ret; |
| + |
| + if ((time_state == TIME_INS) && (time_status & STA_INS)) |
| + return ktime_set(ntp_next_leap_sec, 0); |
| + ret.tv64 = KTIME_MAX; |
| + return ret; |
| +} |
| |
| /* |
| * this routine handles the overflow of the microsecond field |
| @@ -403,15 +424,21 @@ int second_overflow(unsigned long secs) |
| */ |
| switch (time_state) { |
| case TIME_OK: |
| - if (time_status & STA_INS) |
| + if (time_status & STA_INS) { |
| time_state = TIME_INS; |
| - else if (time_status & STA_DEL) |
| + ntp_next_leap_sec = secs + SECS_PER_DAY - |
| + (secs % SECS_PER_DAY); |
| + } else if (time_status & STA_DEL) { |
| time_state = TIME_DEL; |
| + ntp_next_leap_sec = secs + SECS_PER_DAY - |
| + ((secs+1) % SECS_PER_DAY); |
| + } |
| break; |
| case TIME_INS: |
| - if (!(time_status & STA_INS)) |
| + if (!(time_status & STA_INS)) { |
| + ntp_next_leap_sec = KTIME_MAX; |
| time_state = TIME_OK; |
| - else if (secs % 86400 == 0) { |
| + } else if (secs % SECS_PER_DAY == 0) { |
| leap = -1; |
| time_state = TIME_OOP; |
| time_tai++; |
| @@ -420,10 +447,12 @@ int second_overflow(unsigned long secs) |
| } |
| break; |
| case TIME_DEL: |
| - if (!(time_status & STA_DEL)) |
| + if (!(time_status & STA_DEL)) { |
| + ntp_next_leap_sec = KTIME_MAX; |
| time_state = TIME_OK; |
| - else if ((secs + 1) % 86400 == 0) { |
| + } else if ((secs + 1) % SECS_PER_DAY == 0) { |
| leap = 1; |
| + ntp_next_leap_sec = KTIME_MAX; |
| time_tai--; |
| time_state = TIME_WAIT; |
| printk(KERN_NOTICE |
| @@ -431,6 +460,7 @@ int second_overflow(unsigned long secs) |
| } |
| break; |
| case TIME_OOP: |
| + ntp_next_leap_sec = KTIME_MAX; |
| time_state = TIME_WAIT; |
| break; |
| |
| @@ -549,6 +579,7 @@ static inline void process_adj_status(st |
| if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { |
| time_state = TIME_OK; |
| time_status = STA_UNSYNC; |
| + ntp_next_leap_sec = KTIME_MAX; |
| /* restart PPS frequency calibration */ |
| pps_reset_freq_interval(); |
| } |
| @@ -619,7 +650,7 @@ static inline void process_adjtimex_mode |
| * adjtimex mainly allows reading (and writing, if superuser) of |
| * kernel time-keeping variables. used by xntpd. |
| */ |
| -int do_adjtimex(struct timex *txc) |
| +int __do_adjtimex(struct timex *txc) |
| { |
| struct timespec ts; |
| int result; |
| --- a/kernel/time/timekeeping.c |
| +++ b/kernel/time/timekeeping.c |
| @@ -21,6 +21,9 @@ |
| #include <linux/tick.h> |
| #include <linux/stop_machine.h> |
| |
| +extern ktime_t ntp_get_next_leap(void); |
| +extern int __do_adjtimex(struct timex *); |
| + |
| /* Structure holding internal timekeeping values. */ |
| struct timekeeper { |
| /* Current clocksource used for timekeeping. */ |
| @@ -30,6 +33,8 @@ struct timekeeper { |
| /* The shift value of the current clocksource. */ |
| int shift; |
| |
| + /* CLOCK_MONOTONIC time value of a pending leap-second */ |
| + ktime_t next_leap_ktime; |
| /* Number of clock cycles in one NTP interval. */ |
| cycle_t cycle_interval; |
| /* Number of clock shifted nano seconds in one NTP interval. */ |
| @@ -186,6 +191,17 @@ static void update_rt_offset(void) |
| timekeeper.offs_real = timespec_to_ktime(tmp); |
| } |
| |
| +/* |
| + * tk_update_leap_state - cache ntp_get_next_leap() in tk->next_leap_ktime |
| + */ |
| +static inline void tk_update_leap_state(struct timekeeper *tk) |
| +{ |
| + tk->next_leap_ktime = ntp_get_next_leap(); |
| + if (tk->next_leap_ktime.tv64 != KTIME_MAX) |
| + /* Convert to monotonic time via offs_real; KTIME_MAX stays untouched */ |
| + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); |
| +} |
| + |
| /* must hold write on timekeeper.lock */ |
| static void timekeeping_update(bool clearntp) |
| { |
| @@ -193,6 +209,7 @@ static void timekeeping_update(bool clea |
| timekeeper.ntp_error = 0; |
| ntp_clear(); |
| } |
| + tk_update_leap_state(&timekeeper); |
| update_rt_offset(); |
| update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, |
| timekeeper.clock, timekeeper.mult); |
| @@ -1329,10 +1346,16 @@ ktime_t ktime_get_update_offsets(ktime_t |
| |
| *offs_real = timekeeper.offs_real; |
| *offs_boot = timekeeper.offs_boot; |
| + |
| + now = ktime_add_ns(ktime_set(secs, 0), nsecs); |
| + now = ktime_sub(now, *offs_real); |
| + |
| + /* Handle leapsecond insertion adjustments */ |
| + if (unlikely(now.tv64 >= timekeeper.next_leap_ktime.tv64)) |
| + *offs_real = ktime_sub(timekeeper.offs_real, ktime_set(1, 0)); |
| + |
| } while (read_seqretry(&timekeeper.lock, seq)); |
| |
| - now = ktime_add_ns(ktime_set(secs, 0), nsecs); |
| - now = ktime_sub(now, *offs_real); |
| return now; |
| } |
| #endif |
| @@ -1354,6 +1377,16 @@ ktime_t ktime_get_monotonic_offset(void) |
| } |
| EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); |
| |
| +/* |
| + * do_adjtimex() - call NTP __do_adjtimex(), then refresh next_leap_ktime. |
| + * NOTE(review): no timekeeper.lock is visibly taken around the update - confirm. */ |
| +int do_adjtimex(struct timex *txc) |
| +{ |
| + int ret; |
| + ret = __do_adjtimex(txc); |
| + tk_update_leap_state(&timekeeper); |
| + return ret; |
| +} |
| |
| /** |
| * xtime_update() - advances the timekeeping infrastructure |