| /* |
| * Copyright (c) 1991,1992,1995 Linus Torvalds |
| * Copyright (c) 1994 Alan Modra |
| * Copyright (c) 1995 Markus Kuhn |
| * Copyright (c) 1996 Ingo Molnar |
| * Copyright (c) 1998 Andrea Arcangeli |
| * Copyright (c) 2002,2006 Vojtech Pavlik |
| * Copyright (c) 2003 Andi Kleen |
| * |
| */ |
| |
| #include <linux/init.h> |
| #include <linux/interrupt.h> |
| #include <linux/time.h> |
| #include <linux/sysctl.h> |
| #include <linux/percpu.h> |
| #include <linux/kernel_stat.h> |
| #include <linux/posix-timers.h> |
| #include <linux/cpufreq.h> |
| #include <linux/clocksource.h> |
| #include <linux/sysdev.h> |
| |
| #include <asm/vsyscall.h> |
| #include <asm/delay.h> |
| #include <asm/time.h> |
| #include <asm/timer.h> |
| |
| #include <xen/evtchn.h> |
| #include <xen/sysctl.h> |
| #include <xen/interface/vcpu.h> |
| |
| #include <asm/i8253.h> |
| DEFINE_SPINLOCK(i8253_lock); |
| EXPORT_SYMBOL(i8253_lock); |
| |
| #ifdef CONFIG_X86_64 |
| volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; |
| #endif |
| |
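/*
 * Shift used by clocksource_xen below.  Since .mult is set to 1 << XEN_SHIFT,
 * the cycles-to-ns conversion (cycles * mult) >> shift is the identity:
 * xen_clocksource_read() already returns nanoseconds.
 */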
| #define XEN_SHIFT 22 |
| |
| unsigned int cpu_khz; /* Detected as we calibrate the TSC */ |
| EXPORT_SYMBOL(cpu_khz); |
| |
/* These are periodically updated in shared_info, and then copied here. */
| struct shadow_time_info { |
| u64 tsc_timestamp; /* TSC at last update of time vals. */ |
| u64 system_timestamp; /* Time, in nanosecs, since boot. */ |
| u32 tsc_to_nsec_mul; |
| u32 tsc_to_usec_mul; |
| int tsc_shift; |
| u32 version; |
| }; |
| static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); |
| static struct timespec shadow_tv; |
| static u32 shadow_tv_version; |
| |
| /* Keep track of last time we did processing/updating of jiffies and xtime. */ |
| static u64 processed_system_time; /* System time (ns) at last processing. */ |
| static DEFINE_PER_CPU(u64, processed_system_time); |
| |
| /* How much CPU time was spent blocked and how much was 'stolen'? */ |
| static DEFINE_PER_CPU(u64, processed_stolen_time); |
| static DEFINE_PER_CPU(u64, processed_blocked_time); |
| |
| /* Current runstate of each CPU (updated automatically by the hypervisor). */ |
| DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); |
| |
| /* Must be signed, as it's compared with s64 quantities which can be -ve. */ |
| #define NS_PER_TICK (1000000000LL/HZ) |
| |
| static struct vcpu_set_periodic_timer xen_set_periodic_tick = { |
| .period_ns = NS_PER_TICK |
| }; |
| |
| static void __clock_was_set(struct work_struct *unused) |
| { |
| clock_was_set(); |
| } |
| static DECLARE_WORK(clock_was_set_work, __clock_was_set); |
| |
| /* |
| * GCC 4.3 can turn loops over an induction variable into division. We do |
| * not support arbitrary 64-bit division, and so must break the induction. |
| */ |
| #define clobber_induction_variable(v) asm ( "" : "+r" (v) ) |
| |
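/* Bring a (sec, nsec) pair into canonical form, with 0 <= nsec < NSEC_PER_SEC. */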
| static inline void __normalize_time(time_t *sec, s64 *nsec) |
| { |
| while (*nsec >= NSEC_PER_SEC) { |
| clobber_induction_variable(*nsec); |
| (*nsec) -= NSEC_PER_SEC; |
| (*sec)++; |
| } |
| while (*nsec < 0) { |
| clobber_induction_variable(*nsec); |
| (*nsec) += NSEC_PER_SEC; |
| (*sec)--; |
| } |
| } |
| |
| /* Does this guest OS track Xen time, or set its wall clock independently? */ |
| static int independent_wallclock = 0; |
| static int __init __independent_wallclock(char *str) |
| { |
| independent_wallclock = 1; |
| return 1; |
| } |
| __setup("independent_wallclock", __independent_wallclock); |
| |
| int xen_independent_wallclock(void) |
| { |
| return independent_wallclock; |
| } |
| |
| /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */ |
| static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */ |
| static int __init __permitted_clock_jitter(char *str) |
| { |
| permitted_clock_jitter = simple_strtoul(str, NULL, 0); |
| return 1; |
| } |
| __setup("permitted_clock_jitter=", __permitted_clock_jitter); |
| |
| /* |
| * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, |
| * yielding a 64-bit result. |
| */ |
| static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) |
| { |
| u64 product; |
| #ifdef __i386__ |
| u32 tmp1, tmp2; |
| #endif |
| |
| if (shift < 0) |
| delta >>= -shift; |
| else |
| delta <<= shift; |
| |
| #ifdef __i386__ |
| __asm__ ( |
| "mul %5 ; " |
| "mov %4,%%eax ; " |
| "mov %%edx,%4 ; " |
| "mul %5 ; " |
| "xor %5,%5 ; " |
| "add %4,%%eax ; " |
| "adc %5,%%edx ; " |
| : "=A" (product), "=r" (tmp1), "=r" (tmp2) |
| : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); |
| #else |
| __asm__ ( |
| "mul %%rdx ; shrd $32,%%rdx,%%rax" |
| : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); |
| #endif |
| |
| return product; |
| } |
| |
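/*
 * Read a 64-bit value atomically.  On 32-bit kernels a plain load of a
 * u64 is not atomic and can tear against a concurrent update, so use
 * cmpxchg64 to perform the read; on 64-bit a plain load suffices.
 */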
| static inline u64 get64(volatile u64 *ptr) |
| { |
| #ifndef CONFIG_64BIT |
| return cmpxchg64(ptr, 0, 0); |
| #else |
| return *ptr; |
| #endif |
| } |
| |
| static inline u64 get64_local(volatile u64 *ptr) |
| { |
| #ifndef CONFIG_64BIT |
| return cmpxchg64_local(ptr, 0, 0); |
| #else |
| return *ptr; |
| #endif |
| } |
| |
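/*
 * Derive cpu_khz from Xen's TSC scaling parameters.  Xen converts TSC
 * deltas to ns as ((tsc << tsc_shift) * tsc_to_system_mul) >> 32, so the
 * TSC frequency in kHz is ((10^6 << 32) / tsc_to_system_mul) * 2^-tsc_shift.
 */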
| static void init_cpu_khz(void) |
| { |
| u64 __cpu_khz = 1000000ULL << 32; |
| struct vcpu_time_info *info = &vcpu_info(0)->time; |
| do_div(__cpu_khz, info->tsc_to_system_mul); |
| if (info->tsc_shift < 0) |
| cpu_khz = __cpu_khz << -info->tsc_shift; |
| else |
| cpu_khz = __cpu_khz >> info->tsc_shift; |
| } |
| |
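/* Nanoseconds elapsed since the shadow's tsc_timestamp, from the current TSC. */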
| static u64 get_nsec_offset(struct shadow_time_info *shadow) |
| { |
| u64 now, delta; |
| rdtscll(now); |
| delta = now - shadow->tsc_timestamp; |
| return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); |
| } |
| |
| static void __update_wallclock(time_t sec, long nsec) |
| { |
| long wtm_nsec, xtime_nsec; |
| time_t wtm_sec, xtime_sec; |
| u64 tmp, wc_nsec; |
| |
| /* Adjust wall-clock time base. */ |
| wc_nsec = processed_system_time; |
| wc_nsec += sec * (u64)NSEC_PER_SEC; |
| wc_nsec += nsec; |
| |
| /* Split wallclock base into seconds and nanoseconds. */ |
| tmp = wc_nsec; |
| xtime_nsec = do_div(tmp, 1000000000); |
| xtime_sec = (time_t)tmp; |
| |
| wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec); |
| wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec); |
| |
| set_normalized_timespec(&xtime, xtime_sec, xtime_nsec); |
| set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); |
| } |
| |
| static void update_wallclock(void) |
| { |
| shared_info_t *s = HYPERVISOR_shared_info; |
| |
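	/*
	 * Xen updates wc_sec/wc_nsec under a seqlock-like version protocol:
	 * wc_version is odd while an update is in progress.  Loop until we
	 * have read a consistent snapshot under a stable, even version.
	 */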
| do { |
| shadow_tv_version = s->wc_version; |
| rmb(); |
| shadow_tv.tv_sec = s->wc_sec; |
| shadow_tv.tv_nsec = s->wc_nsec; |
| rmb(); |
| } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version)); |
| |
| if (!independent_wallclock) |
| __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec); |
| } |
| |
| /* |
| * Reads a consistent set of time-base values from Xen, into a shadow data |
| * area. |
| */ |
| static void get_time_values_from_xen(unsigned int cpu) |
| { |
| struct vcpu_time_info *src; |
| struct shadow_time_info *dst; |
| unsigned long flags; |
| u32 pre_version, post_version; |
| |
| src = &vcpu_info(cpu)->time; |
| dst = &per_cpu(shadow_time, cpu); |
| |
| local_irq_save(flags); |
| |
| do { |
| pre_version = dst->version = src->version; |
| rmb(); |
| dst->tsc_timestamp = src->tsc_timestamp; |
| dst->system_timestamp = src->system_time; |
| dst->tsc_to_nsec_mul = src->tsc_to_system_mul; |
| dst->tsc_shift = src->tsc_shift; |
| rmb(); |
| post_version = src->version; |
| } while ((pre_version & 1) | (pre_version ^ post_version)); |
| |
| dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; |
| |
| local_irq_restore(flags); |
| } |
| |
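/* Does our shadow copy still match Xen's published time values? */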
| static inline int time_values_up_to_date(void) |
| { |
| rmb(); |
| return percpu_read(shadow_time.version) == vcpu_info_read(time.version); |
| } |
| |
| static void sync_xen_wallclock(unsigned long dummy); |
| static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0); |
| static void sync_xen_wallclock(unsigned long dummy) |
| { |
| time_t sec; |
| s64 nsec; |
| struct xen_platform_op op; |
| |
| BUG_ON(!is_initial_xendomain()); |
| if (!ntp_synced() || independent_wallclock) |
| return; |
| |
| write_seqlock_irq(&xtime_lock); |
| |
| sec = xtime.tv_sec; |
| nsec = xtime.tv_nsec; |
| __normalize_time(&sec, &nsec); |
| |
| op.cmd = XENPF_settime; |
| op.u.settime.secs = sec; |
| op.u.settime.nsecs = nsec; |
| op.u.settime.system_time = processed_system_time; |
| WARN_ON(HYPERVISOR_platform_op(&op)); |
| |
| update_wallclock(); |
| |
| write_sequnlock_irq(&xtime_lock); |
| |
| /* Once per minute. */ |
| mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ); |
| } |
| |
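/*
 * Xen system time for this CPU: the shadow system timestamp extrapolated
 * forward by the scaled TSC delta since the last shadow update.  Retries
 * if the shadow values are refreshed mid-read.
 */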
| static unsigned long long local_clock(void) |
| { |
| unsigned int cpu = get_cpu(); |
| struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); |
| u64 time; |
| u32 local_time_version; |
| |
| do { |
| local_time_version = shadow->version; |
| rdtsc_barrier(); |
| time = shadow->system_timestamp + get_nsec_offset(shadow); |
| if (!time_values_up_to_date()) |
| get_time_values_from_xen(cpu); |
| barrier(); |
| } while (local_time_version != shadow->version); |
| |
| put_cpu(); |
| |
| return time; |
| } |
| |
| /* |
| * Runstate accounting |
| */ |
| static void get_runstate_snapshot(struct vcpu_runstate_info *res) |
| { |
| u64 state_time; |
| struct vcpu_runstate_info *state; |
| |
| BUG_ON(preemptible()); |
| |
| state = &__get_cpu_var(runstate); |
| |
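	/*
	 * state_entry_time changes on every runstate update, so re-reading
	 * it after the copy detects a concurrent update by the hypervisor.
	 */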
| do { |
| state_time = get64_local(&state->state_entry_time); |
| *res = *state; |
| } while (get64_local(&state->state_entry_time) != state_time); |
| |
| WARN_ON_ONCE(res->state != RUNSTATE_running); |
| } |
| |
| /* |
| * Xen sched_clock implementation. Returns the number of unstolen |
| * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED |
| * states. |
| */ |
| unsigned long long sched_clock(void) |
| { |
| struct vcpu_runstate_info runstate; |
| cycle_t now; |
| u64 ret; |
| s64 offset; |
| |
| /* |
| * Ideally sched_clock should be called on a per-cpu basis |
| * anyway, so preempt should already be disabled, but that's |
| * not current practice at the moment. |
| */ |
| preempt_disable(); |
| |
| now = local_clock(); |
| |
| get_runstate_snapshot(&runstate); |
| |
| offset = now - runstate.state_entry_time; |
| if (offset < 0) |
| offset = 0; |
| |
| ret = offset + runstate.time[RUNSTATE_running] |
| + runstate.time[RUNSTATE_blocked]; |
| |
| preempt_enable(); |
| |
| return ret; |
| } |
| |
| unsigned long profile_pc(struct pt_regs *regs) |
| { |
| unsigned long pc = instruction_pointer(regs); |
| |
| if (!user_mode_vm(regs) && in_lock_functions(pc)) { |
| #ifdef CONFIG_FRAME_POINTER |
| return *(unsigned long *)(regs->bp + sizeof(long)); |
| #else |
| unsigned long *sp = |
| (unsigned long *)kernel_stack_pointer(regs); |
| |
| /* |
| * Return address is either directly at stack pointer |
| * or above a saved flags. Eflags has bits 22-31 zero, |
| * kernel addresses don't. |
| */ |
| if (sp[0] >> 22) |
| return sp[0]; |
| if (sp[1] >> 22) |
| return sp[1]; |
| #endif |
| } |
| |
| return pc; |
| } |
| EXPORT_SYMBOL(profile_pc); |
| |
| /* |
| * Default timer interrupt handler |
| */ |
| static irqreturn_t timer_interrupt(int irq, void *dev_id) |
| { |
| s64 delta, delta_cpu, stolen, blocked; |
| unsigned int i, cpu = smp_processor_id(); |
| struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); |
| struct vcpu_runstate_info runstate; |
| |
| /* Keep nmi watchdog up to date */ |
| inc_irq_stat(irq0_irqs); |
| |
| /* |
| * Here we are in the timer irq handler. We just have irqs locally |
| * disabled but we don't know if the timer_bh is running on the other |
| * CPU. We need to avoid to SMP race with it. NOTE: we don' t need |
| * the irq version of write_lock because as just said we have irq |
| * locally disabled. -arca |
| */ |
| write_seqlock(&xtime_lock); |
| |
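	/*
	 * delta tracks system-wide time not yet folded into jiffies/xtime;
	 * delta_cpu tracks this CPU's time not yet accounted to processes.
	 */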
| do { |
| get_time_values_from_xen(cpu); |
| |
| /* Obtain a consistent snapshot of elapsed wallclock cycles. */ |
| delta = delta_cpu = |
| shadow->system_timestamp + get_nsec_offset(shadow); |
| delta -= processed_system_time; |
| delta_cpu -= per_cpu(processed_system_time, cpu); |
| |
| get_runstate_snapshot(&runstate); |
| } while (!time_values_up_to_date()); |
| |
| if ((unlikely(delta < -(s64)permitted_clock_jitter) || |
| unlikely(delta_cpu < -(s64)permitted_clock_jitter)) |
| && printk_ratelimit()) { |
		printk(KERN_WARNING "Timer ISR/%u: Time went backwards: "
| "delta=%lld delta_cpu=%lld shadow=%lld " |
| "off=%lld processed=%lld cpu_processed=%lld\n", |
| cpu, delta, delta_cpu, shadow->system_timestamp, |
| (s64)get_nsec_offset(shadow), |
| processed_system_time, |
| per_cpu(processed_system_time, cpu)); |
| for (i = 0; i < num_online_cpus(); i++) |
| printk(" %d: %lld\n", i, |
| per_cpu(processed_system_time, i)); |
| } |
| |
| /* System-wide jiffy work. */ |
| if (delta >= NS_PER_TICK) { |
| do_div(delta, NS_PER_TICK); |
| processed_system_time += delta * NS_PER_TICK; |
| while (delta > HZ) { |
| clobber_induction_variable(delta); |
| do_timer(HZ); |
| delta -= HZ; |
| } |
| do_timer(delta); |
| } |
| |
| if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { |
| update_wallclock(); |
| if (keventd_up()) |
| schedule_work(&clock_was_set_work); |
| } |
| |
| write_sequnlock(&xtime_lock); |
| |
| /* |
| * Account stolen ticks. |
| * ensures that the ticks are accounted as stolen. |
| */ |
| stolen = runstate.time[RUNSTATE_runnable] |
| + runstate.time[RUNSTATE_offline] |
| - per_cpu(processed_stolen_time, cpu); |
| if ((stolen > 0) && (delta_cpu > 0)) { |
| delta_cpu -= stolen; |
| if (unlikely(delta_cpu < 0)) |
| stolen += delta_cpu; /* clamp local-time progress */ |
| do_div(stolen, NS_PER_TICK); |
| per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK; |
| per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK; |
| account_steal_time((cputime_t)stolen); |
| } |
| |
| /* |
| * Account blocked ticks. |
| * ensures that the ticks are accounted as idle/wait. |
| */ |
| blocked = runstate.time[RUNSTATE_blocked] |
| - per_cpu(processed_blocked_time, cpu); |
| if ((blocked > 0) && (delta_cpu > 0)) { |
| delta_cpu -= blocked; |
| if (unlikely(delta_cpu < 0)) |
| blocked += delta_cpu; /* clamp local-time progress */ |
| do_div(blocked, NS_PER_TICK); |
| per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK; |
| per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK; |
| account_idle_time((cputime_t)blocked); |
| } |
| |
| /* Account user/system ticks. */ |
| if (delta_cpu > 0) { |
| do_div(delta_cpu, NS_PER_TICK); |
| per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; |
| if (user_mode_vm(get_irq_regs())) |
| account_user_time(current, (cputime_t)delta_cpu, |
| (cputime_t)delta_cpu); |
| else if (current != idle_task(cpu)) |
| account_system_time(current, HARDIRQ_OFFSET, |
| (cputime_t)delta_cpu, |
| (cputime_t)delta_cpu); |
| else |
| account_idle_time((cputime_t)delta_cpu); |
| } |
| |
| /* Offlined for more than a few seconds? Avoid lockup warnings. */ |
| if (stolen > 5*HZ) |
| touch_softlockup_watchdog(); |
| |
| /* Local timer processing (see update_process_times()). */ |
| run_local_timers(); |
| rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs())); |
| printk_tick(); |
| scheduler_tick(); |
| run_posix_cpu_timers(current); |
| profile_tick(CPU_PROFILING); |
| |
| return IRQ_HANDLED; |
| } |
| |
| void mark_tsc_unstable(char *reason) |
| { |
| #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */ |
| tsc_unstable = 1; |
| #endif |
| } |
| EXPORT_SYMBOL_GPL(mark_tsc_unstable); |
| |
| static void init_missing_ticks_accounting(unsigned int cpu) |
| { |
| struct vcpu_runstate_info *runstate = setup_runstate_area(cpu); |
| |
| per_cpu(processed_blocked_time, cpu) = |
| runstate->time[RUNSTATE_blocked]; |
| per_cpu(processed_stolen_time, cpu) = |
| runstate->time[RUNSTATE_runnable] + |
| runstate->time[RUNSTATE_offline]; |
| } |
| |
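/*
 * Last value returned by the clocksource.  Per-CPU shadow parameters can
 * disagree slightly, so enforce global monotonicity by never returning a
 * value behind cs_last and advancing it with cmpxchg64.
 */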
| static cycle_t cs_last; |
| |
| static cycle_t xen_clocksource_read(struct clocksource *cs) |
| { |
| #ifdef CONFIG_SMP |
| cycle_t last = get64(&cs_last); |
| cycle_t ret = local_clock(); |
| |
| if (unlikely((s64)(ret - last) < 0)) { |
| if (last - ret > permitted_clock_jitter |
| && printk_ratelimit()) { |
| unsigned int cpu = get_cpu(); |
| struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); |
| |
| printk(KERN_WARNING "clocksource/%u: " |
| "Time went backwards: " |
| "ret=%Lx delta=%Ld shadow=%Lx offset=%Lx\n", |
| cpu, ret, ret - last, shadow->system_timestamp, |
| get_nsec_offset(shadow)); |
| put_cpu(); |
| } |
| return last; |
| } |
| |
| for (;;) { |
| cycle_t cur = cmpxchg64(&cs_last, last, ret); |
| |
| if (cur == last || (s64)(ret - cur) < 0) |
| return ret; |
| last = cur; |
| } |
| #else |
| return local_clock(); |
| #endif |
| } |
| |
| /* No locking required. Interrupts are disabled on all CPUs. */ |
| static void xen_clocksource_resume(void) |
| { |
| unsigned int cpu; |
| |
| init_cpu_khz(); |
| |
| for_each_online_cpu(cpu) { |
| switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, |
| &xen_set_periodic_tick)) { |
| case 0: |
| #if CONFIG_XEN_COMPAT <= 0x030004 |
| case -ENOSYS: |
| #endif |
| break; |
| default: |
| BUG(); |
| } |
| get_time_values_from_xen(cpu); |
| per_cpu(processed_system_time, cpu) = |
| per_cpu(shadow_time, 0).system_timestamp; |
| init_missing_ticks_accounting(cpu); |
| } |
| |
| processed_system_time = per_cpu(shadow_time, 0).system_timestamp; |
| |
| cs_last = local_clock(); |
| } |
| |
| static struct clocksource clocksource_xen = { |
| .name = "xen", |
| .rating = 400, |
| .read = xen_clocksource_read, |
| .mask = CLOCKSOURCE_MASK(64), |
| .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */ |
| .shift = XEN_SHIFT, |
| .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
| .resume = xen_clocksource_resume, |
| }; |
| |
| struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu) |
| { |
| struct vcpu_register_runstate_memory_area area; |
| struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); |
| int rc; |
| |
| set_xen_guest_handle(area.addr.h, runstate); |
| rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area); |
| if (rc) { |
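		/*
		 * The registration hypercall may not exist on older Xen
		 * (-ENOSYS); fall back to an all-zero runstate, which
		 * relies on RUNSTATE_running being 0.
		 */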
| BUILD_BUG_ON(RUNSTATE_running); |
| memset(runstate, 0, sizeof(*runstate)); |
| WARN_ON(rc != -ENOSYS); |
| } |
| |
| return runstate; |
| } |
| |
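/*
 * The "persistent clock" is the wallclock base at boot (from shared_info)
 * plus the system time accumulated since boot, truncated to seconds.
 */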
| void xen_read_persistent_clock(struct timespec *ts) |
| { |
| const shared_info_t *s = HYPERVISOR_shared_info; |
| u32 version, sec, nsec; |
| u64 delta; |
| |
| do { |
| version = s->wc_version; |
| rmb(); |
| sec = s->wc_sec; |
| nsec = s->wc_nsec; |
| rmb(); |
| } while ((s->wc_version & 1) | (version ^ s->wc_version)); |
| |
| delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec; |
| do_div(delta, NSEC_PER_SEC); |
| |
| ts->tv_sec = delta; |
| ts->tv_nsec = 0; |
| } |
| |
| int xen_update_persistent_clock(void) |
| { |
| if (!is_initial_xendomain()) |
| return -1; |
| mod_timer(&sync_xen_wallclock_timer, jiffies + 1); |
| return 0; |
| } |
| |
| /* Dynamically-mapped IRQ. */ |
| static int __read_mostly timer_irq = -1; |
| static struct irqaction timer_action = { |
| .handler = timer_interrupt, |
| .flags = IRQF_DISABLED|IRQF_TIMER, |
| .name = "timer" |
| }; |
| |
| static void __init setup_cpu0_timer_irq(void) |
| { |
| timer_irq = bind_virq_to_irqaction(VIRQ_TIMER, 0, &timer_action); |
| BUG_ON(timer_irq < 0); |
| } |
| |
| void __init time_init(void) |
| { |
| init_cpu_khz(); |
| printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n", |
| cpu_khz / 1000, cpu_khz % 1000); |
| |
| switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0, |
| &xen_set_periodic_tick)) { |
| case 0: |
| #if CONFIG_XEN_COMPAT <= 0x030004 |
| case -ENOSYS: |
| #endif |
| break; |
| default: |
| BUG(); |
| } |
| |
| get_time_values_from_xen(0); |
| |
| processed_system_time = per_cpu(shadow_time, 0).system_timestamp; |
| per_cpu(processed_system_time, 0) = processed_system_time; |
| init_missing_ticks_accounting(0); |
| |
| clocksource_register(&clocksource_xen); |
| |
| update_wallclock(); |
| |
| use_tsc_delay(); |
| |
| /* Cannot request_irq() until kmem is initialised. */ |
| late_time_init = setup_cpu0_timer_irq; |
| } |
| |
| /* Convert jiffies to system time. */ |
| u64 jiffies_to_st(unsigned long j) |
| { |
| unsigned long seq; |
| long delta; |
| u64 st; |
| |
| do { |
| seq = read_seqbegin(&xtime_lock); |
| delta = j - jiffies; |
| if (delta < 1) { |
| /* Triggers in some wrap-around cases, but that's okay: |
| * we just end up with a shorter timeout. */ |
| st = processed_system_time + NS_PER_TICK; |
| } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) { |
| /* Very long timeout means there is no pending timer. |
| * We indicate this to Xen by passing zero timeout. */ |
| st = 0; |
| } else { |
| st = processed_system_time + delta * (u64)NS_PER_TICK; |
| } |
| } while (read_seqretry(&xtime_lock, seq)); |
| |
| return st; |
| } |
| EXPORT_SYMBOL(jiffies_to_st); |
| |
| /* |
| * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu |
| * These functions are based on implementations from arch/s390/kernel/time.c |
| */ |
| static void stop_hz_timer(void) |
| { |
| struct vcpu_set_singleshot_timer singleshot; |
| unsigned int cpu = smp_processor_id(); |
| unsigned long j; |
| int rc; |
| |
| cpumask_set_cpu(cpu, nohz_cpu_mask); |
| |
	/*
	 * See the matching smp_mb in rcu_start_batch in rcupdate.c. These
	 * barriers ensure that if __rcu_pending (nested in rcu_needs_cpu)
	 * fetches a value of rcp->cur that matches rdp->quiescbatch and
	 * allows us to stop the hz timer, then the cpumasks created for
	 * subsequent values of cur in rcu_start_batch are guaranteed to
	 * pick up the updated nohz_cpu_mask and so will not depend on
	 * this cpu.
	 */
| |
| smp_mb(); |
| |
| /* Leave ourselves in tick mode if rcu or softirq or timer pending. */ |
| if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || |
| local_softirq_pending() || |
| (j = get_next_timer_interrupt(jiffies), |
| time_before_eq(j, jiffies))) { |
| cpumask_clear_cpu(cpu, nohz_cpu_mask); |
| j = jiffies + 1; |
| } |
| |
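	/*
	 * Program a one-shot timer for the next event, with half a tick of
	 * slack, presumably so we wake just after, rather than fractionally
	 * before, the target jiffy boundary.
	 */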
| singleshot.timeout_abs_ns = jiffies_to_st(j) + NS_PER_TICK/2; |
| singleshot.flags = 0; |
| rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot); |
| #if CONFIG_XEN_COMPAT <= 0x030004 |
| if (rc) { |
| BUG_ON(rc != -ENOSYS); |
| rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns); |
| } |
| #endif |
| BUG_ON(rc); |
| } |
| |
| static void start_hz_timer(void) |
| { |
| cpumask_clear_cpu(smp_processor_id(), nohz_cpu_mask); |
| } |
| |
| void xen_safe_halt(void) |
| { |
| stop_hz_timer(); |
| /* Blocking includes an implicit local_irq_enable(). */ |
| HYPERVISOR_block(); |
| start_hz_timer(); |
| } |
| EXPORT_SYMBOL(xen_safe_halt); |
| |
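/*
 * With interrupts disabled there is no event that could wake a blocked
 * VCPU, so halting here means taking the VCPU offline via VCPUOP_down.
 */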
| void xen_halt(void) |
| { |
| if (irqs_disabled()) |
| VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); |
| } |
| EXPORT_SYMBOL(xen_halt); |
| |
| #ifdef CONFIG_SMP |
| int __cpuinit local_setup_timer(unsigned int cpu) |
| { |
| int seq, irq; |
| |
| BUG_ON(cpu == 0); |
| |
| switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, |
| &xen_set_periodic_tick)) { |
| case 0: |
| #if CONFIG_XEN_COMPAT <= 0x030004 |
| case -ENOSYS: |
| #endif |
| break; |
| default: |
| BUG(); |
| } |
| |
| do { |
| seq = read_seqbegin(&xtime_lock); |
| /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */ |
| per_cpu(processed_system_time, cpu) = |
| per_cpu(shadow_time, 0).system_timestamp; |
| init_missing_ticks_accounting(cpu); |
| } while (read_seqretry(&xtime_lock, seq)); |
| |
| irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action); |
| if (irq < 0) |
| return irq; |
| BUG_ON(timer_irq != irq); |
| |
| return 0; |
| } |
| |
| void __cpuinit local_teardown_timer(unsigned int cpu) |
| { |
| BUG_ON(cpu == 0); |
| unbind_from_per_cpu_irq(timer_irq, cpu, &timer_action); |
| } |
| #endif |
| |
| #ifdef CONFIG_CPU_FREQ |
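/*
 * Xen scales guest time using the TSC, so when cpufreq changes the TSC
 * frequency (i.e. the CPU lacks a constant TSC), report the new frequency
 * to the hypervisor so it can adjust its time calculations.
 */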
| static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, |
| void *data) |
| { |
| struct cpufreq_freqs *freq = data; |
| struct xen_platform_op op; |
| |
| if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) |
| return 0; |
| |
| if (val == CPUFREQ_PRECHANGE) |
| return 0; |
| |
| op.cmd = XENPF_change_freq; |
| op.u.change_freq.flags = 0; |
| op.u.change_freq.cpu = freq->cpu; |
| op.u.change_freq.freq = (u64)freq->new * 1000; |
| WARN_ON(HYPERVISOR_platform_op(&op)); |
| |
| return 0; |
| } |
| |
| static struct notifier_block time_cpufreq_notifier_block = { |
| .notifier_call = time_cpufreq_notifier |
| }; |
| |
| static int __init cpufreq_time_setup(void) |
| { |
	if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
				      CPUFREQ_TRANSITION_NOTIFIER)) {
| printk(KERN_ERR "failed to set up cpufreq notifier\n"); |
| return -ENODEV; |
| } |
| return 0; |
| } |
| |
| core_initcall(cpufreq_time_setup); |
| #endif |
| |
| /* |
| * /proc/sys/xen: This really belongs in another file. It can stay here for |
| * now however. |
| */ |
| static ctl_table xen_subtable[] = { |
| { |
| .ctl_name = CTL_XEN_INDEPENDENT_WALLCLOCK, |
| .procname = "independent_wallclock", |
| .data = &independent_wallclock, |
| .maxlen = sizeof(independent_wallclock), |
| .mode = 0644, |
| .strategy = sysctl_data, |
| .proc_handler = proc_dointvec |
| }, |
| { |
| .ctl_name = CTL_XEN_PERMITTED_CLOCK_JITTER, |
| .procname = "permitted_clock_jitter", |
| .data = &permitted_clock_jitter, |
| .maxlen = sizeof(permitted_clock_jitter), |
| .mode = 0644, |
| .strategy = sysctl_data, |
| .proc_handler = proc_doulongvec_minmax |
| }, |
| { } |
| }; |
| static ctl_table xen_table[] = { |
| { |
| .ctl_name = CTL_XEN, |
| .procname = "xen", |
| .mode = 0555, |
| .child = xen_subtable |
| }, |
| { } |
| }; |
| static int __init xen_sysctl_init(void) |
| { |
| (void)register_sysctl_table(xen_table); |
| return 0; |
| } |
| __initcall(xen_sysctl_init); |