| From 6b078f5de7fc0851af4102493c7b5bb07e49c4cb Mon Sep 17 00:00:00 2001 |
| From: Andy Lutomirski <luto@amacapital.net> |
| Date: Thu, 10 Dec 2015 19:20:19 -0800 |
| Subject: x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader |
| |
| From: Andy Lutomirski <luto@amacapital.net> |
| |
| commit 6b078f5de7fc0851af4102493c7b5bb07e49c4cb upstream. |
| |
| The pvclock vdso code was too abstracted to understand easily |
| and excessively paranoid. Simplify it for a huge speedup. |
| |
| This opens the door for additional simplifications, as the vdso |
| no longer accesses the pvti for any vcpu other than vcpu 0. |
| |
| Before, vclock_gettime using kvm-clock took about 45ns on my |
| machine. With this change, it takes 29ns, which is almost as |
| fast as the pure TSC implementation. |
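| |
| The measurement harness is not included in this patch; as a rough |
| illustration only (not the exact setup used for the numbers above), the |
| per-call cost can be estimated from userspace by averaging a large |
| number of clock_gettime() calls on a kvm-clock guest: |
| |
| 	#include <stdio.h> |
| 	#include <time.h> |
| |
| 	int main(void) |
| 	{ |
| 		struct timespec ts, start, end; |
| 		const long iters = 10 * 1000 * 1000; |
| |
| 		clock_gettime(CLOCK_MONOTONIC, &start); |
| 		for (long i = 0; i < iters; i++) |
| 			clock_gettime(CLOCK_MONOTONIC, &ts); |
| 		clock_gettime(CLOCK_MONOTONIC, &end); |
| |
| 		/* average nanoseconds per clock_gettime() call */ |
| 		printf("%.1f ns/call\n", |
| 		       ((end.tv_sec - start.tv_sec) * 1e9 + |
| 			(end.tv_nsec - start.tv_nsec)) / iters); |
| 		return 0; |
| 	} |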
| |
| Signed-off-by: Andy Lutomirski <luto@amacapital.net> |
| Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Brian Gerst <brgerst@gmail.com> |
| Cc: Denys Vlasenko <dvlasenk@redhat.com> |
| Cc: H. Peter Anvin <hpa@zytor.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: linux-mm@kvack.org |
| Link: http://lkml.kernel.org/r/6b51dcc41f1b101f963945c5ec7093d72bdac429.1449702533.git.luto@kernel.org |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Cc: Jamie Iles <jamie.iles@oracle.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| arch/x86/entry/vdso/vclock_gettime.c | 79 +++++++++++++++++++---------------- |
| 1 file changed, 45 insertions(+), 34 deletions(-) |
| |
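| For reference, the pvclock_scale_delta() call added by the new reader |
| below performs the standard pvclock conversion, roughly |
| ns = ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul >> 32, |
| where a negative tsc_shift means a right shift instead. A simplified |
| sketch of that scaling, not the kernel's exact implementation: |
| |
| 	#include <stdint.h> |
| |
| 	static inline uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, |
| 					   int shift) |
| 	{ |
| 		if (shift < 0)			/* negative shift: scale down */ |
| 			delta >>= -shift; |
| 		else				/* positive shift: scale up */ |
| 			delta <<= shift; |
| 		/* 64x32 multiply, keep bits 32..95: (delta * mul_frac) >> 32 */ |
| 		return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32); |
| 	} |
| |
| The new vread_pvclock() adds this scaled delta to pvti->system_time. |
| |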
| --- a/arch/x86/entry/vdso/vclock_gettime.c |
| +++ b/arch/x86/entry/vdso/vclock_gettime.c |
| @@ -78,47 +78,58 @@ static notrace const struct pvclock_vsys |
| |
| static notrace cycle_t vread_pvclock(int *mode) |
| { |
| - const struct pvclock_vsyscall_time_info *pvti; |
| + const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; |
| cycle_t ret; |
| - u64 last; |
| - u32 version; |
| - u8 flags; |
| - unsigned cpu, cpu1; |
| - |
| + u64 tsc, pvti_tsc; |
| + u64 last, delta, pvti_system_time; |
| + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; |
| |
| /* |
| - * Note: hypervisor must guarantee that: |
| - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. |
| - * 2. that per-CPU pvclock time info is updated if the |
| - * underlying CPU changes. |
| - * 3. that version is increased whenever underlying CPU |
| - * changes. |
| + * Note: The kernel and hypervisor must guarantee that cpu ID |
| + * number maps 1:1 to per-CPU pvclock time info. |
| + * |
| + * Because the hypervisor is entirely unaware of guest userspace |
| + * preemption, it cannot guarantee that per-CPU pvclock time |
| + * info is updated if the underlying CPU changes or that that |
| + * version is increased whenever underlying CPU changes. |
| + * |
| + * On KVM, we are guaranteed that pvti updates for any vCPU are |
| + * atomic as seen by *all* vCPUs. This is an even stronger |
| + * guarantee than we get with a normal seqlock. |
| * |
| + * On Xen, we don't appear to have that guarantee, but Xen still |
| + * supplies a valid seqlock using the version field. |
| + * |
| + * We only do pvclock vdso timing at all if |
| + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to |
| + * mean that all vCPUs have matching pvti and that the TSC is |
| + * synced, so we can just look at vCPU 0's pvti. |
| */ |
| - do { |
| - cpu = __getcpu() & VGETCPU_CPU_MASK; |
| - /* TODO: We can put vcpu id into higher bits of pvti.version. |
| - * This will save a couple of cycles by getting rid of |
| - * __getcpu() calls (Gleb). |
| - */ |
| - |
| - pvti = get_pvti(cpu); |
| - |
| - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); |
| - |
| - /* |
| - * Test we're still on the cpu as well as the version. |
| - * We could have been migrated just after the first |
| - * vgetcpu but before fetching the version, so we |
| - * wouldn't notice a version change. |
| - */ |
| - cpu1 = __getcpu() & VGETCPU_CPU_MASK; |
| - } while (unlikely(cpu != cpu1 || |
| - (pvti->pvti.version & 1) || |
| - pvti->pvti.version != version)); |
| |
| - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) |
| + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { |
| *mode = VCLOCK_NONE; |
| + return 0; |
| + } |
| + |
| + do { |
| + version = pvti->version; |
| + |
| + /* This is also a read barrier, so we'll read version first. */ |
| + tsc = rdtsc_ordered(); |
| + |
| + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; |
| + pvti_tsc_shift = pvti->tsc_shift; |
| + pvti_system_time = pvti->system_time; |
| + pvti_tsc = pvti->tsc_timestamp; |
| + |
| + /* Make sure that the version double-check is last. */ |
| + smp_rmb(); |
| + } while (unlikely((version & 1) || version != pvti->version)); |
| + |
| + delta = tsc - pvti_tsc; |
| + ret = pvti_system_time + |
| + pvclock_scale_delta(delta, pvti_tsc_to_system_mul, |
| + pvti_tsc_shift); |
| |
| /* refer to tsc.c read_tsc() comment for rationale */ |
| last = gtod->cycle_last; |