KVM: enable in-kernel APIC INIT/SIPI handling

This patch enables INIT/SIPI handling using in-kernel APIC by
introducing a ->mp_state field to emulate the SMP state transition.

Signed-off-by: Qing He <qing.he@intel.com>
Signed-off-by: Xin Li <xin.b.li@intel.com>
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
index 608c530..225c541 100644
--- a/drivers/kvm/irq.c
+++ b/drivers/kvm/irq.c
@@ -79,7 +79,7 @@
 		wake_up_interruptible(&vcpu->wq);
 		++vcpu->stat.halt_wakeup;
 	}
-	if (vcpu->guest_mode)
+	if (vcpu->guest_mode && ipi_pcpu != smp_processor_id())
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
 }
 
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
index f324cfb..5f97e25 100644
--- a/drivers/kvm/irq.h
+++ b/drivers/kvm/irq.h
@@ -137,6 +137,7 @@
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 void kvm_free_apic(struct kvm_lapic *apic);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index dbb929d..5e318b6 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -326,6 +326,13 @@
 	u64 shadow_efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+#define VCPU_MP_STATE_RUNNABLE          0
+#define VCPU_MP_STATE_UNINITIALIZED     1
+#define VCPU_MP_STATE_INIT_RECEIVED     2
+#define VCPU_MP_STATE_SIPI_RECEIVED     3
+#define VCPU_MP_STATE_HALTED            4
+	int mp_state;
+	int sipi_vector;
 	u64 ia32_misc_enable_msr;
 
 	struct kvm_mmu mmu;
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index a25dfc9..f101430 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -249,6 +249,10 @@
 	vcpu->mmu.root_hpa = INVALID_PAGE;
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
+	if (!irqchip_in_kernel(kvm) || id == 0)
+		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+	else
+		vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
 	init_waitqueue_head(&vcpu->wq);
 
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
@@ -1371,7 +1375,7 @@
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
-static void kvm_vcpu_kernel_halt(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -1380,24 +1384,28 @@
 	/*
 	 * We will block until either an interrupt or a signal wakes us up
 	 */
-	while(!(irqchip_in_kernel(vcpu->kvm) && kvm_cpu_has_interrupt(vcpu))
-	      && !vcpu->irq_summary
-	      && !signal_pending(current)) {
+	while (!kvm_cpu_has_interrupt(vcpu)
+	       && !signal_pending(current)
+	       && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
+	       && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		vcpu_put(vcpu);
 		schedule();
 		vcpu_load(vcpu);
 	}
 
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&vcpu->wq, &wait);
-	set_current_state(TASK_RUNNING);
 }
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.halt_exits;
 	if (irqchip_in_kernel(vcpu->kvm)) {
-		kvm_vcpu_kernel_halt(vcpu);
+		vcpu->mp_state = VCPU_MP_STATE_HALTED;
+		kvm_vcpu_block(vcpu);
+		if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
+			return -EINTR;
 		return 1;
 	} else {
 		vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2001,6 +2009,12 @@
 
 	vcpu_load(vcpu);
 
+	if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+		kvm_vcpu_block(vcpu);
+		vcpu_put(vcpu);
+		return -EAGAIN;
+	}
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index aaf810b..57810fa 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -311,8 +311,8 @@
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode)
 {
-	int result = 0;
-	int orig_irr;
+	int orig_irr, result = 0;
+	struct kvm_vcpu *vcpu = apic->vcpu;
 
 	switch (delivery_mode) {
 	case APIC_DM_FIXED:
@@ -334,7 +334,13 @@
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		kvm_vcpu_kick(apic->vcpu);
+		if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+			kvm_vcpu_kick(vcpu);
+		else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
+			vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+			if (waitqueue_active(&vcpu->wq))
+				wake_up_interruptible(&vcpu->wq);
+		}
 
 		result = (orig_irr == 0);
 		break;
@@ -351,11 +357,30 @@
 		break;
 
 	case APIC_DM_INIT:
-		printk(KERN_DEBUG "Ignoring guest INIT\n");
+		if (level) {
+			if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+				printk(KERN_DEBUG
+				       "INIT on a runnable vcpu %d\n",
+				       vcpu->vcpu_id);
+			vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+			kvm_vcpu_kick(vcpu);
+		} else {
+			printk(KERN_DEBUG
+			       "Ignoring de-assert INIT to vcpu %d\n",
+			       vcpu->vcpu_id);
+		}
+
 		break;
 
 	case APIC_DM_STARTUP:
-		printk(KERN_DEBUG "Ignoring guest STARTUP\n");
+		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
+		       vcpu->vcpu_id, vector);
+		if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+			vcpu->sipi_vector = vector;
+			vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+			if (waitqueue_active(&vcpu->wq))
+				wake_up_interruptible(&vcpu->wq);
+		}
 		break;
 
 	default:
@@ -791,7 +816,7 @@
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
 
-static void lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic;
 	int i;
@@ -836,6 +861,7 @@
 		   vcpu, GET_APIC_ID(apic_get_reg(apic, APIC_ID)),
 		   vcpu->apic_base, apic->base_address);
 }
+EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 {
@@ -864,7 +890,10 @@
 
 	atomic_inc(&apic->timer.pending);
 	if (waitqueue_active(q))
+	{
+		apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
 		wake_up_interruptible(q);
+	}
 	if (apic_lvtt_period(apic)) {
 		result = 1;
 		apic->timer.dev.expires = ktime_add_ns(
@@ -925,7 +954,7 @@
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
 	vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
 
-	lapic_reset(vcpu);
+	kvm_lapic_reset(vcpu);
 	apic->dev.read = apic_mmio_read;
 	apic->dev.write = apic_mmio_write;
 	apic->dev.in_range = apic_mmio_range;
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index b9cb9b8..e18d1f81 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1408,6 +1408,8 @@
 		goto out;
 	}
 
+	vmx->vcpu.rmode.active = 0;
+
 	vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	set_cr8(&vmx->vcpu, 0);
 	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -1421,8 +1423,13 @@
 	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
 	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
 	 */
-	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-	vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+	if (vmx->vcpu.vcpu_id == 0) {
+		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+	} else {
+		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
+		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
+	}
 	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
 	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 
@@ -1447,7 +1454,10 @@
 	vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
-	vmcs_writel(GUEST_RIP, 0xfff0);
+	if (vmx->vcpu.vcpu_id == 0)
+		vmcs_writel(GUEST_RIP, 0xfff0);
+	else
+		vmcs_writel(GUEST_RIP, 0);
 	vmcs_writel(GUEST_RSP, 0);
 
 	//todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
@@ -2197,6 +2207,14 @@
 	u8 fail;
 	int r;
 
+	if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+		printk("vcpu %d received sipi with vector # %x\n",
+		       vcpu->vcpu_id, vcpu->sipi_vector);
+		kvm_lapic_reset(vcpu);
+		vmx_vcpu_setup(vmx);
+		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+	}
+
 preempted:
 	if (vcpu->guest_debug.enabled)
 		kvm_guest_debug_pre(vcpu);