Merge commit 'v2.6.32.48' into xen/next-2.6.32

* commit 'v2.6.32.48':
  Linux 2.6.32.48
  Revert "powerpc/mpic: Fix problem that affinity is not updated"
  Revert "ASoC: wm8940: Properly set codec->dapm.bias_level"
  Revert "genirq: Add IRQF_RESUME_EARLY and resume such IRQs earlier"
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c840e7d..9238f05 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -113,6 +113,7 @@
 			More X86-64 boot options can be found in
 			Documentation/x86/x86_64/boot-options.txt .
 	X86	Either 32bit or 64bit x86 (same as X86-32+X86-64)
+	XEN	Xen support is enabled
 
 In addition, the following text indicates that the option:
 
@@ -2765,6 +2766,18 @@
 	xd=		[HW,XT] Original XT pre-IDE (RLL encoded) disks.
 	xd_geo=		See header of drivers/block/xd.c.
 
+	xen_emul_unplug=		[HW,X86,XEN]
+			Unplug Xen emulated devices
+			Format: [unplug0,][unplug1]
+			ide-disks -- unplug primary master IDE devices
+			aux-ide-disks -- unplug non-primary-master IDE devices
+			nics -- unplug network devices
+			all -- unplug all emulated devices (NICs and IDE disks)
+			unnecessary -- unplugging emulated devices is
+				unnecessary even if the host did not respond to
+				the unplug protocol
+			never -- do not unplug even if version check succeeds
+
 	xirc2ps_cs=	[NET,PCMCIA]
 			Format:
 			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 29a6ff8..81f9b94 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -267,10 +267,14 @@
 
   iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
   implementation:
-    swiotlb=<pages>[,force]
+    swiotlb=[npages=<pages>]
+    swiotlb=[force]
+    swiotlb=[overflow=<size>]
+
     <pages>            Prereserve that many 128K pages for the software IO
                        bounce buffering.
     force              Force all IO through the software TLB.
+    <size>             Size in bytes of the overflow buffer.
 
   Settings for the IBM Calgary hardware IOMMU currently found in IBM
   pSeries and xSeries machines:
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 8d3c79c..7d09a09 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -73,7 +73,7 @@
 	if (!dev->dma_mask)
 		return 0;
 
-	return addr + size <= *dev->dma_mask;
+	return addr + size - 1 <= *dev->dma_mask;
 }
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
index dcbaea7..f0acde6 100644
--- a/arch/ia64/include/asm/swiotlb.h
+++ b/arch/ia64/include/asm/swiotlb.h
@@ -4,8 +4,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/swiotlb.h>
 
-extern int swiotlb_force;
-
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
 extern void pci_swiotlb_init(void);
diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h
index b8370c8..baa74c8 100644
--- a/arch/ia64/include/asm/xen/events.h
+++ b/arch/ia64/include/asm/xen/events.h
@@ -36,10 +36,6 @@
 	return !(ia64_psr(regs)->i);
 }
 
-static inline void handle_irq(int irq, struct pt_regs *regs)
-{
-	__do_IRQ(irq);
-}
 #define irq_ctx_init(cpu)	do { } while (0)
 
 #endif /* _ASM_IA64_XEN_EVENTS_H */
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 285aae8..53292ab 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -41,7 +41,7 @@
 void __init swiotlb_dma_init(void)
 {
 	dma_ops = &swiotlb_dma_ops;
-	swiotlb_init();
+	swiotlb_init(1);
 }
 
 void __init pci_swiotlb_init(void)
@@ -51,7 +51,7 @@
 		swiotlb = 1;
 		printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
 		machvec_init("dig");
-		swiotlb_init();
+		swiotlb_init(1);
 		dma_ops = &swiotlb_dma_ops;
 #else
 		panic("Unable to find Intel IOMMU");
diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c
index fd66b04..419c862 100644
--- a/arch/ia64/xen/suspend.c
+++ b/arch/ia64/xen/suspend.c
@@ -37,19 +37,14 @@
 	/* nothing */
 }
 
-void xen_pre_device_suspend(void)
+void
+xen_arch_pre_suspend()
 {
 	/* nothing */
 }
 
 void
-xen_pre_suspend()
-{
-	/* nothing */
-}
-
-void
-xen_post_suspend(int suspend_cancelled)
+xen_arch_post_suspend(int suspend_cancelled)
 {
 	if (suspend_cancelled)
 		return;
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index e281dae..80a973b 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -197,7 +197,7 @@
 	if (!dev->dma_mask)
 		return 0;
 
-	return addr + size <= *dev->dma_mask;
+	return addr + size - 1 <= *dev->dma_mask;
 }
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 53bcf3d..b152de3 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -345,7 +345,7 @@
 
 #ifdef CONFIG_SWIOTLB
 	if (ppc_swiotlb_enable)
-		swiotlb_init();
+		swiotlb_init(1);
 #endif
 
 	paging_init();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 00d3b65..dc5eede 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -559,7 +559,7 @@
 
 #ifdef CONFIG_SWIOTLB
 	if (ppc_swiotlb_enable)
-		swiotlb_init();
+		swiotlb_init(1);
 #endif
 
 	paging_init();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73ae02a..0266f87 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1896,6 +1896,10 @@
 	def_bool y
 	depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
 
+config PCI_XEN
+	bool
+	select SWIOTLB
+
 config PCI_DOMAINS
 	def_bool y
 	depends on PCI
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 18aa3f8..4413ba4 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -23,20 +23,16 @@
 #include <linux/irqreturn.h>
 
 #ifdef CONFIG_AMD_IOMMU
-extern int amd_iommu_init(void);
 extern int amd_iommu_init_dma_ops(void);
 extern int amd_iommu_init_passthrough(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
 extern void amd_iommu_flush_all_domains(void);
 extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_shutdown(void);
 extern void amd_iommu_apply_erratum_63(u16 devid);
 extern void amd_iommu_init_api(void);
 #else
-static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
-static inline void amd_iommu_shutdown(void) { }
 #endif
 
 #endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb..0918654 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@
 extern int use_calgary;
 
 #ifdef CONFIG_CALGARY_IOMMU
-extern int calgary_iommu_init(void);
 extern void detect_calgary(void);
 #else
-static inline int calgary_iommu_init(void) { return 1; }
 static inline void detect_calgary(void) { return; }
 #endif
 
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 6a25d5d..ac91eed 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -20,7 +20,8 @@
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
 #endif
 
-extern dma_addr_t bad_dma_address;
+#define DMA_ERROR_CODE	0
+
 extern int iommu_merge;
 extern struct device x86_dma_fallback_dev;
 extern int panic_on_overflow;
@@ -48,7 +49,7 @@
 	if (ops->mapping_error)
 		return ops->mapping_error(dev, dma_addr);
 
-	return (dma_addr == bad_dma_address);
+	return (dma_addr == DMA_ERROR_CODE);
 }
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -66,7 +67,7 @@
 	if (!dev->dma_mask)
 		return 0;
 
-	return addr + size <= *dev->dma_mask;
+	return addr + size - 1 <= *dev->dma_mask;
 }
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e61..fa3fd43 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -109,6 +109,8 @@
 extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
 extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
+extern u64 early_res_next_free(u64 start);
+extern u64 early_res_next_reserved(u64 addr, u64 max);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 extern unsigned long e820_end_of_ram_pfn(void);
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa..4ac5b0f 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@
 extern int gart_iommu_aperture_disabled;
 
 extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
+extern int gart_iommu_init(void);
 extern void __init gart_parse_options(char *);
 extern void gart_iommu_hole_init(void);
 
@@ -48,12 +47,6 @@
 static inline void early_gart_iommu_check(void)
 {
 }
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
 static inline void gart_parse_options(char *options)
 {
 }
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 3251e23..fa152cb 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -68,6 +68,7 @@
 extern int hpet_force_user;
 extern u8 hpet_msi_disable;
 extern int is_hpet_enabled(void);
+extern int disable_hpet(char *);
 extern int hpet_enable(void);
 extern void hpet_disable(void);
 extern unsigned long hpet_readl(unsigned long a);
@@ -108,6 +109,7 @@
 #else /* CONFIG_HPET_TIMER */
 
 static inline int hpet_enable(void) { return 0; }
+static inline int disable_hpet(char *s) { return 0; }
 static inline int is_hpet_enabled(void) { return 0; }
 #define hpet_readl(a) 0
 
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 439a9ac..bf88684 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -36,16 +36,28 @@
 	free_pgd_range(tlb, addr, end, floor, ceiling);
 }
 
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+	return *ptep;
+}
+
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 				   pte_t *ptep, pte_t pte)
 {
-	set_pte_at(mm, addr, ptep, pte);
+#if PAGETABLE_LEVELS >= 3
+	set_pmd((pmd_t *)ptep, native_make_pmd(native_pte_val(pte)));
+#else
+	set_pgd((pgd_t *)ptep, native_make_pgd(native_pte_val(pte)));
+#endif
 }
 
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 					    unsigned long addr, pte_t *ptep)
 {
-	return ptep_get_and_clear(mm, addr, ptep);
+	pte_t pte = huge_ptep_get(ptep);
+
+	set_huge_pte_at(mm, addr, ptep, __pte(0));
+	return pte;
 }
 
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -66,19 +78,25 @@
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 					   unsigned long addr, pte_t *ptep)
 {
-	ptep_set_wrprotect(mm, addr, ptep);
+	pte_t pte = huge_ptep_get(ptep);
+
+	pte = pte_wrprotect(pte);
+	set_huge_pte_at(mm, addr, ptep, pte);
 }
 
 static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 					     unsigned long addr, pte_t *ptep,
 					     pte_t pte, int dirty)
 {
-	return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
+	pte_t oldpte = huge_ptep_get(ptep);
+	int changed = !pte_same(oldpte, pte);
 
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
-	return *ptep;
+	if (changed && dirty) {
+		set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+		flush_tlb_page(vma, addr);
+	}
+
+	return changed;
 }
 
 static inline int arch_prepare_hugepage(struct page *page)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6a63b86..9ad387e 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -7,6 +7,10 @@
 #include <asm-generic/int-ll64.h>
 #include <asm/page.h>
 
+#include <xen/xen.h>
+
+extern int isapnp_disable;
+
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
 { type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -199,6 +203,18 @@
 extern void __iomem *early_memremap(resource_size_t phys_addr,
 				    unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#ifdef CONFIG_XEN
+struct bio_vec;
+
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+				      const struct bio_vec *vec2);
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)				\
+	(__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&				\
+	 (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#endif	/* CONFIG_XEN */
 
 #define IO_SPACE_LIMIT 0xffff
 
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 5f61f6e..b852da9 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -172,6 +172,7 @@
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 
 extern void probe_nr_irqs_gsi(void);
+extern int get_nr_irqs_gsi(void);
 
 extern int setup_ioapic_entry(int apic, int irq,
 			      struct IO_APIC_route_entry *entry,
@@ -201,4 +202,6 @@
 
 #endif
 
+void xen_io_apic_init(void);
+
 #endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21b..345c99c 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_IOMMU_H
 #define _ASM_X86_IOMMU_H
 
-extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
 extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e90a04..ba4dc7b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -120,6 +120,12 @@
  */
 #define MCE_SELF_VECTOR			0xeb
 
+#ifdef CONFIG_XEN
+/* Xen vector callback to receive events in a HVM domain */
+#define XEN_HVM_EVTCHN_CALLBACK		0xe9
+#endif
+
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
@@ -157,6 +163,14 @@
 #define CPU_VECTOR_LIMIT		(  8 * NR_CPUS      )
 #define IO_APIC_VECTOR_LIMIT		( 32 * MAX_IO_APICS )
 
+#ifndef __ASSEMBLY__
+# if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ)
+extern int nr_dynamic_irqs;
+# else
+#  define NR_DYNAMIC_IRQS			 256
+# endif
+#endif
+
 #ifdef CONFIG_X86_IO_APIC
 # ifdef CONFIG_SPARSE_IRQ
 #  define NR_IRQS					\
@@ -165,13 +179,13 @@
 		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
 # else
 #  if NR_CPUS < MAX_IO_APICS
-#   define NR_IRQS 			(NR_VECTORS + 4*CPU_VECTOR_LIMIT)
+#   define NR_IRQS 			(NR_VECTORS + 4*CPU_VECTOR_LIMIT) + NR_DYNAMIC_IRQS
 #  else
-#   define NR_IRQS			(NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#   define NR_IRQS			(NR_VECTORS + IO_APIC_VECTOR_LIMIT) + NR_DYNAMIC_IRQS
 #  endif
 # endif
 #else /* !CONFIG_X86_IO_APIC: */
-# define NR_IRQS			NR_IRQS_LEGACY
+# define NR_IRQS			NR_IRQS_LEGACY + NR_DYNAMIC_IRQS
 #endif
 
 #endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b50..e15fca1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -55,4 +55,13 @@
 }
 #endif
 
+#ifdef CONFIG_MICROCODE_XEN
+extern struct microcode_ops * __init init_xen_microcode(void);
+#else
+static inline struct microcode_ops * __init init_xen_microcode(void)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee..67eaa91 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -13,6 +13,9 @@
 	int size;
 	struct mutex lock;
 	void *vdso;
+#ifdef CONFIG_XEN
+	int has_foreign_mappings;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index efb3899..e571db4 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -330,11 +330,18 @@
 {
 	PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
 }
+
 static inline void set_iopl_mask(unsigned mask)
 {
 	PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
 }
 
+static inline void set_io_bitmap(struct thread_struct *thread,
+				 unsigned long bytes_updated)
+{
+	PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated);
+}
+
 /* The paravirtualized I/O functions */
 static inline void slow_down_io(void)
 {
@@ -770,15 +777,28 @@
 #define PV_RESTORE_REGS "popl %edx; popl %ecx;"
 
 /* save and restore all caller-save registers, except return value */
-#define PV_SAVE_ALL_CALLER_REGS		"pushl %ecx;"
-#define PV_RESTORE_ALL_CALLER_REGS	"popl  %ecx;"
+#define __PV_SAVE_ALL_CALLER_REGS	"pushl %ecx;"
+#define __PV_RESTORE_ALL_CALLER_REGS	"popl  %ecx;"
+
+#ifdef CONFIG_FRAME_POINTER
+#define PV_SAVE_ALL_CALLER_REGS			\
+	"push %ebp;"				\
+	"mov %esp, %ebp;"			\
+	__PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS		\
+	__PV_RESTORE_ALL_CALLER_REGS		\
+	"leave;"
+#else
+#define PV_SAVE_ALL_CALLER_REGS		__PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS	__PV_RESTORE_ALL_CALLER_REGS
+#endif
 
 #define PV_FLAGS_ARG "0"
 #define PV_EXTRA_CLOBBERS
 #define PV_VEXTRA_CLOBBERS
 #else
 /* save and restore all caller-save registers, except return value */
-#define PV_SAVE_ALL_CALLER_REGS						\
+#define __PV_SAVE_ALL_CALLER_REGS					\
 	"push %rcx;"							\
 	"push %rdx;"							\
 	"push %rsi;"							\
@@ -787,7 +807,7 @@
 	"push %r9;"							\
 	"push %r10;"							\
 	"push %r11;"
-#define PV_RESTORE_ALL_CALLER_REGS					\
+#define __PV_RESTORE_ALL_CALLER_REGS					\
 	"pop %r11;"							\
 	"pop %r10;"							\
 	"pop %r9;"							\
@@ -797,6 +817,19 @@
 	"pop %rdx;"							\
 	"pop %rcx;"
 
+#ifdef CONFIG_FRAME_POINTER
+#define PV_SAVE_ALL_CALLER_REGS			\
+	"push %rbp;"				\
+	"mov %rsp, %rbp;"			\
+	__PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS		\
+	__PV_RESTORE_ALL_CALLER_REGS		\
+	"leaveq;"
+#else
+#define PV_SAVE_ALL_CALLER_REGS		__PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS	__PV_RESTORE_ALL_CALLER_REGS
+#endif
+
 /* We save some registers, but all of them, that's too much. We clobber all
  * caller saved registers but the argument parameter */
 #define PV_SAVE_REGS "pushq %%rdi;"
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9357473..3202dcc 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -135,6 +135,8 @@
 	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
 
 	void (*set_iopl_mask)(unsigned mask);
+	void (*set_io_bitmap)(struct thread_struct *thread,
+			      unsigned long bytes_updated);
 
 	void (*wbinvd)(void);
 	void (*io_delay)(void);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c20..faa0af17 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -21,6 +21,7 @@
 extern int pci_routeirq;
 extern int noioapicquirk;
 extern int noioapicreroute;
+extern int pci_scan_all_fns;
 
 /* scan a bus after allocating a pci_sysdata for it */
 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@ -49,6 +50,11 @@
 #define pcibios_assign_all_busses()	0
 #endif
 
+static inline int pcibios_scan_all_fns(struct pci_bus *bus, int devfn)
+{
+	return pci_scan_all_fns;
+}
+
 extern unsigned long pci_mem_start;
 #define PCIBIOS_MIN_IO		0x1000
 #define PCIBIOS_MIN_MEM		(pci_mem_start)
@@ -87,6 +93,7 @@
 
 /* MSI arch hook */
 #define arch_setup_msi_irqs arch_setup_msi_irqs
+#define arch_teardown_msi_irqs arch_teardown_msi_irqs
 
 #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
 
@@ -128,6 +135,7 @@
 #include <asm-generic/pci-dma-compat.h>
 
 /* generic pci stuff */
+#define HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
 #include <asm-generic/pci.h>
 #define PCIBIOS_MAX_MEM_32 0xffffffff
 
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988..30cbf49 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -45,6 +45,7 @@
 extern unsigned int pcibios_max_latency;
 
 void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
 
 /* pci-pc.c */
 
@@ -106,6 +107,7 @@
 extern void pci_direct_init(int type);
 extern void pci_pcbios_init(void);
 extern int pci_olpc_init(void);
+extern int pci_xen_init(void);
 extern void __init dmi_check_pciprobe(void);
 extern void __init dmi_check_skip_isa_align(void);
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd36..430e3cc 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,7 +15,6 @@
 	 : (prot))
 
 #ifndef __ASSEMBLY__
-
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
@@ -26,6 +25,8 @@
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -76,6 +77,11 @@
 
 #endif	/* CONFIG_PARAVIRT */
 
+static inline pteval_t pte_flags(pte_t pte)
+{
+	return pte_val(pte) & PTE_FLAGS_MASK;
+}
+
 /*
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
@@ -397,6 +403,9 @@
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)	\
 	remap_pfn_range(vma, vaddr, pfn, size, prot)
 
+#define arch_vm_get_page_prot arch_vm_get_page_prot
+extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags);
+
 #if PAGETABLE_LEVELS > 2
 static inline int pud_none(pud_t pud)
 {
@@ -616,6 +625,9 @@
        memcpy(dst, src, count * sizeof(pgd_t));
 }
 
+int create_lookup_pte_addr(struct mm_struct *mm,
+                           unsigned long address,
+                           uint64_t *ptep);
 
 #include <asm-generic/pgtable.h>
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a301..4e46931 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -160,7 +160,7 @@
 #define pgtable_cache_init()   do { } while (0)
 #define check_pgt_cache()      do { } while (0)
 
-#define PAGE_AGP    PAGE_KERNEL_NOCACHE
+#define PAGE_AGP    PAGE_KERNEL_IO_NOCACHE
 #define HAVE_PAGE_AGP 1
 
 /* fs/proc/kcore.c */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a76..a81b0ed 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -265,11 +265,6 @@
 	return pte.pte;
 }
 
-static inline pteval_t pte_flags(pte_t pte)
-{
-	return native_pte_val(pte) & PTE_FLAGS_MASK;
-}
-
 #define pgprot_val(x)	((x).pgprot)
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fa04dea..3cf805d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -551,6 +551,9 @@
 #endif
 }
 
+extern void native_set_io_bitmap(struct thread_struct *thread,
+				 unsigned long updated_bytes);
+
 static inline void
 native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
 {
@@ -592,6 +595,7 @@
 }
 
 #define set_iopl_mask native_set_iopl_mask
+#define set_io_bitmap native_set_io_bitmap
 #endif /* CONFIG_PARAVIRT */
 
 /*
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 18e496c..154a5f1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -95,6 +95,11 @@
 			: : "i" (sz));					\
 	}
 
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries)		\
+	type *name;					\
+	RESERVE_BRK(name, sizeof(type) * entries)
+
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20..8085277 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,15 +3,16 @@
 
 #include <linux/swiotlb.h>
 
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
-extern void pci_swiotlb_init(void);
+extern int __init pci_swiotlb_detect(void);
+extern void __init pci_swiotlb_init(void);
 #else
 #define swiotlb 0
+static inline int pci_swiotlb_detect(void)
+{
+	return 0;
+}
 static inline void pci_swiotlb_init(void)
 {
 }
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 1bb6e39..ef0fa4d 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -33,11 +33,11 @@
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int);
+
 /* X86_32 only */
 #ifdef CONFIG_X86_32
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
-
 /* kernel/process_32.c */
 int sys_clone(struct pt_regs *);
 int sys_execve(struct pt_regs *);
@@ -68,8 +68,6 @@
 #else /* CONFIG_X86_32 */
 
 /* X86_64 only */
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
 
 /* kernel/process_64.c */
 asmlinkage long sys_clone(unsigned long, unsigned long,
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba0..e4fc8ea 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -89,6 +89,10 @@
 
 #ifndef CONFIG_SMP
 
+static inline void __init init_smp_flush(void)
+{
+}
+
 #define flush_tlb() __flush_tlb()
 #define flush_tlb_all() __flush_tlb_all()
 #define local_flush_tlb() __flush_tlb()
@@ -129,6 +133,8 @@
 
 #define local_flush_tlb() __flush_tlb()
 
+extern void init_smp_flush(void);
+
 extern void flush_tlb_all(void);
 extern void flush_tlb_current_task(void);
 extern void flush_tlb_mm(struct mm_struct *);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd..d8e7145 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -91,6 +91,14 @@
 };
 
 /**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init:			platform specific iommu setup
+ */
+struct x86_init_iommu {
+	int (*iommu_init)(void);
+};
+
+/**
  * struct x86_init_ops - functions for platform specific setup
  *
  */
@@ -101,6 +109,7 @@
 	struct x86_init_oem		oem;
 	struct x86_init_paging		paging;
 	struct x86_init_timers		timers;
+	struct x86_init_iommu		iommu;
 };
 
 /**
@@ -121,6 +130,7 @@
 	unsigned long (*calibrate_tsc)(void);
 	unsigned long (*get_wallclock)(void);
 	int (*set_wallclock)(unsigned long nowtime);
+	void (*iommu_shutdown)(void);
 };
 
 extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4..8bd4e63 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -45,6 +45,8 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
 #include <xen/interface/physdev.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>
 
 /*
  * The hypercall asms have to meet several constraints:
@@ -200,6 +202,23 @@
 	(type)__res;							\
 })
 
+static inline long
+privcmd_call(unsigned call,
+	     unsigned long a1, unsigned long a2,
+	     unsigned long a3, unsigned long a4,
+	     unsigned long a5)
+{
+	__HYPERCALL_DECLS;
+	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+
+	asm volatile("call *%[call]"
+		     : __HYPERCALL_5PARAM
+		     : [call] "a" (&hypercall_page[call])
+		     : __HYPERCALL_CLOBBER5);
+
+	return (long)__res;
+}
+
 static inline int
 HYPERVISOR_set_trap_table(struct trap_info *table)
 {
@@ -270,7 +289,7 @@
 static inline int
 HYPERVISOR_sched_op(int cmd, void *arg)
 {
-	return _hypercall2(int, sched_op_new, cmd, arg);
+	return _hypercall2(int, sched_op, cmd, arg);
 }
 
 static inline long
@@ -282,6 +301,20 @@
 }
 
 static inline int
+HYPERVISOR_mca(struct xen_mc *mc_op)
+{
+	mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+	return _hypercall1(int, mca, mc_op);
+}
+
+static inline int
+HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
+{
+	platform_op->interface_version = XENPF_INTERFACE_VERSION;
+	return _hypercall1(int, dom0_op, platform_op);
+}
+
+static inline int
 HYPERVISOR_set_debugreg(int reg, unsigned long value)
 {
 	return _hypercall2(int, set_debugreg, reg, value);
@@ -405,10 +438,17 @@
 #endif
 
 static inline int
-HYPERVISOR_suspend(unsigned long srec)
+HYPERVISOR_suspend(unsigned long start_info_mfn)
 {
-	return _hypercall3(int, sched_op, SCHEDOP_shutdown,
-			   SHUTDOWN_suspend, srec);
+	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+
+	/*
+	 * For a PV guest the tools require that the start_info mfn be
+	 * present in rdx/edx when the hypercall is made. Per the
+	 * hypercall calling convention this is the third hypercall
+	 * argument, which is start_info_mfn here.
+	 */
+	return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
 }
 
 static inline int
@@ -417,6 +457,12 @@
 	return _hypercall2(int, nmi_op, op, arg);
 }
 
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(int op, void *arg)
+{
+       return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
@@ -424,6 +470,14 @@
 	mcl->args[0] = set;
 }
 
+#if defined(CONFIG_X86_64)
+#define MULTI_UVMFLAGS_INDEX 2
+#define MULTI_UVMDOMID_INDEX 3
+#else
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+#endif
+
 static inline void
 MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
 			pte_t new_val, unsigned long flags)
@@ -432,12 +486,11 @@
 	mcl->args[0] = va;
 	if (sizeof(new_val) == sizeof(long)) {
 		mcl->args[1] = new_val.pte;
-		mcl->args[2] = flags;
 	} else {
 		mcl->args[1] = new_val.pte;
 		mcl->args[2] = new_val.pte >> 32;
-		mcl->args[3] = flags;
 	}
+	mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
 }
 
 static inline void
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90..396ff4c 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
 extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 
-enum xen_domain_type {
-	XEN_NATIVE,		/* running on bare hardware    */
-	XEN_PV_DOMAIN,		/* running in a PV domain      */
-	XEN_HVM_DOMAIN,		/* running in a Xen hvm domain */
-};
-
-#ifdef CONFIG_XEN
-extern enum xen_domain_type xen_domain_type;
-#else
-#define xen_domain_type		XEN_NATIVE
-#endif
-
-#define xen_domain()		(xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain()		(xen_domain() &&			\
-				 xen_domain_type == XEN_PV_DOMAIN)
-#define xen_hvm_domain()	(xen_domain() &&			\
-				 xen_domain_type == XEN_HVM_DOMAIN)
-
-#ifdef CONFIG_XEN_DOM0
-#include <xen/interface/xen.h>
-
-#define xen_initial_domain()	(xen_pv_domain() && \
-				 xen_start_info->flags & SIF_INITDOMAIN)
-#else  /* !CONFIG_XEN_DOM0 */
-#define xen_initial_domain()	(0)
-#endif	/* CONFIG_XEN_DOM0 */
-
 #endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1..9539998 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@
 #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
 #endif
 
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
 
 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
@@ -97,6 +97,8 @@
 #define TI_SET_IF(_ti, _if)	((_ti)->flags |= ((!!(_if))<<2))
 
 #ifndef __ASSEMBLY__
+#include <linux/types.h>
+
 struct trap_info {
     uint8_t       vector;  /* exception vector                              */
     uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e00..8413688 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
 /* And the trap vector is... */
 #define TRAP_INSTR "int $0x82"
 
+#define __MACH2PHYS_VIRT_START 0xF5800000
+#define __MACH2PHYS_VIRT_END   0xF6800000
+
+#define __MACH2PHYS_SHIFT      2
+
 /*
  * Virtual addresses beyond this are not modifiable by guest OSes. The
  * machine->physical mapping table starts at this address, read-only.
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d266..839a481 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
 #define __HYPERVISOR_VIRT_END   0xFFFF880000000000
 #define __MACH2PHYS_VIRT_START  0xFFFF800000000000
 #define __MACH2PHYS_VIRT_END    0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT       3
 
 /*
  * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h
new file mode 100644
index 0000000..75df312
--- /dev/null
+++ b/arch/x86/include/asm/xen/iommu.h
@@ -0,0 +1,12 @@
+#ifndef ASM_X86__XEN_IOMMU_H
+#define ASM_X86__XEN_IOMMU_H
+#ifdef CONFIG_PCI_XEN
+extern void xen_iommu_init(void);
+#else
+static inline void xen_iommu_init(void)
+{
+}
+#endif
+
+#endif
+
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a4..05c5cf5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/pfn.h>
+#include <linux/mm.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -28,23 +29,32 @@
 
 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
 #define INVALID_P2M_ENTRY	(~0UL)
-#define FOREIGN_FRAME_BIT	(1UL<<31)
+#define FOREIGN_FRAME_BIT	(1UL << (sizeof(unsigned long) * 8 - 1))
 #define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
 
 /* Maximum amount of memory we can handle in a domain in pages */
 #define MAX_DOMAIN_PAGES						\
     ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
 
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int   machine_to_phys_order;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
+	unsigned long mfn;
+
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return pfn;
 
-	return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+	mfn = get_phys_to_machine(pfn);
+
+	if (mfn != INVALID_P2M_ENTRY)
+		mfn &= ~FOREIGN_FRAME_BIT;
+
+	return mfn;
 }
 
 static inline int phys_to_machine_mapping_valid(unsigned long pfn)
@@ -62,10 +72,8 @@
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;
 
-#if 0
 	if (unlikely((mfn >> machine_to_phys_order) != 0))
-		return max_mapnr;
-#endif
+		return ~0;
 
 	pfn = 0;
 	/*
@@ -112,13 +120,9 @@
  */
 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 {
-	extern unsigned long max_mapnr;
 	unsigned long pfn = mfn_to_pfn(mfn);
-	if ((pfn < max_mapnr)
-	    && !xen_feature(XENFEAT_auto_translated_physmap)
-	    && (get_phys_to_machine(pfn) != mfn))
-		return max_mapnr; /* force !pfn_valid() */
-	/* XXX fixme; not true with sparsemem */
+	if (get_phys_to_machine(pfn) != mfn)
+		return -1; /* force !pfn_valid() */
 	return pfn;
 }
 
@@ -163,6 +167,7 @@
 
 #define pgd_val_ma(x)	((x).pgd)
 
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
 
 xmaddr_t arbitrary_virt_to_machine(void *address);
 unsigned long arbitrary_virt_to_mfn(void *vaddr);
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 0000000..6683196
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,104 @@
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+int xen_register_pirq(u32 gsi, int triggering);
+int xen_register_gsi(u32 gsi, int triggering, int polarity);
+int xen_create_msi_irq(struct pci_dev *dev,
+			struct msi_desc *msidesc,
+			int type);
+void xen_pci_teardown_msi_dev(struct pci_dev *dev);
+void xen_pci_teardown_msi_irq(int irq);
+int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+
+/* drivers/pci/xen-pcifront.c sets this structure to
+ * its own functions.
+ */
+struct xen_pci_frontend_ops {
+	int (*enable_msi)(struct pci_dev *dev, int **vectors);
+	void (*disable_msi)(struct pci_dev *dev);
+	int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+	void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+	
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+					      int **vectors)
+{
+	if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+		return xen_pci_frontend->enable_msi(dev, vectors);
+	return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+	if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+			xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+					       int **vectors, int nvec)
+{
+	if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+		return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+	return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+	if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+			xen_pci_frontend->disable_msix(dev);
+}
+#else
+static inline int xen_create_msi_irq(struct pci_dev *dev,
+				struct msi_desc *msidesc,
+				int type)
+{
+	return -1;
+}
+static inline void xen_pci_teardown_msi_dev(struct pci_dev *dev) { }
+static inline void xen_pci_teardown_msi_irq(int irq) { }
+static inline int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_PCI_XEN */
+
+#endif /* CONFIG_PCI_MSI */
+
+#ifdef CONFIG_XEN_DOM0_PCI
+int xen_register_gsi(u32 gsi, int triggering, int polarity);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
+
+#else
+static inline int xen_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	return -1;
+}
+
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+	return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+						   uint16_t domain)
+{
+ 	return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ 	return -1;
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI)
+int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+#else
+static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	return -1;
+}
+#endif
+
+#endif	/* _ASM_X86_XEN_PCI_H */
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 0000000..e4fe299
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_SWIOTLB_XEN_H 
+#define _ASM_X86_SWIOTLB_XEN_H
+ 
+#ifdef CONFIG_PCI_XEN
+extern int xen_swiotlb;
+extern int __init pci_xen_swiotlb_detect(void);
+extern void __init pci_xen_swiotlb_init(void);
+#else
+#define xen_swiotlb 0
+static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
+static inline void __init pci_xen_swiotlb_init(void) { }
+#endif
+ 
+#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d1911ab..cfe00bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -113,6 +113,7 @@
 microcode-y				:= microcode_core.o
 microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o
 microcode-$(CONFIG_MICROCODE_AMD)	+= microcode_amd.o
+microcode-$(CONFIG_MICROCODE_XEN)	+= microcode_xen.o
 obj-$(CONFIG_MICROCODE)			+= microcode.o
 
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8ba08c7..90be997 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -42,6 +42,10 @@
 #include <asm/mpspec.h>
 #include <asm/smp.h>
 
+#include <asm/xen/pci.h>
+
+#include <asm/xen/hypervisor.h>
+
 static int __initdata acpi_force = 0;
 u32 acpi_rsdt_forced;
 int acpi_disabled;
@@ -150,6 +154,10 @@
 {
 	unsigned int ver = 0;
 
+	/* We don't want to register lapics when in Xen dom0 */
+	if (xen_initial_domain())
+		return;
+
 	if (!enabled) {
 		++disabled_cpus;
 		return;
@@ -467,9 +475,13 @@
  */
 int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
-	unsigned int irq;
+	int irq;
 	unsigned int plat_gsi = gsi;
 
+	irq = xen_register_gsi(gsi, trigger, polarity);
+	if (irq >= 0)
+		return irq;
+
 #ifdef CONFIG_PCI
 	/*
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
@@ -746,6 +758,10 @@
 
 static void __init acpi_register_lapic_address(unsigned long address)
 {
+	/* Xen dom0 doesn't have usable lapics */
+	if (xen_initial_domain())
+		return;
+
 	mp_lapic_addr = address;
 
 	set_fixmap_nocache(FIX_APIC_BASE, address);
@@ -866,6 +882,9 @@
 			max_gsi = gsi;
 	}
 
+	if (xen_initial_domain())
+		max_gsi += 255; /* Plus maximum entries of an ioapic. */
+
 	return max_gsi + 1;
 }
 
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index d85d1b2..8aabedd 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -12,6 +12,8 @@
 #include <acpi/processor.h>
 #include <asm/acpi.h>
 
+#include <xen/xen.h>
+
 static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
 {
 	struct acpi_object_list *obj_list;
@@ -59,7 +61,7 @@
 	/*
 	 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
 	 */
-	if (!cpu_has(c, X86_FEATURE_MWAIT))
+	if (!cpu_has(c, X86_FEATURE_MWAIT) && !xen_initial_domain())
 		buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
 
 	obj->type = ACPI_TYPE_BUFFER;
@@ -88,6 +90,19 @@
 
 EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
 
+/* Initialize _PDC data based on the CPU vendor */
+void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+
+	pr->pdc = NULL;
+	if (c->x86_vendor == X86_VENDOR_INTEL)
+		init_intel_pdc(pr, c);
+
+	return;
+}
+EXPORT_SYMBOL(xen_arch_acpi_processor_init_pdc);
+
 void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
 {
 	if (pr->pdc) {
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638..9eff23c 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -12,6 +12,8 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 
+#include <xen/acpi.h>
+
 #include "realmode/wakeup.h"
 #include "sleep.h"
 
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 3a44b75..e305008 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -928,7 +928,7 @@
 	}
 
 	if (unlikely(address == -1))
-		address = bad_dma_address;
+		address = DMA_ERROR_CODE;
 
 	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
 
@@ -1545,7 +1545,7 @@
 
 	pte  = dma_ops_get_pte(dom, address);
 	if (!pte)
-		return bad_dma_address;
+		return DMA_ERROR_CODE;
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
@@ -1626,7 +1626,7 @@
 retry:
 	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
 					  dma_mask);
-	if (unlikely(address == bad_dma_address)) {
+	if (unlikely(address == DMA_ERROR_CODE)) {
 		/*
 		 * setting next_address here will let the address
 		 * allocator only scan the new allocated range in the
@@ -1647,7 +1647,7 @@
 	start = address;
 	for (i = 0; i < pages; ++i) {
 		ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
-		if (ret == bad_dma_address)
+		if (ret == DMA_ERROR_CODE)
 			goto out_unmap;
 
 		paddr += PAGE_SIZE;
@@ -1675,7 +1675,7 @@
 
 	dma_ops_free_addresses(dma_dom, address, pages);
 
-	return bad_dma_address;
+	return DMA_ERROR_CODE;
 }
 
 /*
@@ -1692,7 +1692,7 @@
 	dma_addr_t i, start;
 	unsigned int pages;
 
-	if ((dma_addr == bad_dma_address) ||
+	if ((dma_addr == DMA_ERROR_CODE) ||
 	    (dma_addr + size > dma_dom->aperture_size))
 		return;
 
@@ -1735,7 +1735,7 @@
 	INC_STATS_COUNTER(cnt_map_single);
 
 	if (!check_device(dev))
-		return bad_dma_address;
+		return DMA_ERROR_CODE;
 
 	dma_mask = *dev->dma_mask;
 
@@ -1746,12 +1746,12 @@
 		return (dma_addr_t)paddr;
 
 	if (!dma_ops_domain(domain))
-		return bad_dma_address;
+		return DMA_ERROR_CODE;
 
 	spin_lock_irqsave(&domain->lock, flags);
 	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
 			    dma_mask);
-	if (addr == bad_dma_address)
+	if (addr == DMA_ERROR_CODE)
 		goto out;
 
 	iommu_completion_wait(iommu);
@@ -1960,7 +1960,7 @@
 	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
 				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
-	if (*dma_addr == bad_dma_address) {
+	if (*dma_addr == DMA_ERROR_CODE) {
 		spin_unlock_irqrestore(&domain->lock, flags);
 		goto out_free;
 	}
@@ -2122,8 +2122,7 @@
 		prealloc_protection_domains();
 
 	iommu_detected = 1;
-	force_iommu = 1;
-	bad_dma_address = 0;
+	swiotlb = 0;
 #ifdef CONFIG_GART_IOMMU
 	gart_iommu_aperture_disabled = 1;
 	gart_iommu_aperture = 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index fc5d470..838e1d7 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -29,6 +29,7 @@
 #include <asm/amd_iommu.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
+#include <asm/x86_init.h>
 
 /*
  * definitions for the ACPI scanning code
@@ -1206,19 +1207,10 @@
  * functions. Finally it prints some information about AMD IOMMUs and
  * the driver state and enables the hardware.
  */
-int __init amd_iommu_init(void)
+static int __init amd_iommu_init(void)
 {
 	int i, ret = 0;
 
-
-	if (no_iommu) {
-		printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
-		return 0;
-	}
-
-	if (!amd_iommu_detected)
-		return -ENODEV;
-
 	/*
 	 * First parse ACPI tables to find the largest Bus/Dev/Func
 	 * we need to handle. Upon this information the shared data
@@ -1333,6 +1325,7 @@
 	else
 		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
 
+	x86_platform.iommu_shutdown = disable_iommus;
 out:
 	return ret;
 
@@ -1361,11 +1354,6 @@
 	goto out;
 }
 
-void amd_iommu_shutdown(void)
-{
-	disable_iommus();
-}
-
 /****************************************************************************
  *
  * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1380,16 +1368,13 @@
 
 void __init amd_iommu_detect(void)
 {
-	if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
+	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
-#ifdef CONFIG_GART_IOMMU
-		gart_iommu_aperture_disabled = 1;
-		gart_iommu_aperture = 0;
-#endif
+		x86_init.iommu.iommu_init = amd_iommu_init;
 	}
 }
 
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 082089e..8d34362 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
 #include <asm/k8.h>
+#include <asm/x86_init.h>
 
 int gart_iommu_aperture;
 int gart_iommu_aperture_disabled __initdata;
@@ -401,6 +402,7 @@
 
 			iommu_detected = 1;
 			gart_iommu_aperture = 1;
+			x86_init.iommu.iommu_init = gart_iommu_init;
 
 			ctl = read_pci_config(bus, slot, 3,
 					      AMD64_GARTAPERTURECTL);
@@ -469,7 +471,7 @@
 
 	if (aper_alloc) {
 		/* Got the aperture from the AGP bridge */
-	} else if (swiotlb && !valid_agp) {
+	} else if (!valid_agp) {
 		/* Do nothing */
 	} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
 		   force_iommu ||
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8928d97..4848d5d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -63,7 +63,12 @@
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_irq.h>
 
+#include <asm/xen/hypervisor.h>
 #include <asm/apic.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/pci.h>
+
+#include <asm/xen/pci.h>
 
 #define __apicdebuginit(type) static type __init
 #define for_each_irq_pin(entry, head) \
@@ -395,14 +400,18 @@
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	struct io_apic __iomem *io_apic;
+
+	io_apic = io_apic_base(apic);
 	writel(reg, &io_apic->index);
 	return readl(&io_apic->data);
 }
 
 static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	struct io_apic __iomem *io_apic;
+
+	io_apic = io_apic_base(apic);
 	writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
 }
@@ -415,7 +424,9 @@
  */
 static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	struct io_apic __iomem *io_apic;
+
+	io_apic = io_apic_base(apic);
 
 	if (sis_apic_bug)
 		writel(reg, &io_apic->index);
@@ -3494,6 +3505,9 @@
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
+	if (xen_pv_domain())
+		return xen_pci_setup_msi_irqs(dev, nvec, type);
+
 	node = dev_to_node(&dev->dev);
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
@@ -3543,7 +3557,29 @@
 
 void arch_teardown_msi_irq(unsigned int irq)
 {
-	destroy_irq(irq);
+	if (xen_domain())
+		xen_pci_teardown_msi_irq(irq);
+	else
+		destroy_irq(irq);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+
+	/* If we are a non-privileged PV domain, we have to
+	 * call xen_teardown_msi_dev first. */
+	if (xen_domain())
+		xen_pci_teardown_msi_dev(dev);
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		int i, nvec;
+		if (entry->irq == 0)
+			continue;
+		nvec = 1 << entry->msi_attrib.multiple;
+		for (i = 0; i < nvec; i++)
+			arch_teardown_msi_irq(entry->irq + i);
+	}
 }
 
 #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
@@ -3860,7 +3896,14 @@
 	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
 }
 
+int get_nr_irqs_gsi(void)
+{
+	return nr_irqs_gsi;
+}
+
 #ifdef CONFIG_SPARSE_IRQ
+int nr_dynamic_irqs;
+
 int __init arch_probe_nr_irqs(void)
 {
 	int nr;
@@ -3878,6 +3921,8 @@
 	if (nr < nr_irqs)
 		nr_irqs = nr;
 
+	nr_irqs += nr_dynamic_irqs;
+
 	return 0;
 }
 #endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6..d1e6e60 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -558,6 +558,9 @@
 {
 	int i;
 
+	if (!cpu_has_apic)
+		return;
+
 	cpumask_copy(&backtrace_mask, cpu_online_mask);
 
 	printk(KERN_INFO "sending NMI to all CPUs:\n");
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b5..404e458 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,4 @@
 obj-y		:= main.o if.o generic.o state.o cleanup.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+obj-$(CONFIG_XEN_DOM0) += xen.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af141..378f8dc 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,6 +108,11 @@
 	return 0;
 }
 
+static int amd_num_var_ranges(void)
+{
+	return 2;
+}
+
 static struct mtrr_ops amd_mtrr_ops = {
 	.vendor            = X86_VENDOR_AMD,
 	.set               = amd_set_mtrr,
@@ -115,6 +120,7 @@
 	.get_free_region   = generic_get_free_region,
 	.validate_add_page = amd_validate_add_page,
 	.have_wrcomb       = positive_have_wrcomb,
+	.num_var_ranges	   = amd_num_var_ranges,
 };
 
 int __init amd_init_mtrr(void)
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14..7c686a0 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,6 +110,11 @@
 	return 0;
 }
 
+static int centaur_num_var_ranges(void)
+{
+	return 8;
+}
+
 static struct mtrr_ops centaur_mtrr_ops = {
 	.vendor            = X86_VENDOR_CENTAUR,
 	.set               = centaur_set_mcr,
@@ -117,6 +122,7 @@
 	.get_free_region   = centaur_get_free_region,
 	.validate_add_page = centaur_validate_add_page,
 	.have_wrcomb       = positive_have_wrcomb,
+	.num_var_ranges	   = centaur_num_var_ranges,
 };
 
 int __init centaur_init_mtrr(void)
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982..fd6edcc 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,6 +265,11 @@
 	post_set();
 }
 
+static int cyrix_num_var_ranges(void)
+{
+	return 8;
+}
+
 static struct mtrr_ops cyrix_mtrr_ops = {
 	.vendor            = X86_VENDOR_CYRIX,
 	.set_all	   = cyrix_set_all,
@@ -273,6 +278,7 @@
 	.get_free_region   = cyrix_get_free_region,
 	.validate_add_page = generic_validate_add_page,
 	.have_wrcomb       = positive_have_wrcomb,
+	.num_var_ranges	   = cyrix_num_var_ranges,
 };
 
 int __init cyrix_init_mtrr(void)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5..42f30cd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -749,8 +749,16 @@
 	return 1;
 }
 
-/*
- * Generic structure...
+static int generic_num_var_ranges(void)
+{
+	unsigned long config = 0, dummy;
+
+	rdmsr(MSR_MTRRcap, config, dummy);
+
+	return config & 0xff;
+}
+
+/* generic structure...
  */
 struct mtrr_ops generic_mtrr_ops = {
 	.use_intel_if		= 1,
@@ -760,4 +768,5 @@
 	.set			= generic_set_mtrr,
 	.validate_add_page	= generic_validate_add_page,
 	.have_wrcomb		= generic_have_wrcomb,
+	.num_var_ranges	   	= generic_num_var_ranges,
 };
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index fd60f09..aa21994 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -110,21 +110,6 @@
 	return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
 }
 
-/*  This function returns the number of variable MTRRs  */
-static void __init set_num_var_ranges(void)
-{
-	unsigned long config = 0, dummy;
-
-	if (use_intel())
-		rdmsr(MSR_MTRRcap, config, dummy);
-	else if (is_cpu(AMD))
-		config = 2;
-	else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
-		config = 8;
-
-	num_var_ranges = config & 0xff;
-}
-
 static void __init init_table(void)
 {
 	int i, max;
@@ -721,8 +706,11 @@
 		}
 	}
 
+	/* Let Xen code override the above if it wants */
+	xen_init_mtrr();
+
 	if (mtrr_if) {
-		set_num_var_ranges();
+		num_var_ranges = mtrr_if->num_var_ranges();
 		init_table();
 		if (use_intel()) {
 			get_mtrr_state();
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee..98569c3 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,6 +5,8 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 
+#include <asm/mtrr.h>
+
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
@@ -25,6 +27,8 @@
 	int	(*validate_add_page)(unsigned long base, unsigned long size,
 				     unsigned int type);
 	int	(*have_wrcomb)(void);
+
+	int	(*num_var_ranges)(void);
 };
 
 extern int generic_get_free_region(unsigned long base, unsigned long size,
@@ -73,6 +77,13 @@
 int amd_init_mtrr(void);
 int cyrix_init_mtrr(void);
 int centaur_init_mtrr(void);
+#ifdef CONFIG_XEN_DOM0
+void xen_init_mtrr(void);
+#else
+static inline void xen_init_mtrr(void)
+{
+}
+#endif
 
 extern int changed_by_mtrr_cleanup;
 extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c
new file mode 100644
index 0000000..852018b
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/xen.c
@@ -0,0 +1,109 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/pat.h>
+
+#include "mtrr.h"
+
+#include <xen/xen.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+static void xen_set_mtrr(unsigned int reg, unsigned long base,
+			 unsigned long size, mtrr_type type)
+{
+	struct xen_platform_op op;
+	int error;
+
+	/* mtrr_ops->set() is called once per CPU,
+	 * but Xen's ops apply to all CPUs.
+	 */
+	if (smp_processor_id())
+		return;
+
+	if (size == 0) {
+		op.cmd = XENPF_del_memtype;
+		op.u.del_memtype.handle = 0;
+		op.u.del_memtype.reg    = reg;
+	} else {
+		op.cmd = XENPF_add_memtype;
+		op.u.add_memtype.mfn     = base;
+		op.u.add_memtype.nr_mfns = size;
+		op.u.add_memtype.type    = type;
+	}
+
+	error = HYPERVISOR_dom0_op(&op);
+	BUG_ON(error != 0);
+}
+
+static void xen_get_mtrr(unsigned int reg, unsigned long *base,
+			 unsigned long *size, mtrr_type *type)
+{
+	struct xen_platform_op op;
+
+	op.cmd = XENPF_read_memtype;
+	op.u.read_memtype.reg = reg;
+	if (HYPERVISOR_dom0_op(&op) != 0) {
+		*base = 0;
+		*size = 0;
+		*type = 0;
+		return;
+	}
+
+	*size = op.u.read_memtype.nr_mfns;
+	*base = op.u.read_memtype.mfn;
+	*type = op.u.read_memtype.type;
+}
+
+static int __init xen_num_var_ranges(void)
+{
+	int ranges;
+	struct xen_platform_op op;
+
+	op.cmd = XENPF_read_memtype;
+
+	for (ranges = 0; ; ranges++) {
+		op.u.read_memtype.reg = ranges;
+		if (HYPERVISOR_dom0_op(&op) != 0)
+			break;
+	}
+	return ranges;
+}
+
+/*
+ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full
+ * working userland mtrr support.
+ */
+static struct mtrr_ops xen_mtrr_ops = {
+	.vendor            = X86_VENDOR_UNKNOWN,
+	.get_free_region   = generic_get_free_region,
+	.set               = xen_set_mtrr,
+	.get               = xen_get_mtrr,
+	.have_wrcomb       = positive_have_wrcomb,
+	.validate_add_page = generic_validate_add_page,
+	.use_intel_if	   = 0,
+	.num_var_ranges	   = xen_num_var_ranges,
+};
+
+void __init xen_init_mtrr(void)
+{
+	/* 
+	 * Check that we're running under Xen, and privileged enough
+	 * to play with MTRRs.
+	 */
+	if (!xen_initial_domain())
+		return;
+
+	/* 
+	 * Check that the CPU has an MTRR implementation we can
+	 * support.
+	 */
+	if (cpu_has_mtrr ||
+	    cpu_has_k6_mtrr ||
+	    cpu_has_cyrix_arr ||
+	    cpu_has_centaur_mcr) {
+		mtrr_if = &xen_mtrr_ops;
+		pat_init();
+	}
+}
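
As a rough illustration of the pattern the MTRR changes above rely on (not kernel code; every identifier below is invented for the sketch), each backend now reports its own variable-range count through a callback and the core simply calls through the ops table instead of special-casing vendors:

	/* Illustration only -- hypothetical names, standalone userspace C. */
	#include <stdio.h>

	struct demo_mtrr_ops {
		const char *name;
		int (*num_var_ranges)(void);
	};

	static int eight_ranges(void) { return 8; }	/* Cyrix/Centaur-style fixed count */
	static int two_ranges(void)   { return 2; }	/* AMD K6-style fixed count */

	static const struct demo_mtrr_ops demo_backends[] = {
		{ "eight-range backend", eight_ranges },
		{ "two-range backend",   two_ranges },
	};

	int main(void)
	{
		unsigned int i;

		/* The "core" asks each backend rather than switching on vendor. */
		for (i = 0; i < sizeof(demo_backends) / sizeof(demo_backends[0]); i++)
			printf("%s: %d variable ranges\n",
			       demo_backends[i].name,
			       demo_backends[i].num_var_ranges());
		return 0;
	}
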
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ff95824..ebd4c51 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,7 +28,6 @@
 #include <asm/reboot.h>
 #include <asm/virtext.h>
 
-
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct die_args *args)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a89739a..5476113 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -750,6 +750,36 @@
 	return i;
 }
 
+u64 __init early_res_next_free(u64 addr)
+{
+	int i;
+	u64 end = addr;
+	struct early_res *r;
+
+	for (i = 0; i < MAX_EARLY_RES; i++) {
+		r = &early_res[i];
+		if (addr >= r->start && addr < r->end) {
+			end = r->end;
+			break;
+		}
+	}
+	return end;
+}
+
+u64 __init early_res_next_reserved(u64 addr, u64 max)
+{
+	int i;
+	struct early_res *r;
+	u64 next_res = max;
+
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+		if ((r->start >= addr) && (r->start < next_res))
+			next_res = r->start;
+	}
+	return next_res;
+}
+
 /*
  * Drop the i-th range from the early reservation map,
  * by copying any higher ranges down one over it, and
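
The two helpers added above (early_res_next_free() and early_res_next_reserved()) are consumed later by the highmem registration loop, which walks a pfn range as alternating reserved and free chunks. A minimal sketch of that walk, with an invented reservation table and hypothetical names (illustration only, not kernel code):

	#include <stdio.h>

	struct demo_res { unsigned long start, end; };

	static const struct demo_res demo_reserved[] = {
		{ 0x2000, 0x3000 },
		{ 0x5000, 0x6000 },
	};
	#define NDEMO (sizeof(demo_reserved) / sizeof(demo_reserved[0]))

	/* If addr falls inside a reserved region, return that region's end. */
	static unsigned long demo_next_free(unsigned long addr)
	{
		unsigned int i;

		for (i = 0; i < NDEMO; i++)
			if (addr >= demo_reserved[i].start && addr < demo_reserved[i].end)
				return demo_reserved[i].end;
		return addr;
	}

	/* Return the start of the first reserved region at or after addr. */
	static unsigned long demo_next_reserved(unsigned long addr, unsigned long max)
	{
		unsigned long next = max;
		unsigned int i;

		for (i = 0; i < NDEMO; i++)
			if (demo_reserved[i].start >= addr && demo_reserved[i].start < next)
				next = demo_reserved[i].start;
		return next;
	}

	int main(void)
	{
		unsigned long end = 0x1000, max = 0x8000;

		for (;;) {
			unsigned long start = demo_next_free(end);

			if (start > end)
				printf("reserved chunk: %#lx-%#lx\n", end, start);
			if (start >= max)
				break;
			end = demo_next_reserved(start, max);
			printf("free chunk:     %#lx-%#lx\n", start, end);
		}
		return 0;
	}
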
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d..7764118 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1088,6 +1088,9 @@
 .previous
 ENDPROC(xen_failsafe_callback)
 
+BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+		xen_evtchn_do_upcall)
+
 #endif	/* CONFIG_XEN */
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 34a56a9..cba6d5a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1364,6 +1364,9 @@
 	CFI_ENDPROC
 END(xen_failsafe_callback)
 
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+	xen_hvm_callback_vector xen_evtchn_do_upcall
+
 #endif /* CONFIG_XEN */
 
 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd7..6c5d59b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -76,6 +76,7 @@
 	/* Make NULL pointers segfault */
 	zap_identity_mappings();
 
+	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 	/* Cleanup the over mapped high alias */
 	cleanup_highmap();
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 987ef29..5d1026c 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -101,7 +101,7 @@
 }
 __setup("hpet=", hpet_setup);
 
-static int __init disable_hpet(char *str)
+int __init disable_hpet(char *str)
 {
 	boot_hpet_disable = 1;
 	return 1;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d30..919c1a8 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -30,13 +30,29 @@
 	}
 }
 
+void native_set_io_bitmap(struct thread_struct *t,
+			  unsigned long bytes_updated)
+{
+	struct tss_struct *tss;
+
+	if (!bytes_updated)
+		return;
+
+	tss = &__get_cpu_var(init_tss);
+
+	/* Update the TSS: */
+	if (t->io_bitmap_ptr)
+		memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+	else
+		memset(tss->io_bitmap, 0xff, bytes_updated);
+}
+
 /*
  * this changes the io permissions bitmap in the current task.
  */
 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 {
 	struct thread_struct *t = &current->thread;
-	struct tss_struct *tss;
 	unsigned int i, max_long, bytes, bytes_updated;
 
 	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
@@ -61,13 +77,13 @@
 	}
 
 	/*
-	 * do it in the per-thread copy and in the TSS ...
+	 * do it in the per-thread copy
 	 *
-	 * Disable preemption via get_cpu() - we must not switch away
+	 * Disable preemption - we must not switch away
 	 * because the ->io_bitmap_max value must match the bitmap
 	 * contents:
 	 */
-	tss = &per_cpu(init_tss, get_cpu());
+	preempt_disable();
 
 	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
 
@@ -85,10 +101,9 @@
 
 	t->io_bitmap_max = bytes;
 
-	/* Update the TSS: */
-	memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+	set_io_bitmap(t, bytes_updated);
 
-	put_cpu();
+	preempt_enable();
 
 	return 0;
 }
@@ -119,11 +134,10 @@
 	return 0;
 }
 
-#ifdef CONFIG_X86_32
-long sys_iopl(struct pt_regs *regs)
+asmlinkage long sys_iopl(unsigned int level)
 {
-	unsigned int level = regs->bx;
 	struct thread_struct *t = &current->thread;
+	struct pt_regs *regs = task_pt_regs(current);
 	int rc;
 
 	rc = do_iopl(level, regs);
@@ -135,9 +149,3 @@
 out:
 	return rc;
 }
-#else
-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
-{
-	return do_iopl(level, regs);
-}
-#endif
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60..fa5b061 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -109,6 +109,9 @@
 
 	mutex_init(&mm->context.lock);
 	mm->context.size = 0;
+#ifdef CONFIG_XEN
+	mm->context.has_foreign_mappings = 0;
+#endif
 	old_mm = current->mm;
 	if (old_mm && old_mm->context.size > 0) {
 		mutex_lock(&old_mm->context.lock);
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8..86ca771 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -81,6 +81,8 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
 #include <asm/microcode.h>
 #include <asm/processor.h>
 
@@ -503,7 +505,9 @@
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	int error;
 
-	if (c->x86_vendor == X86_VENDOR_INTEL)
+	if (xen_pv_domain())
+		microcode_ops = init_xen_microcode();
+	else if (c->x86_vendor == X86_VENDOR_INTEL)
 		microcode_ops = init_intel_microcode();
 	else if (c->x86_vendor == X86_VENDOR_AMD)
 		microcode_ops = init_amd_microcode();
diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c
new file mode 100644
index 0000000..16c742e
--- /dev/null
+++ b/arch/x86/kernel/microcode_xen.c
@@ -0,0 +1,201 @@
+/*
+ * Xen microcode update driver
+ *
+ * Xen does most of the work here.  We just pass the whole blob into
+ * Xen, and it will apply it to all CPUs as appropriate.  Xen will
+ * worry about how different CPU models are actually updated.
+ */
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/firmware.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+
+#include <asm/microcode.h>
+
+#include <xen/xen.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/xen.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+MODULE_DESCRIPTION("Xen microcode update driver");
+MODULE_LICENSE("GPL");
+
+struct xen_microcode {
+	size_t len;
+	char data[0];
+};
+
+static int xen_microcode_update(int cpu)
+{
+	int err;
+	struct xen_platform_op op;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct xen_microcode *uc = uci->mc;
+
+	if (uc == NULL || uc->len == 0) {
+		/*
+		 * We do all cpus at once, so we don't need to do
+		 * other cpus explicitly (besides, these vcpu numbers
+		 * have no relationship to underlying physical cpus).
+		 */
+		return 0;
+	}
+
+	op.cmd = XENPF_microcode_update;
+	set_xen_guest_handle(op.u.microcode.data, uc->data);
+	op.u.microcode.length = uc->len;
+
+	err = HYPERVISOR_dom0_op(&op);
+
+	if (err != 0)
+		printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err);
+
+	return err;
+}
+
+static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device)
+{
+	char name[30];
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	const struct firmware *firmware;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	enum ucode_state ret;
+	struct xen_microcode *uc;
+	size_t size;
+	int err;
+
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x",
+			 c->x86, c->x86_model, c->x86_mask);
+		break;
+
+	case X86_VENDOR_AMD:
+		snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin");
+		break;
+
+	default:
+		return UCODE_NFOUND;
+	}
+
+	err = request_firmware(&firmware, name, device);
+	if (err) {
+		pr_debug("microcode: data file %s load failed\n", name);
+		return UCODE_NFOUND;
+	}
+
+	/*
+	 * Only bother getting real firmware for cpu 0; the others get
+	 * dummy placeholders.
+	 */
+	if (cpu == 0)
+		size = firmware->size;
+	else
+		size = 0;
+
+	if (uci->mc != NULL) {
+		vfree(uci->mc);
+		uci->mc = NULL;
+	}
+
+	ret = UCODE_ERROR;
+	uc = vmalloc(sizeof(*uc) + size);
+	if (uc == NULL)
+		goto out;
+
+	ret = UCODE_OK;
+	uc->len = size;
+	memcpy(uc->data, firmware->data, uc->len);
+
+	uci->mc = uc;
+
+out:
+	release_firmware(firmware);
+
+	return ret;
+}
+
+static enum ucode_state xen_request_microcode_user(int cpu,
+						   const void __user *buf, size_t size)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct xen_microcode *uc;
+	enum ucode_state ret;
+	size_t unread;
+
+	if (cpu != 0) {
+		/* No real firmware for non-zero cpus; just store a
+		   placeholder */
+		size = 0;
+	}
+
+	if (uci->mc != NULL) {
+		vfree(uci->mc);
+		uci->mc = NULL;
+	}
+
+	ret = UCODE_ERROR;
+	uc = vmalloc(sizeof(*uc) + size);
+	if (uc == NULL)
+		goto out;
+
+	uc->len = size;
+
+	ret = UCODE_NFOUND;
+
+	/* XXX This sporadically returns uncopied bytes, so the load
+	   fails with UCODE_NFOUND.  As far as I can see, the usermode
+	   code (microcode_ctl) isn't doing anything wrong... */
+	unread = copy_from_user(uc->data, buf, size);
+
+	if (unread != 0) {
+		printk(KERN_WARNING "failed to read %zu of %zu bytes at %p -> %p\n",
+		       unread, size, buf, uc->data);
+		goto out;
+	}
+
+	ret = UCODE_OK;
+
+out:
+	if (ret == UCODE_OK)
+		uci->mc = uc;
+	else
+		vfree(uc);
+
+	return ret;
+}
+
+static void xen_microcode_fini_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	vfree(uci->mc);
+	uci->mc = NULL;
+}
+
+static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig)
+{
+	sig->sig = 0;
+	sig->pf = 0;
+	sig->rev = 0;
+
+	return 0;
+}
+
+static struct microcode_ops microcode_xen_ops = {
+	.request_microcode_user		  = xen_request_microcode_user,
+	.request_microcode_fw             = xen_request_microcode_fw,
+	.collect_cpu_info                 = xen_collect_cpu_info,
+	.apply_microcode                  = xen_microcode_update,
+	.microcode_fini_cpu               = xen_microcode_fini_cpu,
+};
+
+struct microcode_ops * __init init_xen_microcode(void)
+{
+	if (!xen_initial_domain())
+		return NULL;
+	return &microcode_xen_ops;
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d..f7e115c 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -376,6 +376,7 @@
 	.swapgs = native_swapgs,
 
 	.set_iopl_mask = native_set_iopl_mask,
+	.set_io_bitmap = native_set_io_bitmap,
 	.io_delay = native_io_delay,
 
 	.start_context_switch = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 1a2d4b1..2f158a5 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -46,6 +46,7 @@
 #include <asm/dma.h>
 #include <asm/rio.h>
 #include <asm/bios_ebda.h>
+#include <asm/x86_init.h>
 
 #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
 int use_calgary __read_mostly = 1;
@@ -249,7 +250,7 @@
 			if (panic_on_overflow)
 				panic("Calgary: fix the allocator.\n");
 			else
-				return bad_dma_address;
+				return DMA_ERROR_CODE;
 		}
 	}
 
@@ -265,11 +266,11 @@
 			      void *vaddr, unsigned int npages, int direction)
 {
 	unsigned long entry;
-	dma_addr_t ret = bad_dma_address;
+	dma_addr_t ret = DMA_ERROR_CODE;
 
 	entry = iommu_range_alloc(dev, tbl, npages);
 
-	if (unlikely(entry == bad_dma_address))
+	if (unlikely(entry == DMA_ERROR_CODE))
 		goto error;
 
 	/* set the return dma address */
@@ -284,7 +285,7 @@
 error:
 	printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
 	       "iommu %p\n", npages, tbl);
-	return bad_dma_address;
+	return DMA_ERROR_CODE;
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -295,8 +296,8 @@
 	unsigned long flags;
 
 	/* were we called with bad_dma_address? */
-	badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
-	if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+	badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
+	if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
 		WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
 		       "address 0x%Lx\n", dma_addr);
 		return;
@@ -380,7 +381,7 @@
 		npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
 
 		entry = iommu_range_alloc(dev, tbl, npages);
-		if (entry == bad_dma_address) {
+		if (entry == DMA_ERROR_CODE) {
 			/* makes sure unmap knows to stop */
 			s->dma_length = 0;
 			goto error;
@@ -398,7 +399,7 @@
 error:
 	calgary_unmap_sg(dev, sg, nelems, dir, NULL);
 	for_each_sg(sg, s, nelems, i) {
-		sg->dma_address = bad_dma_address;
+		sg->dma_address = DMA_ERROR_CODE;
 		sg->dma_length = 0;
 	}
 	return 0;
@@ -453,7 +454,7 @@
 
 	/* set up tces to cover the allocated range */
 	mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
-	if (mapping == bad_dma_address)
+	if (mapping == DMA_ERROR_CODE)
 		goto free;
 	*dma_handle = mapping;
 	return ret;
@@ -734,7 +735,7 @@
 	struct iommu_table *tbl = pci_iommu(dev->bus);
 
 	/* reserve EMERGENCY_PAGES from bad_dma_address and up */
-	iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
+	iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
 
 	/* avoid the BIOS/VGA first 640KB-1MB region */
 	/* for CalIOC2 - avoid the entire first MB */
@@ -1349,6 +1350,23 @@
 	return;
 }
 
+static int __init calgary_iommu_init(void)
+{
+	int ret;
+
+	/* ok, we're trying to use Calgary - let's roll */
+	printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+	ret = calgary_init();
+	if (ret) {
+		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+		       "falling back to no_iommu\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
 void __init detect_calgary(void)
 {
 	int bus;
@@ -1362,7 +1380,7 @@
 	 * if the user specified iommu=off or iommu=soft or we found
 	 * another HW IOMMU already, bail out.
 	 */
-	if (swiotlb || no_iommu || iommu_detected)
+	if (no_iommu || iommu_detected)
 		return;
 
 	if (!use_calgary)
@@ -1447,9 +1465,7 @@
 		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
 		       specified_table_size);
 
-		/* swiotlb for devices that aren't behind the Calgary. */
-		if (max_pfn > MAX_DMA32_PFN)
-			swiotlb = 1;
+		x86_init.iommu.iommu_init = calgary_iommu_init;
 	}
 	return;
 
@@ -1462,35 +1478,6 @@
 	}
 }
 
-int __init calgary_iommu_init(void)
-{
-	int ret;
-
-	if (no_iommu || (swiotlb && !calgary_detected))
-		return -ENODEV;
-
-	if (!calgary_detected)
-		return -ENODEV;
-
-	/* ok, we're trying to use Calgary - let's roll */
-	printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
-	ret = calgary_init();
-	if (ret) {
-		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
-		       "falling back to no_iommu\n", ret);
-		return ret;
-	}
-
-	force_iommu = 1;
-	bad_dma_address = 0x0;
-	/* dma_ops is set to swiotlb or nommu */
-	if (!dma_ops)
-		dma_ops = &nommu_dma_ops;
-
-	return 0;
-}
-
 static int __init calgary_parse_options(char *p)
 {
 	unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6ac3931..3e57c58 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,12 @@
 #include <asm/gart.h>
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
+#include <asm/x86_init.h>
+#include <asm/xen/swiotlb-xen.h>
 
 static int forbid_dac __read_mostly;
 
-struct dma_map_ops *dma_ops;
+struct dma_map_ops *dma_ops = &nommu_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -42,9 +44,6 @@
  */
 int iommu_pass_through __read_mostly;
 
-dma_addr_t bad_dma_address __read_mostly = 0;
-EXPORT_SYMBOL(bad_dma_address);
-
 /* Dummy device used for NULL arguments (normally ISA). */
 struct device x86_dma_fallback_dev = {
 	.init_name = "fallback device",
@@ -126,18 +125,19 @@
 	/* free the range so iommu could get some range less than 4G */
 	dma32_free_bootmem();
 #endif
+	if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
+		goto out;
 
-	/*
-	 * The order of these functions is important for
-	 * fall-back/fail-over reasons
-	 */
 	gart_iommu_hole_init();
 
 	detect_calgary();
 
 	detect_intel_iommu();
 
+	/* needs to be called after gart_iommu_hole_init */
 	amd_iommu_detect();
+out:
+	pci_xen_swiotlb_init();
 
 	pci_swiotlb_init();
 }
@@ -289,25 +289,17 @@
 #ifdef CONFIG_PCI
 	dma_debug_add_bus(&pci_bus_type);
 #endif
+	x86_init.iommu.iommu_init();
 
-	calgary_iommu_init();
+	if (swiotlb || xen_swiotlb) {
+		printk(KERN_INFO "PCI-DMA: "
+		       "Using software bounce buffering for IO (SWIOTLB)\n");
+		swiotlb_print_info();
+	} else
+		swiotlb_free();
 
-	intel_iommu_init();
-
-	amd_iommu_init();
-
-	gart_iommu_init();
-
-	no_iommu_init();
 	return 0;
 }
-
-void pci_iommu_shutdown(void)
-{
-	gart_iommu_shutdown();
-
-	amd_iommu_shutdown();
-}
 /* Must execute after PCI subsystem */
 rootfs_initcall(pci_iommu_init);
 
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 1c76691..8c9dd05 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,6 +39,7 @@
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 #include <asm/k8.h>
+#include <asm/x86_init.h>
 
 static unsigned long iommu_bus_base;	/* GART remapping area (physical) */
 static unsigned long iommu_size;	/* size of remapping area bytes */
@@ -46,6 +47,8 @@
 
 static u32 *iommu_gatt_base;		/* Remapping table */
 
+static dma_addr_t bad_dma_addr;
+
 /*
  * If this is disabled the IOMMU will use an optimized flushing strategy
  * of only flushing when an mapping is reused. With it true the GART is
@@ -216,7 +219,7 @@
 		if (panic_on_overflow)
 			panic("dma_map_area overflow %lu bytes\n", size);
 		iommu_full(dev, size, dir);
-		return bad_dma_address;
+		return bad_dma_addr;
 	}
 
 	for (i = 0; i < npages; i++) {
@@ -302,7 +305,7 @@
 
 		if (nonforced_iommu(dev, addr, s->length)) {
 			addr = dma_map_area(dev, addr, s->length, dir, 0);
-			if (addr == bad_dma_address) {
+			if (addr == bad_dma_addr) {
 				if (i > 0)
 					gart_unmap_sg(dev, sg, i, dir, NULL);
 				nents = 0;
@@ -455,7 +458,7 @@
 
 	iommu_full(dev, pages << PAGE_SHIFT, dir);
 	for_each_sg(sg, s, nents, i)
-		s->dma_address = bad_dma_address;
+		s->dma_address = bad_dma_addr;
 	return 0;
 }
 
@@ -479,7 +482,7 @@
 				     DMA_BIDIRECTIONAL, align_mask);
 
 		flush_gart();
-		if (paddr != bad_dma_address) {
+		if (paddr != bad_dma_addr) {
 			*dma_addr = paddr;
 			return page_address(page);
 		}
@@ -499,6 +502,11 @@
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return (dma_addr == bad_dma_addr);
+}
+
 static int no_agp;
 
 static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -689,14 +697,15 @@
 	.unmap_page			= gart_unmap_page,
 	.alloc_coherent			= gart_alloc_coherent,
 	.free_coherent			= gart_free_coherent,
+	.mapping_error			= gart_mapping_error,
 };
 
-void gart_iommu_shutdown(void)
+static void gart_iommu_shutdown(void)
 {
 	struct pci_dev *dev;
 	int i;
 
-	if (no_agp && (dma_ops != &gart_dma_ops))
+	if (no_agp)
 		return;
 
 	for (i = 0; i < num_k8_northbridges; i++) {
@@ -711,7 +720,7 @@
 	}
 }
 
-void __init gart_iommu_init(void)
+int __init gart_iommu_init(void)
 {
 	struct agp_kern_info info;
 	unsigned long iommu_start;
@@ -721,7 +730,7 @@
 	long i;
 
 	if (num_k8_northbridges == 0)
-		return;
+		return 0;
 
 #ifndef CONFIG_AGP_AMD64
 	no_agp = 1;
@@ -733,13 +742,6 @@
 		(agp_copy_info(agp_bridge, &info) < 0);
 #endif
 
-	if (swiotlb)
-		return;
-
-	/* Did we detect a different HW IOMMU? */
-	if (iommu_detected && !gart_iommu_aperture)
-		return;
-
 	if (no_iommu ||
 	    (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
 	    !gart_iommu_aperture ||
@@ -749,7 +751,7 @@
 			       "but GART IOMMU not available.\n");
 			printk(KERN_WARNING "falling back to iommu=soft.\n");
 		}
-		return;
+		return 0;
 	}
 
 	/* need to map that range */
@@ -794,7 +796,7 @@
 
 	iommu_start = aper_size - iommu_size;
 	iommu_bus_base = info.aper_base + iommu_start;
-	bad_dma_address = iommu_bus_base;
+	bad_dma_addr = iommu_bus_base;
 	iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
 
 	/*
@@ -841,6 +843,10 @@
 
 	flush_gart();
 	dma_ops = &gart_dma_ops;
+	x86_platform.iommu_shutdown = gart_iommu_shutdown;
+	swiotlb = 0;
+
+	return 0;
 }
 
 void __init gart_parse_options(char *p)
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4..22be12b 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@
 	dma_addr_t bus = page_to_phys(page) + offset;
 	WARN_ON(size == 0);
 	if (!check_addr("map_single", dev, bus, size))
-		return bad_dma_address;
+		return DMA_ERROR_CODE;
 	flush_write_buffers();
 	return bus;
 }
@@ -103,12 +103,3 @@
 	.sync_sg_for_device	= nommu_sync_sg_for_device,
 	.is_phys		= 1,
 };
-
-void __init no_iommu_init(void)
-{
-	if (dma_ops)
-		return;
-
-	force_iommu = 0; /* no HW IOMMU */
-	dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b78..7d2829d 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@
 	.dma_supported = NULL,
 };
 
-void __init pci_swiotlb_init(void)
+/*
+ * pci_swiotlb_detect - set swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_swiotlb_detect(void)
 {
+	int use_swiotlb = swiotlb | swiotlb_force;
+
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
 #ifdef CONFIG_X86_64
-	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
+	if (!no_iommu && max_pfn > MAX_DMA32_PFN)
 		swiotlb = 1;
 #endif
 	if (swiotlb_force)
 		swiotlb = 1;
+
+	return use_swiotlb;
+}
+
+void __init pci_swiotlb_init(void)
+{
 	if (swiotlb) {
-		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-		swiotlb_init();
+		swiotlb_init(0);
 		dma_ops = &swiotlb_dma_ops;
 	}
 }
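
A rough sketch of the detect/init split the IOMMU changes above introduce (illustration only; all identifiers are invented and not part of this patch): detection code runs early and, when its hardware is present, publishes an init callback through a function pointer, and the generic code later calls whatever was published, defaulting to a no-op.

	#include <stdio.h>

	static int demo_iommu_init_noop(void)
	{
		return 0;
	}

	/* Default: nothing detected, so init is a no-op. */
	static int (*demo_iommu_init)(void) = demo_iommu_init_noop;

	static int demo_calgary_init(void)
	{
		printf("initialising Calgary-style IOMMU\n");
		return 0;
	}

	static void demo_detect_calgary(int hw_present)
	{
		if (hw_present)
			demo_iommu_init = demo_calgary_init;	/* publish the callback */
	}

	int main(void)
	{
		demo_detect_calgary(1);		/* early boot: detection phase */
		return demo_iommu_init();	/* later: initcall-time init phase */
	}
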
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index fc6c84d..d079f92 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -73,16 +73,12 @@
 	unsigned long *bp = t->io_bitmap_ptr;
 
 	if (bp) {
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-
+		preempt_disable();
 		t->io_bitmap_ptr = NULL;
 		clear_thread_flag(TIF_IO_BITMAP);
-		/*
-		 * Careful, clear this in the TSS too:
-		 */
-		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+		set_io_bitmap(t, t->io_bitmap_max);
 		t->io_bitmap_max = 0;
-		put_cpu();
+		preempt_enable();
 		kfree(bp);
 	}
 }
@@ -199,19 +195,10 @@
 			hard_enable_TSC();
 	}
 
-	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-		/*
-		 * Copy the relevant range of the IO bitmap.
-		 * Normally this is 128 bytes or less:
-		 */
-		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-		       max(prev->io_bitmap_max, next->io_bitmap_max));
-	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
-		/*
-		 * Clear any possible leftover bits:
-		 */
-		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
-	}
+	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) ||
+	    test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+		set_io_bitmap(next,
+			      max(prev->io_bitmap_max, next->io_bitmap_max));
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index cf98100..7a5cb07 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
 #else
-# include <asm/iommu.h>
+# include <asm/x86_init.h>
 #endif
 
 /*
@@ -655,7 +655,7 @@
 #endif
 
 #ifdef CONFIG_X86_64
-	pci_iommu_shutdown();
+	x86_platform.iommu_shutdown();
 #endif
 }
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5449a26..56b4707 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -70,6 +70,7 @@
 #include <linux/tboot.h>
 
 #include <video/edid.h>
+#include <xen/xen.h>
 
 #include <asm/mtrr.h>
 #include <asm/apic.h>
@@ -89,6 +90,7 @@
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
 #include <asm/bugs.h>
+#include <asm/tlbflush.h>
 
 #include <asm/system.h>
 #include <asm/vsyscall.h>
@@ -909,7 +911,6 @@
 		max_low_pfn = max_pfn;
 
 	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
-	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 #endif
 
 #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
@@ -967,6 +968,9 @@
 
 	initmem_init(0, max_pfn);
 
+	/* Initialize cross-cpu tlb flushes */
+	init_smp_flush();
+
 #ifdef CONFIG_ACPI_SLEEP
 	/*
 	 * Reserve low memory region for sleep support.
@@ -1037,6 +1041,7 @@
 	probe_nr_irqs_gsi();
 
 	kvm_guest_init();
+	xen_hvm_guest_init();
 
 	e820_reserve_resources();
 	e820_mark_nosave_regions(max_low_pfn);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a..d11c5ff 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -14,10 +14,13 @@
 #include <asm/time.h>
 #include <asm/irq.h>
 #include <asm/tsc.h>
+#include <asm/iommu.h>
 
 void __cpuinit x86_init_noop(void) { }
 void __init x86_init_uint_noop(unsigned int unused) { }
 void __init x86_init_pgd_noop(pgd_t *unused) { }
+int __init iommu_init_noop(void) { return 0; }
+void iommu_shutdown_noop(void) { }
 
 /*
  * The platform setup functions are preset with the default functions
@@ -62,6 +65,10 @@
 		.tsc_pre_init		= x86_init_noop,
 		.timer_init		= hpet_time_init,
 	},
+
+	.iommu = {
+		.iommu_init		= iommu_init_noop,
+	},
 };
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -72,4 +79,5 @@
 	.calibrate_tsc			= native_calibrate_tsc,
 	.get_wallclock			= mach_get_cmos_time,
 	.set_wallclock			= mach_set_rtc_mmss,
+	.iommu_shutdown			= iommu_shutdown_noop,
 };
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 06630d2..ad895ae 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,11 @@
 CFLAGS_physaddr.o		:= $(nostackp)
 CFLAGS_setup_nx.o		:= $(nostackp)
 
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_ioremap.o		:= $(nostackp)
+CFLAGS_init.o			:= $(nostackp)
+
 obj-$(CONFIG_SMP)		+= tlb.o
 
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ac0d76..7e7dbd1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -228,7 +228,16 @@
 
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			int ret;
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
@@ -340,11 +349,19 @@
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
 			else
 				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+			spin_unlock(pgt_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bc..892b8eb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -313,6 +313,11 @@
 		goto slow_irqon;
 #endif
 
+#ifdef CONFIG_XEN
+	if (unlikely(mm->context.has_foreign_mappings))
+		goto slow_irqon;
+#endif
+
 	/*
 	 * XXX: batch / limit 'nr', to avoid large irq off latency
 	 * needs some instrumenting to determine the common sizes used by
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd55..3e3d5f9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -8,7 +8,6 @@
 #include <asm/page.h>
 #include <asm/page_types.h>
 #include <asm/sections.h>
-#include <asm/setup.h>
 #include <asm/system.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -288,23 +287,8 @@
 #endif
 
 #ifdef CONFIG_X86_64
-	if (!after_bootmem && !start) {
-		pud_t *pud;
-		pmd_t *pmd;
-
+	if (!after_bootmem)
 		mmu_cr4_features = read_cr4();
-
-		/*
-		 * _brk_end cannot change anymore, but it and _end may be
-		 * located on different 2M pages. cleanup_highmap(), however,
-		 * can only consider _end when it runs, so destroy any
-		 * mappings beyond _brk_end here.
-		 */
-		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-		pmd = pmd_offset(pud, _brk_end - 1);
-		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-			pmd_clear(pmd);
-	}
 #endif
 	__flush_tlb_all();
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1..10c3719 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -430,22 +430,45 @@
 {
 	int node_pfn;
 	struct page *page;
+	phys_addr_t chunk_end, chunk_max;
 	unsigned long final_start_pfn, final_end_pfn;
-	struct add_highpages_data *data;
-
-	data = (struct add_highpages_data *)datax;
+	struct add_highpages_data *data = (struct add_highpages_data *)datax;
 
 	final_start_pfn = max(start_pfn, data->start_pfn);
 	final_end_pfn = min(end_pfn, data->end_pfn);
 	if (final_start_pfn >= final_end_pfn)
 		return 0;
 
-	for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
-	     node_pfn++) {
-		if (!pfn_valid(node_pfn))
-			continue;
-		page = pfn_to_page(node_pfn);
-		add_one_highpage_init(page, node_pfn);
+	chunk_end = PFN_PHYS(final_start_pfn);
+	chunk_max = PFN_PHYS(final_end_pfn);
+
+	/*
+	 * Check for reserved areas.
+	 */
+	for (;;) {
+		phys_addr_t chunk_start;
+		chunk_start = early_res_next_free(chunk_end);
+
+		/*
+		 * Reserved area. Just count high mem pages.
+		 */
+		for (node_pfn = PFN_DOWN(chunk_end);
+		     node_pfn < PFN_DOWN(chunk_start); node_pfn++) {
+			if (pfn_valid(node_pfn))
+				totalhigh_pages++;
+		}
+
+		if (chunk_start >= chunk_max)
+			break;
+
+		chunk_end = early_res_next_reserved(chunk_start, chunk_max);
+		for (node_pfn = PFN_DOWN(chunk_start);
+		     node_pfn < PFN_DOWN(chunk_end); node_pfn++) {
+			if (!pfn_valid(node_pfn))
+				continue;
+			page = pfn_to_page(node_pfn);
+			add_one_highpage_init(page, node_pfn);
+		}
 	}
 
 	return 0;
@@ -459,7 +482,6 @@
 
 	data.start_pfn = start_pfn;
 	data.end_pfn = end_pfn;
-
 	work_with_active_regions(nid, add_highpages_work_fn, &data);
 }
 
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2feb9bd..2601df2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -425,6 +425,11 @@
 	return &bm_pte[pte_index(addr)];
 }
 
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+	return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
 static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
 
 void __init early_ioremap_init(void)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0e..fb91994 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -666,7 +666,7 @@
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t vma_prot)
 {
-	return vma_prot;
+	return __pgprot(pgprot_val(vma_prot) | _PAGE_IOMAP);
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e0e6fad..a0db43b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,9 @@
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
 
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
 #ifdef CONFIG_HIGHPTE
@@ -14,6 +17,16 @@
 
 gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
 
+pgprot_t arch_vm_get_page_prot(unsigned vm_flags)
+{
+	pgprot_t ret = __pgprot(0);
+
+	if (vm_flags & VM_IO)
+		ret = __pgprot(_PAGE_IOMAP);
+
+	return ret;
+}
+
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
 	return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -86,7 +99,19 @@
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -104,8 +129,10 @@
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -270,7 +297,7 @@
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -287,6 +314,12 @@
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
+#ifdef CONFIG_XEN
+	/* EEW */
+	extern void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd);
+	if (xen_pv_domain())
+		xen_late_unpin_pgd(mm, pgd);
+#endif
 	pgd_mop_up_pmds(mm, pgd);
 	pgd_dtor(pgd);
 	paravirt_pgd_free(mm, pgd);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08e..7317947 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -148,13 +148,25 @@
 		 * BUG();
 		 */
 
-	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
-		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+	if (f->flush_mm == NULL ||
+	    f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		int tlbstate = percpu_read(cpu_tlbstate.state);
+
+		/*
+		 * flush_mm == NULL means flush everything, including
+		 * global tlbs, which will only happen when flushing
+		 * kernel mappings.
+		 */
+		if (f->flush_mm == NULL)
+			__flush_tlb_all();
+		else if (tlbstate == TLBSTATE_OK) {
 			if (f->flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
 				__flush_tlb_one(f->flush_va);
-		} else
+		}
+
+		if (tlbstate == TLBSTATE_LAZY)
 			leave_mm(cpu);
 	}
 out:
@@ -217,16 +229,13 @@
 	flush_tlb_others_ipi(cpumask, mm, va);
 }
 
-static int __cpuinit init_smp_flush(void)
+void __init init_smp_flush(void)
 {
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
 		spin_lock_init(&flush_state[i].tlbstate_lock);
-
-	return 0;
 }
-core_initcall(init_smp_flush);
 
 void flush_tlb_current_task(void)
 {
@@ -274,17 +283,19 @@
 
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(flush_tlb_page);
 
-static void do_flush_tlb_all(void *info)
+void flush_tlb_all(void)
 {
-	unsigned long cpu = smp_processor_id();
+	/* flush_tlb_others expects preempt to be disabled */
+	int cpu = get_cpu();
+
+	flush_tlb_others(cpu_online_mask, NULL, TLB_FLUSH_ALL);
 
 	__flush_tlb_all();
 	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(cpu);
-}
 
-void flush_tlb_all(void)
-{
-	on_each_cpu(do_flush_tlb_all, NULL, 1);
+	put_cpu();
 }
+EXPORT_SYMBOL_GPL(flush_tlb_all);
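
The tlb.c change above introduces a sentinel: a NULL mm in the flush request means "flush everything, including global/kernel mappings", while a non-NULL mm is only flushed when it matches the receiving CPU's active mm. A minimal sketch of that convention (illustration only; the names below are invented, not kernel code):

	#include <stdio.h>

	struct demo_mm {
		int id;
	};

	static void demo_flush_ipi(const struct demo_mm *flush_mm,
				   const struct demo_mm *active_mm)
	{
		if (flush_mm == NULL) {
			printf("flush all TLB entries, including globals\n");
			return;
		}
		if (flush_mm == active_mm)
			printf("flush user entries for mm %d\n", flush_mm->id);
		else
			printf("nothing to do on this CPU\n");
	}

	int main(void)
	{
		struct demo_mm a = { 1 }, b = { 2 };

		demo_flush_ipi(NULL, &a);	/* kernel-range flush */
		demo_flush_ipi(&a, &a);		/* matching mm */
		demo_flush_ipi(&b, &a);		/* unrelated mm */
		return 0;
	}
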
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e..64182c5 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -4,6 +4,7 @@
 obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig_$(BITS).o direct.o mmconfig-shared.o
 obj-$(CONFIG_PCI_DIRECT)	+= direct.o
 obj-$(CONFIG_PCI_OLPC)		+= olpc.o
+obj-$(CONFIG_PCI_XEN)		+= xen.o
 
 obj-y				+= fixup.o
 obj-$(CONFIG_ACPI)		+= acpi.o
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf..30a9808 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@
 unsigned int pci_early_dump_regs;
 static int pci_bf_sort;
 int pci_routeirq;
+int pci_scan_all_fns;
 int noioapicquirk;
 #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 int noioapicreroute = 0;
@@ -412,26 +413,31 @@
 
 extern u8 pci_cache_line_size;
 
-int __init pcibios_init(void)
+void __init pcibios_set_cache_line_size(void)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
-	if (!raw_pci_ops) {
-		printk(KERN_WARNING "PCI: System does not support PCI\n");
-		return 0;
-	}
-
 	/*
 	 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
 	 * and P4. It's also good for 386/486s (which actually have 16)
 	 * as quite a few PCI devices do not support smaller values.
 	 */
+
 	pci_cache_line_size = 32 >> 2;
 	if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
 		pci_cache_line_size = 64 >> 2;	/* K7 & K8 */
 	else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
 		pci_cache_line_size = 128 >> 2;	/* P4 */
+}
 
+int __init pcibios_init(void)
+{
+	if (!raw_pci_ops) {
+		printk(KERN_WARNING "PCI: System does not support PCI\n");
+		return 0;
+	}
+
+	pcibios_set_cache_line_size();
 	pcibios_resource_survey();
 
 	if (pci_bf_sort >= pci_force_bf)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a672f12..91d040e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -283,6 +283,8 @@
 
 	prot = pgprot_val(vma->vm_page_prot);
 
+	prot |= _PAGE_IOMAP;	/* creating a mapping for IO */
+
 	/*
  	 * Return error if pat is not enabled and write_combine is requested.
  	 * Caller can followup with UC MINUS request and add a WC mtrr if there
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 25a1f8e..4e2f90a 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -15,10 +15,16 @@
 	if (!(pci_probe & PCI_PROBE_NOEARLY))
 		pci_mmcfg_early_init();
 
+#ifdef CONFIG_PCI_XEN
+	if (!pci_xen_init())
+		return 0;
+#endif
+
 #ifdef CONFIG_PCI_OLPC
 	if (!pci_olpc_init())
 		return 0;	/* skip additional checks if it's an XO */
 #endif
+
 #ifdef CONFIG_PCI_BIOS
 	pci_pcbios_init();
 #endif
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
new file mode 100644
index 0000000..67fa926
--- /dev/null
+++ b/arch/x86/pci/xen.c
@@ -0,0 +1,154 @@
+/*
+ * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
+ * 			   x86 PCI core to support the Xen PCI Frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+
+#include <asm/io.h>
+#include <asm/pci_x86.h>
+
+#include <asm/xen/hypervisor.h>
+
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+
+#if defined(CONFIG_PCI_MSI)
+#include <linux/msi.h>
+
+struct xen_pci_frontend_ops *xen_pci_frontend;
+EXPORT_SYMBOL_GPL(xen_pci_frontend);
+
+/*
+ * For MSI interrupts we have to use the drivers/xen/events.c functions to
+ * allocate an irq_desc and set it up correctly. */
+
+
+int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	int irq, ret, i;
+	struct msi_desc *msidesc;
+	int *v;
+
+	/* Dom0 has another mechanism for this. The exit path
+	 * (xen_pci_teardown_msi_irq) is shared with Dom0.
+	 */
+	if (xen_initial_domain())
+		return xen_setup_msi_irqs(dev, nvec, type);
+
+	v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+	if (!v)
+		return -ENOMEM;
+
+	if (!xen_initial_domain()) {
+		if (type == PCI_CAP_ID_MSIX)
+			ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+		else
+			ret = xen_pci_frontend_enable_msi(dev, &v);
+		if (ret)
+			goto error;
+	}
+	i = 0;
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
+		irq = xen_allocate_pirq(v[i], 0, /* not sharable */
+			(type == PCI_CAP_ID_MSIX) ?
+			"pcifront-msi-x":"pcifront-msi");
+		if (irq < 0)
+			return -1;
+
+		ret = set_irq_msi(irq, msidesc);
+		if (ret)
+			goto error_while;
+		i++;
+	}
+	kfree(v);
+	return 0;
+
+error_while:
+	unbind_from_irqhandler(irq, NULL);
+error:
+	if (ret == -ENODEV)
+		dev_err(&dev->dev,"Xen PCI frontend has not registered" \
+			" MSI/MSI-X support!\n");
+
+	kfree(v);
+	return ret;
+}
+
+void xen_pci_teardown_msi_dev(struct pci_dev *dev)
+{
+	/* Only do this when we are in non-privileged mode. */
+	if (!xen_initial_domain()) {
+		struct msi_desc *msidesc;
+
+		msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+		if (msidesc->msi_attrib.is_msix)
+			xen_pci_frontend_disable_msix(dev);
+		else
+			xen_pci_frontend_disable_msi(dev);
+	}
+
+}
+
+void xen_pci_teardown_msi_irq(int irq)
+{
+	xen_destroy_irq(irq);
+}
+#endif
+
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+	int rc;
+	int share = 1;
+
+	dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
+
+	if (dev->irq < 0)
+		return -EINVAL;
+
+	if (dev->irq < NR_IRQS_LEGACY)
+		share = 0;
+
+	rc = xen_allocate_pirq(dev->irq, share, "pcifront");
+	if (rc < 0) {
+		dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
+			 dev->irq, rc);
+		return rc;
+	}
+	return 0;
+}
+
+int __init pci_xen_init(void)
+{
+	if (!xen_pv_domain() || xen_initial_domain())
+		return -ENODEV;
+
+	printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
+
+	pcibios_set_cache_line_size();
+
+	pcibios_enable_irq = xen_pcifront_enable_irq;
+	pcibios_disable_irq = NULL;
+
+#ifdef CONFIG_ACPI
+	/* Keep ACPI out of the picture */
+	acpi_noirq = 1;
+#endif
+
+#ifdef CONFIG_ISAPNP
+	/* Stop isapnp from probing */
+	isapnp_disable = 1;
+#endif
+
+	/* Ensure a device still gets scanned even if its function number
+	 * is non-zero.
+	 */
+	pci_scan_all_fns = 1;
+
+	return 0;
+}
+
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119..3f9f4a0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,16 +13,18 @@
 	  kernel to boot in a paravirtualized environment under the
 	  Xen hypervisor.
 
+config XEN_PVHVM
+	def_bool y
+	depends on XEN
+	depends on X86_LOCAL_APIC
+
 config XEN_MAX_DOMAIN_MEMORY
-       int "Maximum allowed size of a domain in gigabytes"
-       default 8 if X86_32
-       default 32 if X86_64
+       int
+       default 128
        depends on XEN
        help
-         The pseudo-physical to machine address array is sized
-         according to the maximum possible memory size of a Xen
-         domain.  This array uses 1 page per gigabyte, so there's no
-         need to be too stingy here.
+         This only affects the sizing of some bss arrays, the unused
+         portions of which are freed.
 
 config XEN_SAVE_RESTORE
        bool
@@ -36,3 +38,40 @@
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
+config SWIOTLB_XEN
+       def_bool y
+       depends on XEN && SWIOTLB
+
+config MICROCODE_XEN
+       def_bool y
+       depends on XEN_DOM0 && MICROCODE
+
+config XEN_DOM0
+	bool "Enable Xen privileged domain support"
+	depends on XEN && X86_IO_APIC && ACPI
+	help
+	  The Xen hypervisor requires a privileged domain ("dom0") to
+	  actually manage the machine, provide device drivers, etc.
+	  This option enables dom0 support.  A dom0 kernel can also
+	  run as an unprivileged domU kernel, or as a native kernel on
+	  bare hardware.
+
+# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
+# name in tools.
+config XEN_PRIVILEGED_GUEST
+	def_bool XEN_DOM0
+
+config XEN_DOM0_PCI
+       def_bool y
+       depends on XEN_DOM0 && PCI
+       select PCI_XEN
+
+config XEN_PCI_PASSTHROUGH
+       bool "Enable support for Xen PCI passthrough devices"
+       depends on XEN && PCI
+       select PCI_XEN
+       select SWIOTLB_XEN
+       help
+         Enable support for passing PCI devices through to
+         unprivileged domains. (COMPLETELY UNTESTED)
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc2..13ca65c 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,9 +12,12 @@
 
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
-			grant-table.o suspend.o
+			grant-table.o suspend.o platform-pci-unplug.o
 
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
-
+obj-$(CONFIG_XEN_DOM0)		+= vga.o
+obj-$(CONFIG_XEN_DOM0)		+= apic.o
+obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_DOM0_PCI)	+= pci.o
\ No newline at end of file
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 0000000..21a3089
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,33 @@
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/bitmap.h>
+
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
+#include <asm/hw_irq.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+
+void __init xen_io_apic_init(void)
+{
+	enable_IO_APIC();
+}
+
+void xen_init_apic(void)
+{
+	if (!xen_initial_domain())
+		return;
+
+#ifdef CONFIG_ACPI
+	/*
+	 * Pretend ACPI found our lapic even though we've disabled it,
+	 * to prevent MP tables from setting up lapics.
+	 */
+	acpi_lapic = 1;
+#endif
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0087b00..7a48c0b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
 
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/smp.h>
@@ -28,12 +29,15 @@
 #include <linux/highmem.h>
 #include <linux/console.h>
 
+#include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/hvm.h>
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
@@ -53,6 +57,7 @@
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
 #include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -66,6 +71,11 @@
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int   machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
@@ -73,6 +83,9 @@
 
 void *xen_initial_gdt;
 
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
 /*
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
@@ -94,6 +107,14 @@
  */
 static int have_vcpu_info_placement = 1;
 
+static void clamp_max_cpus(void)
+{
+#ifdef CONFIG_SMP
+	if (setup_max_cpus > MAX_VIRT_CPUS)
+		setup_max_cpus = MAX_VIRT_CPUS;
+#endif
+}
+
 static void xen_vcpu_setup(int cpu)
 {
 	struct vcpu_register_vcpu_info info;
@@ -101,19 +122,20 @@
 	struct vcpu_info *vcpup;
 
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 
-	if (!have_vcpu_info_placement)
-		return;		/* already tested, not available */
+	if (cpu < MAX_VIRT_CPUS)
+		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+	if (!have_vcpu_info_placement) {
+		if (cpu >= MAX_VIRT_CPUS)
+			clamp_max_cpus();
+		return;
+	}
 
 	vcpup = &per_cpu(xen_vcpu_info, cpu);
-
 	info.mfn = arbitrary_virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
-	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
-	       cpu, vcpup, info.mfn, info.offset);
-
 	/* Check to see if the hypervisor will put the vcpu_info
 	   structure where we want it, which allows direct access via
 	   a percpu-variable. */
@@ -122,13 +144,11 @@
 	if (err) {
 		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
 		have_vcpu_info_placement = 0;
+		clamp_max_cpus();
 	} else {
 		/* This cpu is using the registered vcpu info, even if
 		   later ones fail to. */
 		per_cpu(xen_vcpu, cpu) = vcpup;
-
-		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
-		       cpu, vcpup);
 	}
 }
 
@@ -167,13 +187,16 @@
 
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
 	       pv_info.name);
-	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+	printk(KERN_INFO "Xen version: %d.%d%s%s%s\n",
 	       version >> 16, version & 0xffff, extra.extraversion,
-	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
+	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ?
+			" (preserve-AD)" : "",
+	       xen_initial_domain() ? " (dom0)" : "");
 }
 
 static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
 static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0;
 
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		      unsigned int *cx, unsigned int *dx)
@@ -187,7 +210,7 @@
 	 * unsupported kernel subsystems as possible.
 	 */
 	switch (*ax) {
-	case 1:
+	case 0x1:
 		maskecx = cpuid_leaf1_ecx_mask;
 		maskedx = cpuid_leaf1_edx_mask;
 		break;
@@ -196,6 +219,10 @@
 		/* Suppress extended topology stuff */
 		maskebx = 0;
 		break;
+
+	case 0x80000001:
+		maskedx = cpuid_leaf81_edx_mask;
+		break;
 	}
 
 	asm(XEN_EMULATE_PREFIX "cpuid"
@@ -213,34 +240,29 @@
 static __init void xen_init_cpuid_mask(void)
 {
 	unsigned int ax, bx, cx, dx;
+	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
-		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
-		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+		~(1 << X86_FEATURE_ACC);   /* thermal monitoring */
+
+	cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32));
 
 	if (!xen_initial_domain())
 		cpuid_leaf1_edx_mask &=
-			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
+			~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
+			  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
+			  (1 << X86_FEATURE_APIC) |  /* disable local APIC */
 			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
-
 	ax = 1;
-	cx = 0;
 	xen_cpuid(&ax, &bx, &cx, &dx);
 
-	/* cpuid claims we support xsave; try enabling it to see what happens */
-	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
-		unsigned long cr4;
+	xsave_mask =
+		(1 << (X86_FEATURE_XSAVE % 32)) |
+		(1 << (X86_FEATURE_OSXSAVE % 32));
 
-		set_in_cr4(X86_CR4_OSXSAVE);
-		
-		cr4 = read_cr4();
-
-		if ((cr4 & X86_CR4_OSXSAVE) == 0)
-			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
-		clear_in_cr4(X86_CR4_OSXSAVE);
-	}
+	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+	if ((cx & xsave_mask) != xsave_mask)
+		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
@@ -406,7 +428,7 @@
 
 		pte = pfn_pte(pfn, PAGE_KERNEL_RO);
 
-		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+		if (HYPERVISOR_update_va_mapping(va, pte, 0))
 			BUG();
 
 		frames[f] = mfn;
@@ -517,13 +539,13 @@
 		return 0;
 #ifdef CONFIG_X86_MCE
 	} else if (addr == (unsigned long)machine_check) {
-		return 0;
+		/* We can use the original machine_check handler,
+		   despite IST. */
 #endif
-	} else {
-		/* Some other trap using IST? */
-		if (WARN_ON(val->ist != 0))
-			return 0;
-	}
+	} else if (WARN(val->ist != 0,
+			"Unknown IST-using trap: vector %d, %pF, val->ist=%d\n",
+			vector, (void *)addr, val->ist))
+		return 0;
 #endif	/* CONFIG_X86_64 */
 	info->address = addr;
 
@@ -679,6 +701,18 @@
 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
 }
 
+static void xen_set_io_bitmap(struct thread_struct *thread,
+			      unsigned long bytes_updated)
+{
+	struct physdev_set_iobitmap set_iobitmap;
+
+	set_xen_guest_handle(set_iobitmap.bitmap,
+			     (char *)thread->io_bitmap_ptr);
+	set_iobitmap.nr_ports = thread->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+				      &set_iobitmap));
+}
+
 static void xen_io_delay(void)
 {
 }
@@ -716,7 +750,7 @@
         return 0;
 }
 
-static void set_xen_basic_apic_ops(void)
+static __init void set_xen_basic_apic_ops(void)
 {
 	apic->read = xen_apic_read;
 	apic->write = xen_apic_write;
@@ -728,7 +762,6 @@
 
 #endif
 
-
 static void xen_clts(void)
 {
 	struct multicall_space mcs;
@@ -811,6 +844,11 @@
 		   Xen console noise. */
 		break;
 
+	case MSR_IA32_CR_PAT:
+		if (smp_processor_id() == 0)
+			xen_set_pat(((u64)high << 32) | low);
+		break;
+
 	default:
 		ret = native_write_msr_safe(msr, low, high);
 	}
@@ -849,8 +887,6 @@
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
 	if (have_vcpu_info_placement) {
-		printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
 		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
 		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
 		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -923,10 +959,6 @@
 	.patch = xen_patch,
 };
 
-static const struct pv_time_ops xen_time_ops __initdata = {
-	.sched_clock = xen_clocksource_read,
-};
-
 static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.cpuid = xen_cpuid,
 
@@ -978,6 +1010,7 @@
 	.load_sp0 = xen_load_sp0,
 
 	.set_iopl_mask = xen_set_iopl_mask,
+	.set_io_bitmap = xen_set_io_bitmap,
 	.io_delay = xen_io_delay,
 
 	/* Xen takes care of %gs when switching to usermode for us */
@@ -1016,15 +1049,40 @@
 	xen_reboot(SHUTDOWN_poweroff);
 }
 
+static void xen_machine_power_off(void)
+{
+	if (pm_power_off)
+		pm_power_off();
+	else
+		xen_reboot(SHUTDOWN_poweroff);
+}
+
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
 	xen_reboot(SHUTDOWN_crash);
 }
 
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	xen_reboot(SHUTDOWN_crash);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block xen_panic_block = {
+	.notifier_call= xen_panic_event,
+};
+
+int xen_panic_handler_init(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+	return 0;
+}
+
 static const struct machine_ops __initdata xen_machine_ops = {
 	.restart = xen_restart,
 	.halt = xen_machine_halt,
-	.power_off = xen_machine_halt,
+	.power_off = xen_machine_power_off,
 	.shutdown = xen_machine_halt,
 	.crash_shutdown = xen_crash_shutdown,
 	.emergency_restart = xen_emergency_restart,
@@ -1057,10 +1115,11 @@
 
 	xen_domain_type = XEN_PV_DOMAIN;
 
+	xen_setup_machphys_mapping();
+
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
-	pv_time_ops = xen_time_ops;
 	pv_cpu_ops = xen_cpu_ops;
 	pv_apic_ops = xen_apic_ops;
 
@@ -1068,13 +1127,7 @@
 	x86_init.oem.arch_setup = xen_arch_setup;
 	x86_init.oem.banner = xen_banner;
 
-	x86_init.timers.timer_init = xen_time_init;
-	x86_init.timers.setup_percpu_clockev = x86_init_noop;
-	x86_cpuinit.setup_percpu_clockev = x86_init_noop;
-
-	x86_platform.calibrate_tsc = xen_tsc_khz;
-	x86_platform.get_wallclock = xen_get_wallclock;
-	x86_platform.set_wallclock = xen_set_wallclock;
+	xen_init_time_ops();
 
 	/*
 	 * Set up some pagetable state before starting to set any ptes.
@@ -1112,6 +1165,10 @@
 	 */
 	xen_setup_stackprotector();
 
+#ifdef CONFIG_SPARSE_IRQ
+	nr_dynamic_irqs += 256;
+#endif
+
 	xen_init_irq_ops();
 	xen_init_cpuid_mask();
 
@@ -1138,8 +1195,19 @@
 
 	xen_smp_init();
 
+#ifdef CONFIG_ACPI_NUMA
+	/*
+	 * The pages we get from Xen are not related to machine pages, so
+	 * any NUMA information the kernel tries to get from ACPI will
+	 * be meaningless.  Prevent it from trying.
+	 */
+	acpi_numa = -1;
+#endif
+
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
+	__supported_pte_mask |= _PAGE_IOMAP;
+
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1149,6 +1217,10 @@
 
 	xen_raw_console_write("mapping kernel into physical memory\n");
 	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+	xen_ident_map_ISA();
+
+	/* Allocate and initialize top and mid mfn levels for p2m structure */
+	xen_build_mfn_list_list();
 
 	init_mm.pgd = pgd;
 
@@ -1158,6 +1230,14 @@
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
 		pv_info.kernel_rpl = 0;
 
+	if (xen_initial_domain()) {
+		struct physdev_set_iopl set_iopl;
+		set_iopl.iopl = 1;
+		if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) == -1)
+			BUG();
+		xen_init_apic();
+	}
+
 	/* set the limit of our address space */
 	xen_reserve_top();
 
@@ -1180,6 +1260,16 @@
 		add_preferred_console("xenboot", 0, NULL);
 		add_preferred_console("tty", 0, NULL);
 		add_preferred_console("hvc", 0, NULL);
+
+		boot_params.screen_info.orig_video_isVGA = 0;
+	} else {
+		const struct dom0_vga_console_info *info =
+			(void *)((char *)xen_start_info +
+				 xen_start_info->console.dom0.info_off);
+
+		xen_init_vga(info, xen_start_info->console.dom0.info_size);
+		xen_start_info->console.domU.mfn = 0;
+		xen_start_info->console.domU.evtchn = 0;
 	}
 
 	xen_raw_console_write("about to get started...\n");
@@ -1193,3 +1283,128 @@
 	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
 #endif
 }
+
+static uint32_t xen_cpuid_base(void)
+{
+	uint32_t base, eax, ebx, ecx, edx;
+	char signature[13];
+
+	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+		cpuid(base, &eax, &ebx, &ecx, &edx);
+		*(uint32_t *)(signature + 0) = ebx;
+		*(uint32_t *)(signature + 4) = ecx;
+		*(uint32_t *)(signature + 8) = edx;
+		signature[12] = 0;
+
+		if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+			return base;
+	}
+
+	return 0;
+}
+
+static int init_hvm_pv_info(int *major, int *minor)
+{
+	uint32_t eax, ebx, ecx, edx, pages, msr, base;
+	u64 pfn;
+
+	base = xen_cpuid_base();
+	if (!base)
+		return -EINVAL;
+
+	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+	*major = eax >> 16;
+	*minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+
+	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	xen_setup_features();
+
+	pv_info.name = "Xen HVM";
+
+	xen_domain_type = XEN_HVM_DOMAIN;
+
+	return 0;
+}
+
+void xen_hvm_init_shared_info(void)
+{
+	int cpu;
+	struct xen_add_to_physmap xatp;
+	static struct shared_info *shared_info_page = 0;
+
+	if (!shared_info_page)
+		shared_info_page = (struct shared_info *) alloc_bootmem_pages(PAGE_SIZE);
+	xatp.domid = DOMID_SELF;
+	xatp.idx = 0;
+	xatp.space = XENMAPSPACE_shared_info;
+	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+		BUG();
+
+	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+	 * page, we use it in the event channel upcall and in some pvclock
+	 * related functions. We don't need the vcpu_info placement
+	 * optimizations because we don't use any pv_mmu or pv_irq op on
+	 * HVM.
+	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_init_shared_info is run at resume time too and
+	 * in that case multiple vcpus might be online. */
+	for_each_online_cpu(cpu) {
+		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+	}
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
+				    unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	switch (action) {
+	case CPU_UP_PREPARE:
+		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+		if (xen_have_vector_callback)
+			xen_init_lock_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+	.notifier_call	= xen_hvm_cpu_notify,
+};
+
+void __init xen_hvm_guest_init(void)
+{
+	int r;
+	int major, minor;
+
+	if (xen_pv_domain())
+		return;
+
+	r = init_hvm_pv_info(&major, &minor);
+	if (r < 0)
+		return;
+
+	xen_hvm_init_shared_info();
+
+	if (xen_feature(XENFEAT_hvm_callback_vector))
+		xen_have_vector_callback = 1;
+	xen_hvm_smp_init();
+	register_cpu_notifier(&xen_hvm_cpu_notifier);
+	xen_unplug_emulated_devices();
+	have_vcpu_info_placement = 0;
+	x86_init.irqs.intr_init = xen_init_IRQ;
+	xen_hvm_init_time_ops();
+	xen_hvm_init_mmu_ops();
+}
+#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f90a2c..18f940e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,6 +42,7 @@
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
 #include <linux/bug.h>
+#include <linux/vmalloc.h>
 #include <linux/module.h>
 
 #include <asm/pgtable.h>
@@ -50,14 +51,20 @@
 #include <asm/mmu_context.h>
 #include <asm/setup.h>
 #include <asm/paravirt.h>
+#include <asm/e820.h>
 #include <asm/linkage.h>
+#include <asm/pat.h>
+#include <asm/init.h>
+#include <asm/page.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 
 #include <xen/page.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
 #include <xen/interface/version.h>
+#include <xen/interface/memory.h>
 #include <xen/hvc-console.h>
 
 #include "multicalls.h"
@@ -66,6 +73,13 @@
 
 #define MMU_UPDATE_HISTO	30
 
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
+
 #ifdef CONFIG_XEN_DEBUG_FS
 
 static struct {
@@ -124,7 +138,8 @@
  * large enough to allocate page table pages to allocate the rest.
  * Each page can map 2MB.
  */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
 
 #ifdef CONFIG_X86_64
 /* l3 pud for userspace vsyscall mapping */
@@ -155,49 +170,202 @@
  */
 #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ *                               Xen
+ *                                |
+ *     p2m_top              p2m_top_mfn
+ *       /  \                   /   \
+ * p2m_mid p2m_mid	p2m_mid_mfn p2m_mid_mfn
+ *    / \      / \         /           /
+ *  p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as an mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
+ * 512 and 1024 entries respectively.
+ */
 
-#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
+unsigned long xen_max_p2m_pfn __read_mostly;
 
-/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
-		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
+#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
 
- /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
-		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
+#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
 
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
 
-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
-	__page_aligned_bss;
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
-	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
-	return pfn / P2M_ENTRIES_PER_PAGE;
+	BUG_ON(pfn >= MAX_P2M_PFN);
+	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
 }
 
 static inline unsigned p2m_index(unsigned long pfn)
 {
-	return pfn % P2M_ENTRIES_PER_PAGE;
+	return pfn % P2M_PER_PAGE;
 }
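
(Editorial aside.) The index helpers above just split a pfn base-P2M_PER_PAGE twice; a minimal user-space sketch, assuming 4 KiB pages and an 8-byte unsigned long (the 64-bit case), shows the arithmetic and the resulting address-space limit. On 32-bit the per-level fan-out becomes 1024 instead of 512.

#include <stdio.h>

#define PAGE_SIZE		4096UL	/* assumption: 4 KiB pages */
#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))	/* 512 */
#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))	/* 512 */
#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))	/* 512 */

int main(void)
{
	unsigned long pfn = 0x123456;	/* arbitrary example pfn */

	unsigned topidx = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
	unsigned mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
	unsigned idx    = pfn % P2M_PER_PAGE;

	/* 512^3 pages == 2^27 pfns == 512 GB of pseudo-physical space */
	printf("pfn %#lx -> top %u, mid %u, leaf %u; max pfn %lu\n",
	       pfn, topidx, mididx, idx,
	       P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE);
	return 0;
}
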
 
-/* Build the parallel p2m_top_mfn structures */
+static void p2m_top_init(unsigned long ***top)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+		top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+		top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_MID_PER_PAGE; i++)
+		mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_MID_PER_PAGE; i++)
+		mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_MID_PER_PAGE; i++)
+		p2m[i] = INVALID_P2M_ENTRY;
+}
+
+static int lookup_pte_fn(
+	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+	uint64_t *ptep = (uint64_t *)data;
+	if (ptep)
+		*ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+			 PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+	return 0;
+}
+
+int create_lookup_pte_addr(struct mm_struct *mm,
+			   unsigned long address,
+			   uint64_t *ptep)
+{
+	return apply_to_page_range(mm, address, PAGE_SIZE,
+				   lookup_pte_fn, ptep);
+}
+
+EXPORT_SYMBOL(create_lookup_pte_addr);
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ *   to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ *   tree should already be completely allocated.
+ */
 void xen_build_mfn_list_list(void)
 {
-	unsigned pfn, idx;
+	unsigned long pfn;
 
-	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
-		unsigned topidx = p2m_top_index(pfn);
+	/* Pre-initialize p2m_top_mfn to be completely missing */
+	if (p2m_top_mfn == NULL) {
+		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+		p2m_mid_mfn_init(p2m_mid_missing_mfn);
 
-		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
+		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+		p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+		p2m_top_mfn_init(p2m_top_mfn);
+	} else {
+		/* Reinitialise, mfn's all change after migration */
+		p2m_mid_mfn_init(p2m_mid_missing_mfn);
 	}
 
-	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
-		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
-		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
+	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+		unsigned long **mid;
+		unsigned long *mid_mfn_p;
+
+		mid = p2m_top[topidx];
+		mid_mfn_p = p2m_top_mfn_p[topidx];
+
+		/* Don't bother allocating any mfn mid levels if
+		 * they're just missing; just update the stored mfn,
+		 * since all could have changed over a migrate.
+		 */
+		if (mid == p2m_mid_missing) {
+			BUG_ON(mididx);
+			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+			continue;
+		}
+
+		if (mid_mfn_p == p2m_mid_missing_mfn) {
+			/*
+			 * XXX boot-time only!  We should never find
+			 * missing parts of the mfn tree after
+			 * runtime.  extend_brk() will BUG if we call
+			 * it too late.
+			 */
+			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+			p2m_mid_mfn_init(mid_mfn_p);
+
+			p2m_top_mfn_p[topidx] = mid_mfn_p;
+		}
+
+		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
 	}
 }
 
@@ -206,8 +374,8 @@
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-		virt_to_mfn(p2m_top_mfn_list);
-	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
+		virt_to_mfn(p2m_top_mfn);
+	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
 }
 
 /* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -215,98 +383,188 @@
 {
 	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
-	unsigned pfn;
+	unsigned long pfn;
 
-	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+	xen_max_p2m_pfn = max_pfn;
+
+	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_init(p2m_missing);
+
+	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_mid_init(p2m_mid_missing);
+
+	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_top_init(p2m_top);
+
+	/*
+	 * The domain builder gives us a pre-constructed p2m array in
+	 * mfn_list for all the pages initially given to us, so we just
+	 * need to graft that into our tree structure.
+	 */
+	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
 		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
 
-		p2m_top[topidx] = &mfn_list[pfn];
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+			p2m_mid_init(mid);
+
+			p2m_top[topidx] = mid;
+		}
+
+		/*
+		 * As long as the mfn_list has enough entries to completely
+		 * fill a p2m page, pointing into the array is ok. But if
+		 * not the entries beyond the last pfn will be undefined.
+		 */
+		if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
+			unsigned long p2midx;
+
+			p2midx = max_pfn % P2M_PER_PAGE;
+			for ( ; p2midx < P2M_PER_PAGE; p2midx++)
+				mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
+		}
+		p2m_top[topidx][mididx] = &mfn_list[pfn];
 	}
-
-	xen_build_mfn_list_list();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
 {
-	unsigned topidx, idx;
+	unsigned topidx, mididx, idx;
 
-	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
+	if (unlikely(pfn >= MAX_P2M_PFN))
 		return INVALID_P2M_ENTRY;
 
 	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
-	return p2m_top[topidx][idx];
+
+	return p2m_top[topidx][mididx][idx];
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 
-/* install a  new p2m_top page */
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
+static void *alloc_p2m_page(void)
 {
-	unsigned topidx = p2m_top_index(pfn);
-	unsigned long **pfnp, *mfnp;
-	unsigned i;
-
-	pfnp = &p2m_top[topidx];
-	mfnp = &p2m_top_mfn[topidx];
-
-	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-		p[i] = INVALID_P2M_ENTRY;
-
-	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
-		*mfnp = virt_to_mfn(p);
-		return true;
-	}
-
-	return false;
+	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
 }
 
-static void alloc_p2m(unsigned long pfn)
+static void free_p2m_page(void *p)
 {
-	unsigned long *p;
+	free_page((unsigned long)p);
+}
 
-	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-	BUG_ON(p == NULL);
+/* 
+ * Fully allocate the p2m structure for a given pfn.  We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync.  We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+	unsigned topidx, mididx;
+	unsigned long ***top_p, **mid;
+	unsigned long *top_mfn_p, *mid_mfn;
 
-	if (!install_p2mtop_page(pfn, p))
-		free_page((unsigned long)p);
+	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
+
+	top_p = &p2m_top[topidx];
+	mid = *top_p;
+
+	if (mid == p2m_mid_missing) {
+		/* Mid level is missing, allocate a new one */
+		mid = alloc_p2m_page();
+		if (!mid)
+			return false;
+
+		p2m_mid_init(mid);
+
+		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+			free_p2m_page(mid);
+	}
+
+	top_mfn_p = &p2m_top_mfn[topidx];
+	mid_mfn = p2m_top_mfn_p[topidx];
+
+	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+	if (mid_mfn == p2m_mid_missing_mfn) {
+		/* Separately check the mid mfn level */
+		unsigned long missing_mfn;
+		unsigned long mid_mfn_mfn;
+
+		mid_mfn = alloc_p2m_page();
+		if (!mid_mfn)
+			return false;
+
+		p2m_mid_mfn_init(mid_mfn);
+
+		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+		mid_mfn_mfn = virt_to_mfn(mid_mfn);
+		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+			free_p2m_page(mid_mfn);
+		else
+			p2m_top_mfn_p[topidx] = mid_mfn;
+	}
+
+	if (p2m_top[topidx][mididx] == p2m_missing) {
+		/* p2m leaf page is missing */
+		unsigned long *p2m;
+
+		p2m = alloc_p2m_page();
+		if (!p2m)
+			return false;
+
+		p2m_init(p2m);
+
+		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+			free_p2m_page(p2m);
+		else
+			mid_mfn[mididx] = virt_to_mfn(p2m);
+	}
+
+	return true;
 }
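
(Editorial aside.) The same allocate/cmpxchg/free-on-loss idiom is used at all three levels; a stripped-down user-space illustration of the pattern, using the GCC __sync builtin rather than the kernel's cmpxchg(), is sketched below. The kernel version additionally has to keep the parallel mfn tree in step, which is why alloc_p2m() repeats the dance.

#include <stdlib.h>

static void *placeholder;	/* shared "missing" page, like p2m_mid_missing */

/* Install a freshly allocated page into *slot unless another thread
 * already replaced the placeholder; if we lose the race, free ours
 * and use the winner's page instead. */
static void *install_or_reuse(void **slot)
{
	void *mine = calloc(1, 4096);
	void *old;

	if (!mine)
		return NULL;

	old = __sync_val_compare_and_swap(slot, placeholder, mine);
	if (old != placeholder) {
		free(mine);
		return old;
	}
	return mine;
}
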
 
 /* Try to install p2m mapping; fail if intermediate bits missing */
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	unsigned topidx, idx;
+	unsigned topidx, mididx, idx;
 
-	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
+	if (unlikely(pfn >= MAX_P2M_PFN)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
 		return true;
 	}
 
 	topidx = p2m_top_index(pfn);
-	if (p2m_top[topidx] == p2m_missing) {
-		if (mfn == INVALID_P2M_ENTRY)
-			return true;
-		return false;
-	}
-
+	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
-	p2m_top[topidx][idx] = mfn;
+
+	if (p2m_top[topidx][mididx] == p2m_missing)
+		return mfn == INVALID_P2M_ENTRY;
+
+	p2m_top[topidx][mididx][idx] = mfn;
 
 	return true;
 }
 
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
 		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return;
+		return true;
 	}
 
 	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
-		alloc_p2m(pfn);
+		if (!alloc_p2m(pfn))
+			return false;
 
 		if (!__set_phys_to_machine(pfn, mfn))
-			BUG();
+			return false;
 	}
+
+	return true;
 }
 
 unsigned long arbitrary_virt_to_mfn(void *vaddr)
@@ -315,6 +573,7 @@
 
 	return PFN_DOWN(maddr.maddr);
 }
+EXPORT_SYMBOL_GPL(set_phys_to_machine);
 
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
@@ -345,7 +604,8 @@
 	unsigned int level;
 
 	pte = lookup_address(address, &level);
-	BUG_ON(pte == NULL);
+	if (pte == NULL)
+		return;		/* vaddr missing */
 
 	ptev = pte_wrprotect(*pte);
 
@@ -360,7 +620,8 @@
 	unsigned int level;
 
 	pte = lookup_address(address, &level);
-	BUG_ON(pte == NULL);
+	if (pte == NULL)
+		return;		/* vaddr missing */
 
 	ptev = pte_mkwrite(*pte);
 
@@ -376,6 +637,24 @@
 	return PagePinned(page);
 }
 
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
+{
+	struct multicall_space mcs;
+	struct mmu_update *u;
+
+	mcs = xen_mc_entry(sizeof(*u));
+	u = mcs.args;
+
+	/* ptep might be kmapped when using 32-bit HIGHPTE */
+	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+	u->val = pte_val_ma(pteval);
+
+	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+
 static void xen_extend_mmu_update(const struct mmu_update *update)
 {
 	struct multicall_space mcs;
@@ -516,7 +795,34 @@
 	if (val & _PAGE_PRESENT) {
 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 		pteval_t flags = val & PTE_FLAGS_MASK;
-		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+		unsigned long mfn = pfn_to_mfn(pfn);
+
+		/*
+		 * If there's no mfn for the pfn, then just create an
+		 * empty non-present pte.  Unfortunately this loses
+		 * information about the original pfn, so
+		 * pte_mfn_to_pfn is asymmetric.
+		 */
+		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+			mfn = 0;
+			flags = 0;
+		}
+
+		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
+	}
+
+	return val;
+}
+
+static pteval_t iomap_pte(pteval_t val)
+{
+	if (val & _PAGE_PRESENT) {
+		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+		pteval_t flags = val & PTE_FLAGS_MASK;
+
+		/* We assume the pte frame number is an MFN, so
+		   just use it as-is. */
+		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 	}
 
 	return val;
@@ -524,7 +830,18 @@
 
 pteval_t xen_pte_val(pte_t pte)
 {
-	return pte_mfn_to_pfn(pte.pte);
+	pteval_t pteval = pte.pte;
+
+	/* If this is a WC pte, convert back from Xen WC to Linux WC */
+	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+		WARN_ON(!pat_enabled);
+		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+	}
+
+	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+		return pteval;
+
+	return pte_mfn_to_pfn(pteval);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 
@@ -534,9 +851,62 @@
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx  PTE flags        Linux    Xen    Default
+ * 0                     WB       WB     WB
+ * 1            PWT      WC       WT     WT
+ * 2        PCD          UC-      UC-    UC-
+ * 3        PCD PWT      UC       UC     UC
+ * 4    PAT              WB       WC     WB
+ * 5    PAT     PWT      WC       WP     WT
+ * 6    PAT PCD          UC-      UC     UC-
+ * 7    PAT PCD PWT      UC       UC     UC
+ */
+
+void xen_set_pat(u64 pat)
+{
+	/* We expect Linux to use a PAT setting of
+	 * UC UC- WC WB (ignoring the PAT flag) */
+	WARN_ON(pat != 0x0007010600070106ull);
+}
+
 pte_t xen_make_pte(pteval_t pte)
 {
-	pte = pte_pfn_to_mfn(pte);
+	phys_addr_t addr = (pte & PTE_PFN_MASK);
+
+	/* If Linux is trying to set a WC pte, then map to the Xen WC.
+	 * If _PAGE_PAT is set, then it probably means it is really
+	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+	 * things work out OK...
+	 *
+	 * (We should never see kernel mappings with _PAGE_PSE set,
+	 * but we could see hugetlbfs mappings, I think.)
+	 */
+	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+	}
+
+	/*
+	 * Unprivileged domains are allowed to do IOMAPpings for
+	 * PCI passthrough, but not map ISA space.  The ISA
+	 * mappings are just dummy local mappings to keep other
+	 * parts of the kernel happy.
+	 */
+	if (unlikely(pte & _PAGE_IOMAP) &&
+	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+		pte = iomap_pte(pte);
+	} else {
+		pte &= ~_PAGE_IOMAP;
+		pte = pte_pfn_to_mfn(pte);
+	}
+
 	return native_make_pte(pte);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
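
(Editorial aside.) Concretely, Linux selects WC with PWT alone (index 1 of its own PAT), while under Xen's fixed PAT the WC entry sits where _PAGE_PAT alone points (index 4); the two conversions above therefore just swap those encodings. A worked example, redefining the standard x86 flag bits for a standalone illustration (not new kernel code):

#define _PAGE_PWT	0x008	/* bit 3 */
#define _PAGE_PCD	0x010	/* bit 4 */
#define _PAGE_PAT	0x080	/* bit 7 */

unsigned long linux_wc = _PAGE_PWT;	/* Linux: WC == PWT, !PCD, !PAT */

/* xen_make_pte direction: PWT-only becomes PAT-only */
unsigned long xen_wc = (linux_wc & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;

/* xen_pte_val direction: PAT-only becomes PWT-only again */
unsigned long back = (xen_wc & ~_PAGE_PAT) | _PAGE_PWT;
/* back == _PAGE_PWT, i.e. the round trip is lossless for WC */
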
@@ -934,8 +1304,6 @@
    read-only, and can be pinned. */
 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 {
-	vm_unmap_aliases();
-
 	xen_mc_batch();
 
 	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1219,7 +1587,7 @@
 	spin_lock(&mm->page_table_lock);
 
 	/* pgd may not be pinned in the error exit path of execve */
-	if (xen_page_pinned(mm->pgd))
+	if (xen_page_pinned(mm->pgd) && !mm->context.has_foreign_mappings)
 		xen_pgd_unpin(mm);
 
 	spin_unlock(&mm->page_table_lock);
@@ -1288,12 +1656,19 @@
 	preempt_enable();
 }
 
+/*
+ * Flush tlb on other cpus.  Xen can do this via a single hypercall
+ * rather than explicit IPIs, which has the nice property of avoiding
+ * any cpus which don't actually have dirty tlbs.  Unfortunately it
+ * doesn't give us an opportunity to kick out cpus which are in lazy
+ * tlb state, so we may end up reflushing some cpus unnecessarily.
+ */
 static void xen_flush_tlb_others(const struct cpumask *cpus,
 				 struct mm_struct *mm, unsigned long va)
 {
 	struct {
 		struct mmuext_op op;
-		DECLARE_BITMAP(mask, NR_CPUS);
+		DECLARE_BITMAP(mask, num_processors);
 	} *args;
 	struct multicall_space mcs;
 
@@ -1417,6 +1792,13 @@
 	return ret;
 }
 
+void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd)
+{
+	if (xen_page_pinned(pgd))
+		__xen_pgd_unpin(mm, pgd);
+
+}
+
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
@@ -1445,13 +1827,29 @@
 }
 #endif
 
-#ifdef CONFIG_X86_32
 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
-	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+	unsigned long pfn = pte_pfn(pte);
+	pte_t oldpte = *ptep;
+
+	if (pte_flags(oldpte) & _PAGE_PRESENT) {
+		/* Don't allow existing IO mappings to be overridden */
+		if (pte_flags(oldpte) & _PAGE_IOMAP)
+			pte = oldpte;
+
+		/* Don't allow _PAGE_RW to be set on existing pte */
 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
 			       pte_val_ma(pte));
+	}
+
+	/*
+	 * If the new pfn is within the range of the newly allocated
+	 * kernel pagetable, and it isn't being mapped into an
+	 * early_ioremap fixmap slot, make sure it is RO.
+	 */
+	if (!is_early_ioremap_ptep(ptep) &&
+	    pfn >= e820_table_start && pfn < e820_table_end)
+		pte = pte_wrprotect(pte);
 
 	return pte;
 }
@@ -1464,7 +1862,6 @@
 
 	xen_set_pte(ptep, pte);
 }
-#endif
 
 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
 {
@@ -1517,7 +1914,6 @@
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
-		vm_unmap_aliases();
 		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
@@ -1620,6 +2016,7 @@
 	return __ka(m2p(maddr));
 }
 
+/* Set the page permissions on identity-mapped pages */
 static void set_page_prot(void *addr, pgprot_t prot)
 {
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
@@ -1635,6 +2032,9 @@
 	unsigned ident_pte;
 	unsigned long pfn;
 
+	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
+				      PAGE_SIZE);
+
 	ident_pte = 0;
 	pfn = 0;
 	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1645,7 +2045,7 @@
 			pte_page = m2v(pmd[pmdidx].pmd);
 		else {
 			/* Check for free pte pages */
-			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+			if (ident_pte == LEVEL1_IDENT_ENTRIES)
 				break;
 
 			pte_page = &level1_ident_pgt[ident_pte];
@@ -1677,6 +2077,20 @@
 	set_page_prot(pmd, PAGE_KERNEL_RO);
 }
 
+void __init xen_setup_machphys_mapping(void)
+{
+	struct xen_machphys_mapping mapping;
+	unsigned long machine_to_phys_nr_ents;
+
+	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+		machine_to_phys_nr_ents = mapping.max_mfn + 1;
+	} else {
+		machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+	}
+	machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+}
+
 #ifdef CONFIG_X86_64
 static void convert_pfn_mfn(void *v)
 {
@@ -1768,12 +2182,15 @@
 	return pgd;
 }
 #else	/* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
 
 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 					 unsigned long max_pfn)
 {
 	pmd_t *kernel_pmd;
+	int i;
+
+	level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
 	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
 				  xen_start_info->nr_pt_frames * PAGE_SIZE +
@@ -1785,6 +2202,20 @@
 	xen_map_identity_early(level2_kernel_pgt, max_pfn);
 
 	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+
+	/*
+	 * When running a 32 bit domain 0 on a 64 bit hypervisor a
+	 * pinned L3 (such as the initial pgd here) contains bits
+	 * which are reserved in the PAE layout but not in the 64 bit
+	 * layout. Unfortunately some versions of the hypervisor
+	 * (incorrectly) validate compat mode guests against the PAE
+	 * layout and hence will not allow such a pagetable to be
+	 * pinned by the guest. Therefore we mask off only the PFN and
+	 * Present bits of the supplied L3.
+	 */
+	for (i = 0; i < PTRS_PER_PGD; i++)
+		swapper_pg_dir[i].pgd &= (PTE_PFN_MASK | _PAGE_PRESENT);
+
 	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
 			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
 
@@ -1807,6 +2238,8 @@
 }
 #endif	/* CONFIG_X86_64 */
 
+static unsigned char dummy_ioapic_mapping[PAGE_SIZE] __page_aligned_bss;
+
 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 {
 	pte_t pte;
@@ -1836,9 +2269,26 @@
 		pte = pfn_pte(phys, prot);
 		break;
 
-	default:
+#ifdef CONFIG_X86_IO_APIC
+	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+		/*
+		 * We just don't map the IO APIC - all access is via
+		 * hypercalls.  Keep the address in the pte for reference.
+		 */
+		pte = pfn_pte(PFN_DOWN(__pa(dummy_ioapic_mapping)), PAGE_KERNEL);
+		break;
+#endif
+
+	case FIX_PARAVIRT_BOOTMAP:
+		/* This is an MFN, but it isn't an IO mapping from the
+		   IO domain */
 		pte = mfn_pte(phys, prot);
 		break;
+
+	default:
+		/* By default, set_fixmap is used for hardware mappings */
+		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+		break;
 	}
 
 	__native_set_fixmap(idx, pte);
@@ -1853,6 +2303,29 @@
 #endif
 }
 
+__init void xen_ident_map_ISA(void)
+{
+	unsigned long pa;
+
+	/*
+	 * If we're dom0, then linear map the ISA machine addresses into
+	 * the kernel's address space.
+	 */
+	if (!xen_initial_domain())
+		return;
+
+	xen_raw_printk("Xen: setup ISA identity maps\n");
+
+	for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
+		pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
+
+		if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
+			BUG();
+	}
+
+	xen_flush_tlb();
+}
+
 static __init void xen_post_allocator_init(void)
 {
 	pv_mmu_ops.set_pte = xen_set_pte;
@@ -1915,11 +2388,7 @@
 	.kmap_atomic_pte = xen_kmap_atomic_pte,
 #endif
 
-#ifdef CONFIG_X86_64
-	.set_pte = xen_set_pte,
-#else
 	.set_pte = xen_set_pte_init,
-#endif
 	.set_pte_at = xen_set_pte_at,
 	.set_pmd = xen_set_pmd_hyper,
 
@@ -1970,6 +2439,301 @@
 	pv_mmu_ops = xen_mmu_ops;
 }
 
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+				unsigned long *in_frames,
+				unsigned long *out_frames)
+{
+	int i;
+	struct multicall_space mcs;
+
+	xen_mc_batch();
+	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+		mcs = __xen_mc_entry(0);
+
+		if (in_frames)
+			in_frames[i] = virt_to_mfn(vaddr);
+
+		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+		if (out_frames)
+			out_frames[i] = virt_to_pfn(vaddr);
+	}
+	xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+				     unsigned long *mfns,
+				     unsigned long first_mfn)
+{
+	unsigned i, limit;
+	unsigned long mfn;
+
+	xen_mc_batch();
+
+	limit = 1u << order;
+	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+		struct multicall_space mcs;
+		unsigned flags;
+
+		mcs = __xen_mc_entry(0);
+		if (mfns)
+			mfn = mfns[i];
+		else
+			mfn = first_mfn + i;
+
+		if (i < (limit - 1))
+			flags = 0;
+		else {
+			if (order == 0)
+				flags = UVMF_INVLPG | UVMF_ALL;
+			else
+				flags = UVMF_TLB_FLUSH | UVMF_ALL;
+		}
+
+		MULTI_update_va_mapping(mcs.mc, vaddr,
+				mfn_pte(mfn, PAGE_KERNEL), flags);
+
+		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+	}
+
+	xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment.  Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+			       unsigned long *pfns_in,
+			       unsigned long extents_out, unsigned int order_out,
+			       unsigned long *mfns_out,
+			       unsigned int address_bits)
+{
+	long rc;
+	int success;
+
+	struct xen_memory_exchange exchange = {
+		.in = {
+			.nr_extents   = extents_in,
+			.extent_order = order_in,
+			.extent_start = pfns_in,
+			.domid        = DOMID_SELF
+		},
+		.out = {
+			.nr_extents   = extents_out,
+			.extent_order = order_out,
+			.extent_start = mfns_out,
+			.address_bits = address_bits,
+			.domid        = DOMID_SELF
+		}
+	};
+
+	BUG_ON(extents_in << order_in != extents_out << order_out);
+
+	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+	success = (exchange.nr_exchanged == extents_in);
+
+	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+	BUG_ON(success && (rc != 0));
+
+	return success;
+}
+
+int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
+				 unsigned int address_bits)
+{
+	unsigned long *in_frames = discontig_frames, out_frame;
+	unsigned long  flags;
+	int            success;
+
+	/*
+	 * Currently an auto-translated guest will not perform I/O, nor will
+	 * it require PAE page directories below 4GB. Therefore any calls to
+	 * this function are redundant and can be ignored.
+	 */
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
+	if (unlikely(order > MAX_CONTIG_ORDER))
+		return -ENOMEM;
+
+	memset((void *) vstart, 0, PAGE_SIZE << order);
+
+	spin_lock_irqsave(&xen_reservation_lock, flags);
+
+	/* 1. Zap current PTEs, remembering MFNs. */
+	xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+	/* 2. Get a new contiguous memory extent. */
+	out_frame = virt_to_pfn(vstart);
+	success = xen_exchange_memory(1UL << order, 0, in_frames,
+				      1, order, &out_frame,
+				      address_bits);
+
+	/* 3. Map the new extent in place of old pages. */
+	if (success)
+		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+	else
+		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+	spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+	return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
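
(Editorial aside.) A hedged usage sketch of the exported pair: the call sequence is what the interface above implies, but the header locations and the buffer handling around it are illustrative assumptions, not taken from this patch.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <xen/xen-ops.h>	/* assumed home of the prototypes */

/* Sketch: obtain a 64 KB buffer whose backing machine frames are
 * contiguous and below 4 GB, e.g. for a legacy DMA engine. */
static int example_contig_buffer(void)
{
	unsigned int order = 4;		/* 16 pages == 64 KB */
	unsigned long vaddr = __get_free_pages(GFP_KERNEL, order);

	if (!vaddr)
		return -ENOMEM;

	if (xen_create_contiguous_region(vaddr, order, 32)) {
		free_pages(vaddr, order);
		return -ENOMEM;
	}

	/* ... hand machine addresses of the buffer to the device ... */

	xen_destroy_contiguous_region(vaddr, order);
	free_pages(vaddr, order);
	return 0;
}
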
+
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+{
+	unsigned long *out_frames = discontig_frames, in_frame;
+	unsigned long  flags;
+	int success;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	if (unlikely(order > MAX_CONTIG_ORDER))
+		return;
+
+	memset((void *) vstart, 0, PAGE_SIZE << order);
+
+	spin_lock_irqsave(&xen_reservation_lock, flags);
+
+	/* 1. Find start MFN of contiguous extent. */
+	in_frame = virt_to_mfn(vstart);
+
+	/* 2. Zap current PTEs. */
+	xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+	/* 3. Do the exchange for non-contiguous MFNs. */
+	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+					0, out_frames, 0);
+
+	/* 4. Map new pages in place of old pages. */
+	if (success)
+		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+	else
+		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+	spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+	unsigned long mfn;
+	pgprot_t prot;
+	struct mmu_update *mmu_update;
+};
+
+static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+				 unsigned long addr, void *data)
+{
+	struct remap_data *rmd = data;
+	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+
+	rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+	rmd->mmu_update->val = pte_val_ma(pte);
+	rmd->mmu_update++;
+
+	return 0;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       unsigned long mfn, int nr,
+			       pgprot_t prot, unsigned domid)
+{
+	struct remap_data rmd;
+	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+	int batch;
+	unsigned long range;
+	int err = 0;
+
+	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+	rmd.mfn = mfn;
+	rmd.prot = prot;
+
+	while (nr) {
+		batch = min(REMAP_BATCH_SIZE, nr);
+		range = (unsigned long)batch << PAGE_SHIFT;
+
+		rmd.mmu_update = mmu_update;
+		err = apply_to_page_range(vma->vm_mm, addr, range,
+					  remap_area_mfn_pte_fn, &rmd);
+		if (err)
+			goto out;
+
+		err = -EFAULT;
+		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
+			goto out;
+
+		nr -= batch;
+		addr += range;
+	}
+
+	err = 0;
+out:
+
+	flush_tlb_all();
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
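
(Editorial aside.) A hedged sketch of how a privcmd-style mmap handler might drive this export; the prototype's header location and the surrounding handler are assumptions for illustration only.

#include <linux/mm.h>
#include <xen/xen-ops.h>	/* assumed home of the prototype */

/* Map 'nr' foreign machine frames, starting at 'mfn' and owned by
 * 'domid', at the base of a userspace vma. */
static int example_map_foreign(struct vm_area_struct *vma,
			       unsigned long mfn, int nr, unsigned domid)
{
	return xen_remap_domain_mfn_range(vma, vma->vm_start, mfn, nr,
					  vma->vm_page_prot, domid);
}
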
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+	struct xen_hvm_pagetable_dying a;
+	int rc;
+
+	a.domid = DOMID_SELF;
+	a.gpa = __pa(mm->pgd);
+	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+	WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+	struct xen_hvm_pagetable_dying a;
+	int rc = 0;
+
+	a.domid = DOMID_SELF;
+	a.gpa = 0x00;
+	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+	if (rc < 0) {
+		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+		return 0;
+	}
+	return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+	if (is_pagetable_dying_supported())
+		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+}
+#endif
+
 #ifdef CONFIG_XEN_DEBUG_FS
 
 static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7..537bb9a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,7 +12,6 @@
 
 
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
 
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
@@ -60,4 +59,5 @@
 unsigned long xen_read_cr2_direct(void);
 
 extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
 #endif	/* _XEN_MMU_H */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 0000000..4d55524
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,52 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+
+#include <asm/xen/hypervisor.h>
+
+int xen_swiotlb __read_mostly;
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+	.mapping_error = xen_swiotlb_dma_mapping_error,
+	.alloc_coherent = xen_swiotlb_alloc_coherent,
+	.free_coherent = xen_swiotlb_free_coherent,
+	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
+	.sync_single_range_for_cpu = xen_swiotlb_sync_single_range_for_cpu,
+	.sync_single_range_for_device = xen_swiotlb_sync_single_range_for_device,
+	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+	.map_sg = xen_swiotlb_map_sg_attrs,
+	.unmap_sg = xen_swiotlb_unmap_sg_attrs,
+	.map_page = xen_swiotlb_map_page,
+	.unmap_page = xen_swiotlb_unmap_page,
+	.dma_supported = xen_swiotlb_dma_supported,
+};
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+
+	if (xen_pv_domain() && (xen_initial_domain() || swiotlb))
+		xen_swiotlb = 1;
+
+	/* If we are running under Xen, we MUST disable the native SWIOTLB */
+	if (xen_pv_domain())
+		swiotlb = 0;
+
+	return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+	if (xen_swiotlb) {
+		xen_swiotlb_init(1);
+		dma_ops = &xen_swiotlb_dma_ops;
+	}
+}
diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c
new file mode 100644
index 0000000..8ca31f1
--- /dev/null
+++ b/arch/x86/xen/pci.c
@@ -0,0 +1,296 @@
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/slab.h>
+
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
+#include <asm/pci_x86.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/pci.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+
+int xen_register_pirq(u32 gsi, int triggering)
+{
+	int rc, irq;
+	struct physdev_map_pirq map_irq;
+	int shareable = 0;
+	char *name;
+
+	if (!xen_pv_domain())
+		return -1;
+
+	if (triggering == ACPI_EDGE_SENSITIVE) {
+		shareable = 0;
+		name = "ioapic-edge";
+	} else {
+		shareable = 1;
+		name = "ioapic-level";
+	}
+
+	irq = xen_allocate_pirq(gsi, shareable, name);
+
+	printk(KERN_DEBUG "xen: --> irq=%d\n", irq);
+
+	if (irq < 0)
+		goto out;
+
+	map_irq.domid = DOMID_SELF;
+	map_irq.type = MAP_PIRQ_TYPE_GSI;
+	map_irq.index = gsi;
+	map_irq.pirq = irq;
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+	if (rc) {
+		printk(KERN_WARNING "xen map irq failed %d\n", rc);
+		return -1;
+	}
+
+out:
+	return irq;
+}
+
+int xen_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	int rc, irq;
+	struct physdev_setup_gsi setup_gsi;
+
+	if (!xen_pv_domain())
+		return -1;
+
+	printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+			gsi, triggering, polarity);
+
+	irq = xen_register_pirq(gsi, triggering);
+
+	setup_gsi.gsi = gsi;
+	setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+	setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+	if (rc == -EEXIST)
+		printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+	else if (rc) {
+		printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+				gsi, rc);
+	}
+
+	return irq;
+}
+
+#ifdef CONFIG_ACPI
+#define BAD_MADT_ENTRY(entry, end) (					    \
+		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
+		((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
+
+
+static int __init
+xen_acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+			   const unsigned long end)
+{
+	struct acpi_madt_interrupt_override *intsrc = NULL;
+
+	intsrc = (struct acpi_madt_interrupt_override *)header;
+
+	if (BAD_MADT_ENTRY(intsrc, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
+		int gsi;
+		int trigger, polarity;
+
+		trigger = intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK;
+		polarity = intsrc->inti_flags & ACPI_MADT_POLARITY_MASK;
+
+		/* Command-line over-ride via acpi_sci= */
+		if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
+			trigger = acpi_sci_flags & ACPI_MADT_TRIGGER_MASK;
+
+		if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
+			polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
+
+		printk("xen: sci override: source_irq=%d global_irq=%d trigger=%x polarity=%x\n",
+			intsrc->source_irq, intsrc->global_irq,
+			trigger, polarity);
+
+		switch (polarity) {
+		case ACPI_MADT_POLARITY_CONFORMS:
+		case ACPI_MADT_POLARITY_ACTIVE_LOW:
+			polarity = ACPI_ACTIVE_LOW;
+			break;
+
+		case ACPI_MADT_POLARITY_ACTIVE_HIGH:
+			polarity = ACPI_ACTIVE_HIGH;
+			break;
+
+		default:
+			return 0;
+		}
+
+		switch (trigger) {
+		case ACPI_MADT_TRIGGER_CONFORMS:
+		case ACPI_MADT_TRIGGER_LEVEL:
+			trigger = ACPI_LEVEL_SENSITIVE;
+			break;
+
+		case ACPI_MADT_TRIGGER_EDGE:
+			trigger = ACPI_EDGE_SENSITIVE;
+			break;
+
+		default:
+			return 0;
+		}
+
+		gsi = xen_register_gsi(intsrc->global_irq,
+				       trigger, polarity);
+		/*
+		 * stash over-ride to indicate we've been here
+		 * and for later update of acpi_gbl_FADT
+		 */
+		acpi_sci_override_gsi = gsi;
+
+		printk("xen: acpi sci %d\n", gsi);
+	}
+
+	return 0;
+}
+
+static __init void xen_setup_acpi_sci(void)
+{
+  acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+			xen_acpi_parse_int_src_ovr,
+			nr_irqs);
+}
+#else
+static __init void xen_setup_acpi_sci(void)
+{
+}
+#endif
+
+void __init xen_setup_pirqs(void)
+{
+	int irq;
+
+	if (0 == nr_ioapics) {
+		for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+			xen_allocate_pirq(irq, 0, "xt-pic");
+		return;
+	}
+
+	/* Pre-allocate legacy irqs */
+	for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+		int trigger, polarity;
+
+		if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
+			continue;
+
+		xen_register_pirq(irq,
+			trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+	}
+
+	xen_setup_acpi_sci();
+}
+
+#ifdef CONFIG_PCI_MSI
+int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	int irq, ret;
+	struct msi_desc *msidesc;
+
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
+		irq = xen_create_msi_irq(dev, msidesc, type);
+		if (irq < 0)
+			return -1;
+
+		ret = set_irq_msi(irq, msidesc);
+		if (ret)
+			goto error;
+	}
+	return 0;
+
+error:
+	xen_destroy_irq(irq);
+	return ret;
+}
+#endif
+
+struct xen_device_domain_owner {
+	domid_t domain;
+	struct pci_dev *dev;
+	struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
+
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+	struct xen_device_domain_owner *owner;
+
+	list_for_each_entry(owner, &dev_domain_list, list) {
+		if (owner->dev == dev)
+			return owner;
+	}
+	return NULL;
+}
+
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+	struct xen_device_domain_owner *owner;
+	int domain = -ENODEV;
+
+	spin_lock(&dev_domain_list_spinlock);
+	owner = find_device(dev);
+	if (owner)
+		domain = owner->domain;
+	spin_unlock(&dev_domain_list_spinlock);
+	return domain;
+}
+EXPORT_SYMBOL(xen_find_device_domain_owner);
+
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+	struct xen_device_domain_owner *owner;
+
+	owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+	if (!owner)
+		return -ENODEV;
+
+	spin_lock(&dev_domain_list_spinlock);
+	if (find_device(dev)) {
+		spin_unlock(&dev_domain_list_spinlock);
+		kfree(owner);
+		return -EEXIST;
+	}
+	owner->domain = domain;
+	owner->dev = dev;
+	list_add_tail(&owner->list, &dev_domain_list);
+	spin_unlock(&dev_domain_list_spinlock);
+	return 0;
+}
+EXPORT_SYMBOL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+	struct xen_device_domain_owner *owner;
+
+	spin_lock(&dev_domain_list_spinlock);
+	owner = find_device(dev);
+	if (!owner) {
+		spin_unlock(&dev_domain_list_spinlock);
+		return -ENODEV;
+	}
+	list_del(&owner->list);
+	spin_unlock(&dev_domain_list_spinlock);
+	kfree(owner);
+	return 0;
+}
+EXPORT_SYMBOL(xen_unregister_device_domain_owner);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 0000000..0f45638
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <xen/platform_pci.h>
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+/* store the value of xen_emul_unplug after the unplug is done */
+int xen_platform_pci_unplug;
+EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
+#ifdef CONFIG_XEN_PVHVM
+static int xen_emul_unplug;
+
+static int __init check_platform_magic(void)
+{
+	short magic;
+	char protocol;
+
+	magic = inw(XEN_IOPORT_MAGIC);
+	if (magic != XEN_IOPORT_MAGIC_VAL) {
+		printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+		return XEN_PLATFORM_ERR_MAGIC;
+	}
+
+	protocol = inb(XEN_IOPORT_PROTOVER);
+
+	printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+			protocol);
+
+	switch (protocol) {
+	case 1:
+		outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+		outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+		if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+			printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+			return XEN_PLATFORM_ERR_BLACKLIST;
+		}
+		break;
+	default:
+		printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+		return XEN_PLATFORM_ERR_PROTOCOL;
+	}
+
+	return 0;
+}
+
+void __init xen_unplug_emulated_devices(void)
+{
+	int r;
+
+	/* user explicitly requested no unplug */
+	if (xen_emul_unplug & XEN_UNPLUG_NEVER)
+		return;
+	/* check the version of the xen platform PCI device */
+	r = check_platform_magic();
+	/* If the version matches, enable the Xen platform PCI driver.
+	 * Also enable the Xen platform PCI driver if the host does
+	 * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
+	 * but the user told us that unplugging is unnecessary. */
+	if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+			(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
+		return;
+	/* Set the default value of xen_emul_unplug depending on whether or
+	 * not the Xen PV frontends and the Xen platform PCI driver have
+	 * been compiled for this kernel (modules or built-in are both OK). */
+	if (!xen_emul_unplug) {
+		if (xen_must_unplug_nics()) {
+			printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+					"been compiled for this kernel: unplug emulated NICs.\n");
+			xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+		}
+		if (xen_must_unplug_disks()) {
+			printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+					"been compiled for this kernel: unplug emulated disks.\n"
+					"You might have to change the root device\n"
+					"from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+					"in your root= kernel command line option\n");
+			xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+		}
+	}
+	/* Now unplug the emulated devices */
+	if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
+		outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+	xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+	char *p, *q;
+	int l;
+
+	for (p = arg; p; p = q) {
+		q = strchr(p, ',');
+		if (q) {
+			l = q - p;
+			q++;
+		} else {
+			l = strlen(p);
+		}
+		if (!strncmp(p, "all", l))
+			xen_emul_unplug |= XEN_UNPLUG_ALL;
+		else if (!strncmp(p, "ide-disks", l))
+			xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+		else if (!strncmp(p, "aux-ide-disks", l))
+			xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+		else if (!strncmp(p, "nics", l))
+			xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+		else if (!strncmp(p, "unnecessary", l))
+			xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
+		else if (!strncmp(p, "never", l))
+			xen_emul_unplug |= XEN_UNPLUG_NEVER;
+		else
+			printk(KERN_WARNING "unrecognised option '%s' "
+				 "in parameter 'xen_emul_unplug'\n", p);
+	}
+	return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f..c011a7d 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,6 +10,7 @@
 #include <linux/pm.h>
 
 #include <asm/elf.h>
+#include <asm/hpet.h>
 #include <asm/vdso.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
@@ -19,7 +20,9 @@
 
 #include <xen/page.h>
 #include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
+#include <xen/interface/memory.h>
 #include <xen/features.h>
 
 #include "xen-ops.h"
@@ -32,25 +35,192 @@
 extern void xen_syscall_target(void);
 extern void xen_syscall32_target(void);
 
+/* Amount of extra memory space we add to the e820 ranges */
+phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+
+/* 
+ * The maximum amount of extra memory compared to the base size.  The
+ * main scaling factor is the size of struct page.  At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ * 
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO		(10)
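
(Editorial aside.) A back-of-envelope check of that ratio, assuming a 64-byte struct page and 4 KiB pages (both assumptions; the real sizes depend on config):

#include <stdio.h>

int main(void)
{
	double struct_page = 64.0;	/* assumed bytes per struct page */
	double page_size   = 4096.0;	/* assumed 4 KiB pages */
	double ratio       = 10.0;	/* EXTRA_MEM_RATIO */

	/* Each extra page needs a struct page in *base* memory, so the
	 * fraction of base RAM eaten by metadata is ratio * 64/4096. */
	printf("%.1f%% of base RAM spent on struct pages\n",
	       100.0 * ratio * struct_page / page_size);	/* ~15.6% */
	return 0;
}
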
+
+static __init void xen_add_extra_mem(unsigned long pages)
+{
+	u64 size = (u64)pages * PAGE_SIZE;
+	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
+
+	if (!pages)
+		return;
+
+	e820_add_region(extra_start, size, E820_RAM);
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+	reserve_early(extra_start, extra_start + size, "XEN EXTRA");
+
+	xen_extra_mem_size += size;
+
+	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+}
+
+static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
+					      phys_addr_t end_addr)
+{
+	struct xen_memory_reservation reservation = {
+		.address_bits = 0,
+		.extent_order = 0,
+		.domid        = DOMID_SELF
+	};
+	unsigned long start, end;
+	unsigned long len = 0;
+	unsigned long pfn;
+	int ret;
+
+	start = PFN_UP(start_addr);
+	end = PFN_DOWN(end_addr);
+
+	if (end <= start)
+		return 0;
+
+	printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
+	       start, end);
+	for(pfn = start; pfn < end; pfn++) {
+		unsigned long mfn = pfn_to_mfn(pfn);
+
+		/* Make sure pfn exists to start with */
+		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+			continue;
+
+		set_xen_guest_handle(reservation.extent_start, &mfn);
+		reservation.nr_extents = 1;
+
+		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+					   &reservation);
+		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
+		     start, end, ret);
+		if (ret == 1) {
+			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+			len++;
+		}
+	}
+	printk(KERN_CONT "%ld pages freed\n", len);
+
+	return len;
+}
+
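+/*
+ * Release to Xen any pages above the low 1MB and below max_pfn that fall
+ * in the gaps between (or beyond) the e820 regions.
+ */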
+static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
+						     const struct e820map *e820)
+{
+	phys_addr_t max_addr = PFN_PHYS(max_pfn);
+	phys_addr_t last_end = ISA_END_ADDRESS;
+	unsigned long released = 0;
+	int i;
+
+	/* Free any unused memory above the low 1Mbyte. */
+	for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
+		phys_addr_t end = e820->map[i].addr;
+		end = min(max_addr, end);
+
+		if (last_end < end)
+			released += xen_release_chunk(last_end, end);
+		last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
+	}
+
+	if (last_end < max_addr)
+		released += xen_release_chunk(last_end, max_addr);
+
+	printk(KERN_INFO "released %ld pages of unused memory\n", released);
+	return released;
+}
 
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
-
 char * __init xen_memory_setup(void)
 {
+	static struct e820entry map[E820MAX] __initdata;
+
 	unsigned long max_pfn = xen_start_info->nr_pages;
+	unsigned long long mem_end;
+	int rc;
+	struct xen_memory_map memmap;
+	unsigned long extra_pages = 0;
+	unsigned long extra_limit;
+	int op;
+	int i;
 
 	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+	mem_end = PFN_PHYS(max_pfn);
+
+	memmap.nr_entries = E820MAX;
+	set_xen_guest_handle(memmap.buffer, map);
+
+	op = xen_initial_domain() ?
+		XENMEM_machine_memory_map :
+		XENMEM_memory_map;
+	rc = HYPERVISOR_memory_op(op, &memmap);
+	if (rc == -ENOSYS) {
+		BUG_ON(xen_initial_domain());
+		memmap.nr_entries = 1;
+		map[0].addr = 0ULL;
+		map[0].size = mem_end;
+		/* 8MB slack (to balance backend allocations). */
+		map[0].size += 8ULL << 20;
+		map[0].type = E820_RAM;
+		rc = 0;
+	}
+	BUG_ON(rc);
 
 	e820.nr_map = 0;
+	xen_extra_mem_start = mem_end;
+	for (i = 0; i < memmap.nr_entries; i++) {
+		unsigned long long end;
 
-	e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
+		/* Guard against non-page aligned E820 entries. */
+		if (map[i].type == E820_RAM)
+			map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
+
+		end = map[i].addr + map[i].size;
+		if (map[i].type == E820_RAM && end > mem_end) {
+			/* RAM off the end - may be partially included */
+			u64 delta = min(map[i].size, end - mem_end);
+
+			map[i].size -= delta;
+			end -= delta;
+
+			extra_pages += PFN_DOWN(delta);
+			/*
+			 * Mark RAM below 4GB that does not belong to us as
+			 * unusable.  This prevents the "System RAM" address
+			 * space from being handed out as an I/O resource
+			 * (which happens when allocate_resource() is called).
+			 */
+			if (delta &&
+				(xen_initial_domain() && end < 0x100000000ULL))
+				e820_add_region(end, delta, E820_UNUSABLE);
+		}
+
+		if (map[i].size > 0 && end > xen_extra_mem_start)
+			xen_extra_mem_start = end;
+
+		/* Add region if any remains */
+		if (map[i].size > 0)
+			e820_add_region(map[i].addr, map[i].size, map[i].type);
+	}
 
 	/*
-	 * Even though this is normal, usable memory under Xen, reserve
-	 * ISA memory anyway because too many things think they can poke
+	 * In domU, the ISA region is normal, usable memory, but we
+	 * reserve ISA memory anyway because too many things poke
 	 * about in there.
+	 *
+	 * In Dom0, the host E820 information can leave gaps in the
+	 * ISA range, which would cause us to release those pages.  To
+	 * avoid this, we unconditionally reserve them here.
 	 */
 	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
 			E820_RESERVED);
@@ -67,23 +237,32 @@
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
+	extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
+
+	/*
+	 * Clamp the amount of extra memory to EXTRA_MEM_RATIO times
+	 * the base size.  On non-highmem systems, the base
+	 * size is the full initial memory allocation; on highmem it
+	 * is limited to the max size of lowmem, so that it doesn't
+	 * get completely filled.
+	 *
+	 * In principle there could be a problem in lowmem systems if
+	 * the initial memory is also very large with respect to
+	 * lowmem, but we won't try to deal with that here.
+	 */
+	extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+			  max_pfn + extra_pages);
+
+	if (extra_limit >= max_pfn)
+		extra_pages = extra_limit - max_pfn;
+	else
+		extra_pages = 0;
+
+	xen_add_extra_mem(extra_pages);
+
 	return "Xen";
 }
 
-static void xen_idle(void)
-{
-	local_irq_disable();
-
-	if (need_resched())
-		local_irq_enable();
-	else {
-		current_thread_info()->status &= ~TS_POLLING;
-		smp_mb__after_clear_bit();
-		safe_halt();
-		current_thread_info()->status |= TS_POLLING;
-	}
-}
-
 /*
  * Set the bit indicating "nosegneg" library variants should be used.
  * We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -156,6 +335,8 @@
 	struct physdev_set_iopl set_iopl;
 	int rc;
 
+	xen_panic_handler_init();
+
 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
 
@@ -182,13 +363,21 @@
 	}
 #endif
 
+	/*
+	 * The Xen hypervisor uses the HPET to wake CPUs from deep C-states,
+	 * so HPET usage in dom0 must be forbidden.
+	 */
+	disable_hpet(NULL);
+
 	memcpy(boot_command_line, xen_start_info->cmd_line,
 	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
 
-	pm_idle = xen_idle;
-
-	paravirt_disable_iospace();
+	/* Set up idle, making sure it calls safe_halt() pvop */
+#ifdef CONFIG_X86_32
+	boot_cpu_data.hlt_works_ok = 1;
+#endif
+	pm_idle = default_idle;
 
 	fiddle_vdso();
 }
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a96204a..a873872 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -179,6 +179,7 @@
 static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
 	unsigned cpu;
+	unsigned int i;
 
 	if (skip_ioapic_setup) {
 		char *m = (max_cpus == 0) ?
@@ -193,6 +194,12 @@
 
 	smp_store_cpu_info(0);
 	cpu_data(0).x86_max_cores = 1;
+
+	for_each_possible_cpu(i) {
+		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
+	}
 	set_cpu_sibling_map(0);
 
 	if (xen_smp_intr_init(0))
@@ -309,6 +316,8 @@
 	xen_setup_timer(cpu);
 	xen_init_lock_cpu(cpu);
 
+	cpumask_set_cpu(cpu, cpu_callout_mask);
+
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
 	/* make sure interrupts start blocked */
@@ -402,6 +411,8 @@
 	load_cr3(swapper_pg_dir);
 	/* should set up a minimal gdt */
 
+	set_cpu_online(cpu, false);
+
 	HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
 	BUG();
 }
@@ -489,3 +500,41 @@
 	xen_fill_possible_map();
 	xen_init_spinlocks();
 }
+
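+/*
+ * HVM guests use the native SMP bring-up; on top of that we set up the
+ * Xen IPI event channels for CPU0 and, when the vector callback mechanism
+ * is available, the PV spinlock support.
+ */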
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	if (!xen_have_vector_callback)
+		return;
+	xen_init_lock_cpu(0);
+	xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+	int rc;
+	rc = native_cpu_up(cpu);
+	WARN_ON(xen_smp_intr_init(cpu));
+	return rc;
+}
+
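+/*
+ * Unbind the per-cpu Xen IPI event channels before the native path takes
+ * the CPU offline.
+ */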
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+	native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_up = xen_hvm_cpu_up;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c6611..6c43a90f 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,7 +26,21 @@
 		BUG();
 }
 
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
+{
+#ifdef CONFIG_XEN_PVHVM
+	int cpu;
+	xen_hvm_init_shared_info();
+	xen_callback_vector();
+	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+		for_each_online_cpu(cpu) {
+			xen_setup_runstate_info(cpu);
+		}
+	}
+#endif
+}
+
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 8f92188..07efc5b 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -19,6 +19,7 @@
 #include <asm/xen/hypercall.h>
 
 #include <xen/events.h>
+#include <xen/features.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
 
@@ -155,7 +156,7 @@
 }
 
 /* Get the TSC speed from Xen */
-unsigned long xen_tsc_khz(void)
+static unsigned long xen_tsc_khz(void)
 {
 	struct pvclock_vcpu_time_info *info =
 		&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -190,7 +191,7 @@
 	put_cpu_var(xen_vcpu);
 }
 
-unsigned long xen_get_wallclock(void)
+static unsigned long xen_get_wallclock(void)
 {
 	struct timespec ts;
 
@@ -198,10 +199,24 @@
 	return ts.tv_sec;
 }
 
-int xen_set_wallclock(unsigned long now)
+static int xen_set_wallclock(unsigned long now)
 {
+	struct xen_platform_op op;
+	int rc;
+
 	/* do nothing for domU */
-	return -1;
+	if (!xen_initial_domain())
+		return -1;
+
+	op.cmd = XENPF_settime;
+	op.u.settime.secs = now;
+	op.u.settime.nsecs = 0;
+	op.u.settime.system_time = xen_clocksource_read();
+
+	rc = HYPERVISOR_dom0_op(&op);
+	WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
+
+	return rc;
 }
 
 static struct clocksource xen_clocksource __read_mostly = {
@@ -405,6 +420,8 @@
 
 	evt->cpumask = cpumask_of(cpu);
 	evt->irq = irq;
+
+	xen_setup_runstate_info(cpu);
 }
 
 void xen_teardown_timer(int cpu)
@@ -437,7 +454,7 @@
 	}
 }
 
-__init void xen_time_init(void)
+static __init void xen_time_init(void)
 {
 	int cpu = smp_processor_id();
 
@@ -461,3 +478,52 @@
 	xen_setup_timer(cpu);
 	xen_setup_cpu_clockevents();
 }
+
+static const struct pv_time_ops xen_time_ops __initdata = {
+	.sched_clock = xen_clocksource_read,
+};
+
+__init void xen_init_time_ops(void)
+{
+	pv_time_ops = xen_time_ops;
+
+	x86_init.timers.timer_init = xen_time_init;
+	x86_init.timers.setup_percpu_clockev = x86_init_noop;
+	x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+	x86_platform.calibrate_tsc = xen_tsc_khz;
+	x86_platform.get_wallclock = xen_get_wallclock;
+	x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+	int cpu = smp_processor_id();
+	xen_setup_runstate_info(cpu);
+	xen_setup_timer(cpu);
+	xen_setup_cpu_clockevents();
+}
+
+__init void xen_hvm_init_time_ops(void)
+{
+	/* vector callback is needed otherwise we cannot receive interrupts
+	 * on cpu > 0 and at this point we don't know how many cpus are
+	 * available */
+	if (!xen_have_vector_callback)
+		return;
+	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+		printk(KERN_INFO "Xen doesn't support pvclock on HVM, "
+				"disabling pv timer\n");
+		return;
+	}
+
+	pv_time_ops = xen_time_ops;
+	x86_init.timers.setup_percpu_clockev = xen_time_init;
+	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+	x86_platform.calibrate_tsc = xen_tsc_khz;
+	x86_platform.get_wallclock = xen_get_wallclock;
+	x86_platform.set_wallclock = xen_set_wallclock;
+}
+#endif
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 0000000..1cd7f4d
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
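+/*
+ * Translate the dom0_vga_console_info handed to us by the hypervisor into
+ * the boot_params screen_info consumed by the console drivers, taking care
+ * not to read fields beyond the size the hypervisor actually provided.
+ */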
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+	struct screen_info *screen_info = &boot_params.screen_info;
+
+	/* This is drawn from a dump from vgacon:startup in
+	 * standard Linux. */
+	screen_info->orig_video_mode = 3;
+	screen_info->orig_video_isVGA = 1;
+	screen_info->orig_video_lines = 25;
+	screen_info->orig_video_cols = 80;
+	screen_info->orig_video_ega_bx = 3;
+	screen_info->orig_video_points = 16;
+	screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+	switch (info->video_type) {
+	case XEN_VGATYPE_TEXT_MODE_3:
+		if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+		    + sizeof(info->u.text_mode_3))
+			break;
+		screen_info->orig_video_lines = info->u.text_mode_3.rows;
+		screen_info->orig_video_cols = info->u.text_mode_3.columns;
+		screen_info->orig_x = info->u.text_mode_3.cursor_x;
+		screen_info->orig_y = info->u.text_mode_3.cursor_y;
+		screen_info->orig_video_points =
+			info->u.text_mode_3.font_height;
+		break;
+
+	case XEN_VGATYPE_VESA_LFB:
+		if (size < offsetof(struct dom0_vga_console_info,
+				    u.vesa_lfb.gbl_caps))
+			break;
+		screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+		screen_info->lfb_width = info->u.vesa_lfb.width;
+		screen_info->lfb_height = info->u.vesa_lfb.height;
+		screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+		screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+		screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+		screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+		screen_info->red_size = info->u.vesa_lfb.red_size;
+		screen_info->red_pos = info->u.vesa_lfb.red_pos;
+		screen_info->green_size = info->u.vesa_lfb.green_size;
+		screen_info->green_pos = info->u.vesa_lfb.green_pos;
+		screen_info->blue_size = info->u.vesa_lfb.blue_size;
+		screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+		screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+		screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+		if (size >= offsetof(struct dom0_vga_console_info,
+				     u.vesa_lfb.gbl_caps)
+		    + sizeof(info->u.vesa_lfb.gbl_caps))
+			screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+		if (size >= offsetof(struct dom0_vga_console_info,
+				     u.vesa_lfb.mode_attrs)
+		    + sizeof(info->u.vesa_lfb.mode_attrs))
+			screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+		break;
+	}
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a3..e860de7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,10 @@
 pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
+extern unsigned long xen_max_p2m_pfn;
+
+void xen_set_pat(u64);
 
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
@@ -38,6 +42,10 @@
 void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void __init xen_unplug_emulated_devices(void);
+
 void __init xen_build_dynamic_phys_to_machine(void);
 
 void xen_init_irq_ops(void);
@@ -46,11 +54,8 @@
 void xen_teardown_timer(int cpu);
 cycle_t xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
-unsigned long xen_tsc_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -60,10 +65,12 @@
 
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
 
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
@@ -82,6 +89,23 @@
 }
 #endif
 
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void xen_init_vga(const struct dom0_vga_console_info *info,
+				size_t size)
+{
+}
+#endif
+
+#ifdef CONFIG_XEN_DOM0
+void xen_init_apic(void);
+#else
+static inline void xen_init_apic(void) {}
+#endif
+
 /* Declare an asm function, along with symbols needed to make it
    inlineable */
 #define DECL_ASM(ret, name, ...)		\
@@ -101,4 +125,6 @@
 void xen_sysret64(void);
 void xen_adjust_exception_frame(void);
 
+extern int xen_panic_handler_init(void);
+
 #endif /* XEN_OPS_H */
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 7702118..1be123c 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -61,6 +61,7 @@
 # processor has its own "processor." module_param namespace
 processor-y			:= processor_core.o processor_throttling.o
 processor-y			+= processor_idle.o processor_thermal.o
+processor-y			+= processor_xen.o
 processor-$(CONFIG_CPU_FREQ)	+= processor_perflib.o
 
 obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 28ccdbc..b0f9ed6 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/memory_hotplug.h>
 #include <acpi/acpi_drivers.h>
+#include <xen/acpi.h>
 
 #define ACPI_MEMORY_DEVICE_CLASS		"memory"
 #define ACPI_MEMORY_DEVICE_HID			"PNP0C80"
@@ -70,21 +71,6 @@
 		},
 };
 
-struct acpi_memory_info {
-	struct list_head list;
-	u64 start_addr;		/* Memory Range start physical addr */
-	u64 length;		/* Memory Range length */
-	unsigned short caching;	/* memory cache attribute */
-	unsigned short write_protect;	/* memory read/write attribute */
-	unsigned int enabled:1;
-};
-
-struct acpi_memory_device {
-	struct acpi_device * device;
-	unsigned int state;	/* State of the memory device */
-	struct list_head res_list;
-};
-
 static int acpi_hotmem_initialized;
 
 static acpi_status
@@ -228,6 +214,9 @@
 		return result;
 	}
 
+	if (xen_initial_domain())
+		return xen_hotadd_memory(mem_device);
+
 	node = acpi_get_node(mem_device->device->handle);
 	/*
 	 * Tell the VM there is more memory here...
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index cc22f9a..747d96f 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -47,6 +47,9 @@
 #include "actables.h"
 #include <linux/tboot.h>
 
+#include <xen/acpi.h>
+#include <asm/xen/hypervisor.h>
+
 #define _COMPONENT          ACPI_HARDWARE
 ACPI_MODULE_NAME("hwsleep")
 
@@ -346,6 +349,19 @@
 	tboot_sleep(sleep_state, pm1a_control, pm1b_control);
 
 	/* Write #2: Write both SLP_TYP + SLP_EN */
+	if (xen_pv_acpi()) {
+		int err;
+
+		err = acpi_notify_hypervisor_state(sleep_state,
+						   pm1a_control, pm1b_control);
+		if (err) {
+			ACPI_DEBUG_PRINT((ACPI_DB_INIT,
+					  "Hypervisor failure [%d]\n", err));
+			return_ACPI_STATUS(AE_ERROR);
+		}
+
+		return_ACPI_STATUS(AE_OK);
+	}
 
 	status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
 	if (ACPI_FAILURE(status)) {
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 7102474..2428cc0 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -58,6 +58,7 @@
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 #include <acpi/processor.h>
+#include <xen/acpi.h>
 
 #define PREFIX "ACPI: "
 
@@ -81,11 +82,9 @@
 MODULE_LICENSE("GPL");
 
 static int acpi_processor_add(struct acpi_device *device);
-static int acpi_processor_remove(struct acpi_device *device, int type);
 #ifdef CONFIG_ACPI_PROCFS
 static int acpi_processor_info_open_fs(struct inode *inode, struct file *file);
 #endif
-static void acpi_processor_notify(struct acpi_device *device, u32 event);
 static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu);
 static int acpi_processor_handle_eject(struct acpi_processor *pr);
 
@@ -247,7 +246,7 @@
 	return 0;
 }
 
-static int acpi_processor_errata(struct acpi_processor *pr)
+int acpi_processor_errata(struct acpi_processor *pr)
 {
 	int result = 0;
 	struct pci_dev *dev = NULL;
@@ -278,7 +277,7 @@
  * _PDC is required for a BIOS-OS handshake for most of the newer
  * ACPI processor features.
  */
-static int acpi_processor_set_pdc(struct acpi_processor *pr)
+int acpi_processor_set_pdc(struct acpi_processor *pr)
 {
 	struct acpi_object_list *pdc_in = pr->pdc;
 	acpi_status status = AE_OK;
@@ -347,7 +346,7 @@
 			   PDE(inode)->data);
 }
 
-static int acpi_processor_add_fs(struct acpi_device *device)
+int acpi_processor_add_fs(struct acpi_device *device)
 {
 	struct proc_dir_entry *entry = NULL;
 
@@ -386,7 +385,7 @@
 		return -EIO;
 	return 0;
 }
-static int acpi_processor_remove_fs(struct acpi_device *device)
+int acpi_processor_remove_fs(struct acpi_device *device)
 {
 
 	if (acpi_device_dir(device)) {
@@ -402,15 +401,6 @@
 
 	return 0;
 }
-#else
-static inline int acpi_processor_add_fs(struct acpi_device *device)
-{
-	return 0;
-}
-static inline int acpi_processor_remove_fs(struct acpi_device *device)
-{
-	return 0;
-}
 #endif
 
 /* Use the acpiid in MADT to map cpus in case of SMP */
@@ -705,7 +695,7 @@
 
 static DEFINE_PER_CPU(void *, processor_device_array);
 
-static void acpi_processor_notify(struct acpi_device *device, u32 event)
+void acpi_processor_notify(struct acpi_device *device, u32 event)
 {
 	struct acpi_processor *pr = acpi_driver_data(device);
 	int saved;
@@ -873,7 +863,7 @@
 	return result;
 }
 
-static int acpi_processor_remove(struct acpi_device *device, int type)
+int acpi_processor_remove(struct acpi_device *device, int type)
 {
 	struct acpi_processor *pr = NULL;
 
@@ -1148,7 +1138,11 @@
 	if (result < 0)
 		goto out_proc;
 
-	result = acpi_bus_register_driver(&acpi_processor_driver);
+	if (xen_initial_domain())
+		result = xen_acpi_processor_init();
+	else
+		result = acpi_bus_register_driver(&acpi_processor_driver);
+
 	if (result < 0)
 		goto out_cpuidle;
 
@@ -1184,7 +1178,10 @@
 
 	acpi_processor_uninstall_hotplug_notify();
 
-	acpi_bus_unregister_driver(&acpi_processor_driver);
+	if (xen_initial_domain())
+		xen_acpi_processor_exit();
+	else
+		acpi_bus_unregister_driver(&acpi_processor_driver);
 
 	cpuidle_unregister_driver(&acpi_idle_driver);
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a6ad608..3c32e87 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -58,6 +58,7 @@
 
 #include <acpi/acpi_bus.h>
 #include <acpi/processor.h>
+#include <xen/acpi.h>
 #include <asm/processor.h>
 
 #define PREFIX "ACPI: "
@@ -439,7 +440,8 @@
 				cx.entry_method = ACPI_CSTATE_HALT;
 				snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
 			} else {
-				continue;
+				if (!xen_initial_domain())
+					continue;
 			}
 			if (cx.type == ACPI_STATE_C1 &&
 					(idle_halt || idle_nomwait)) {
@@ -477,6 +479,9 @@
 
 		cx.power = obj->integer.value;
 
+		/* cache control methods to notify xen*/
+		/* cache control methods to notify xen */
+
 		current_count++;
 		memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
 
@@ -653,7 +658,7 @@
 	return (working);
 }
 
-static int acpi_processor_get_power_info(struct acpi_processor *pr)
+int acpi_processor_get_power_info(struct acpi_processor *pr)
 {
 	unsigned int i;
 	int result;
@@ -1223,9 +1228,14 @@
 	 * platforms that only support C1.
 	 */
 	if (pr->flags.power) {
-		acpi_processor_setup_cpuidle(pr);
-		if (cpuidle_register_device(&pr->power.dev))
-			return -EIO;
+		if (xen_initial_domain()) {
+			processor_cntl_xen_notify(pr,
+					PROCESSOR_PM_INIT, PM_TYPE_IDLE);
+		} else {
+			acpi_processor_setup_cpuidle(pr);
+			if (cpuidle_register_device(&pr->power.dev))
+				return -EIO;
+		}
 	}
 #ifdef CONFIG_ACPI_PROCFS
 	/* 'power' [R] */
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 40d395e..7ba143d 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -332,7 +332,7 @@
 	return result;
 }
 
-static int acpi_processor_get_performance_info(struct acpi_processor *pr)
+int acpi_processor_get_performance_info(struct acpi_processor *pr)
 {
 	int result = 0;
 	acpi_status status = AE_OK;
@@ -438,7 +438,7 @@
 
 EXPORT_SYMBOL(acpi_processor_notify_smm);
 
-static int acpi_processor_get_psd(struct acpi_processor	*pr)
+int acpi_processor_get_psd(struct acpi_processor	*pr)
 {
 	int result = 0;
 	acpi_status status = AE_OK;
diff --git a/drivers/acpi/processor_xen.c b/drivers/acpi/processor_xen.c
new file mode 100644
index 0000000..305398d
--- /dev/null
+++ b/drivers/acpi/processor_xen.c
@@ -0,0 +1,651 @@
+/*
+ * processor_xen.c - ACPI Processor Driver for xen
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/pm.h>
+#include <linux/cpufreq.h>
+#include <linux/cpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/dmi.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/acpi.h>
+
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#include <acpi/processor.h>
+#include <xen/acpi.h>
+#include <xen/pcpu.h>
+
+#define PREFIX "ACPI: "
+
+#define ACPI_PROCESSOR_CLASS            "processor"
+#define ACPI_PROCESSOR_DEVICE_NAME	"Processor"
+#define ACPI_PROCESSOR_FILE_INFO	"info"
+#define ACPI_PROCESSOR_FILE_THROTTLING	"throttling"
+#define ACPI_PROCESSOR_FILE_LIMIT	"limit"
+#define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
+#define ACPI_PROCESSOR_NOTIFY_POWER	0x81
+#define ACPI_PROCESSOR_NOTIFY_THROTTLING	0x82
+
+#define _COMPONENT              ACPI_PROCESSOR_COMPONENT
+ACPI_MODULE_NAME("processor_xen");
+
+static const struct acpi_device_id processor_device_ids[] = {
+	{ACPI_PROCESSOR_OBJECT_HID, 0},
+	{"ACPI0007", 0},
+	{"", 0},
+};
+
+/*
+ * Xen ACPI processor driver
+ */
+
+/* from processor_core.c */
+
+static int xen_acpi_processor_add(struct acpi_device *device);
+static void xen_acpi_processor_notify(struct acpi_device *device, u32 event);
+
+struct acpi_driver xen_acpi_processor_driver = {
+	.name = "processor",
+	.class = ACPI_PROCESSOR_CLASS,
+	.ids = processor_device_ids,
+	.ops = {
+		.add = xen_acpi_processor_add,
+		.remove = acpi_processor_remove,
+		.suspend = acpi_processor_suspend,
+		.resume = acpi_processor_resume,
+		.notify = xen_acpi_processor_notify,
+		},
+};
+
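+/* Evaluate _STA to decide whether this processor object is physically present. */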
+static int is_processor_present(acpi_handle handle)
+{
+	acpi_status status;
+	unsigned long long sta = 0;
+
+
+	status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
+
+	if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
+		return 1;
+
+	/*
+	 * _STA is mandatory for a processor that supports hot plug
+	 */
+	if (status == AE_NOT_FOUND)
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				"Processor does not support hot plug\n"));
+	else
+		ACPI_EXCEPTION((AE_INFO, status,
+				"Processor Device is not present"));
+	return 0;
+}
+
+static acpi_status
+xen_acpi_processor_hotadd_init(struct acpi_processor *pr, int *p_cpu)
+{
+	if (!is_processor_present(pr->handle))
+		return AE_ERROR;
+
+	if (processor_cntl_xen_notify(pr,
+				PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD))
+		return AE_ERROR;
+
+	return AE_OK;
+}
+
+static int xen_acpi_processor_get_info(struct acpi_device *device)
+{
+	acpi_status status = 0;
+	union acpi_object object = { 0 };
+	struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+	struct acpi_processor *pr;
+	int cpu_index, device_declaration = 0;
+	static int cpu0_initialized;
+
+	pr = acpi_driver_data(device);
+	if (!pr)
+		return -EINVAL;
+
+	if (num_online_cpus() > 1)
+		errata.smp = TRUE;
+
+	acpi_processor_errata(pr);
+
+	/*
+	 * Check to see if we have bus mastering arbitration control.  This
+	 * is required for proper C3 usage (to maintain cache coherency).
+	 */
+	if (acpi_gbl_FADT.pm2_control_block &&
+			acpi_gbl_FADT.pm2_control_length) {
+		pr->flags.bm_control = 1;
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "Bus mastering arbitration control present\n"
+				  ));
+	} else
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "No bus mastering arbitration control\n"));
+
+	if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) {
+		/* Declared with "Processor" statement; match ProcessorID */
+		status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
+		if (ACPI_FAILURE(status)) {
+			printk(KERN_ERR PREFIX "Evaluating processor object\n");
+			return -ENODEV;
+		}
+
+		/*
+		 * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP.
+		 *      >>> 'acpi_get_processor_id(acpi_id, &id)' in
+		 *      arch/xxx/acpi.c
+		 */
+		pr->acpi_id = object.processor.proc_id;
+	} else {
+		/*
+		 * Declared with "Device" statement; match _UID.
+		 * Note that we don't handle string _UIDs yet.
+		 */
+		unsigned long long value;
+		status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
+						NULL, &value);
+		if (ACPI_FAILURE(status)) {
+			printk(KERN_ERR PREFIX
+			    "Evaluating processor _UID [%#x]\n", status);
+			return -ENODEV;
+		}
+		device_declaration = 1;
+		pr->acpi_id = value;
+	}
+
+	/* TBD: add Xen specific code to query cpu_index */
+	cpu_index = -1;
+
+	/* Handle UP system running SMP kernel, with no LAPIC in MADT */
+	if (!cpu0_initialized && (cpu_index == -1) &&
+	    (num_online_cpus() == 1)) {
+		cpu_index = 0;
+	}
+
+	cpu0_initialized = 1;
+
+	pr->id = cpu_index;
+
+	/*
+	 *  Extra Processor objects may be enumerated on MP systems with
+	 *  less than the max # of CPUs, or Xen vCPU < pCPU.
+	 *  They should be ignored _iff they are physically not present.
+	 *
+	 */
+	if (xen_pcpu_index(pr->acpi_id, 1) == -1) {
+		if (ACPI_FAILURE
+		    (xen_acpi_processor_hotadd_init(pr, &pr->id))) {
+			return -ENODEV;
+		}
+	}
+
+	/*
+	 * On some boxes several processors use the same processor bus id.
+	 * But they are located in different scope. For example:
+	 * \_SB.SCK0.CPU0
+	 * \_SB.SCK1.CPU0
+	 * Rename the processor device bus id. And the new bus id will be
+	 * generated as the following format:
+	 * CPU+CPU ID.
+	 */
+	sprintf(acpi_device_bid(device), "CPU%X", pr->id);
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
+				pr->acpi_id));
+
+	if (!object.processor.pblk_address)
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n"));
+	else if (object.processor.pblk_length != 6)
+		printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n",
+				object.processor.pblk_length);
+	else {
+		pr->throttling.address = object.processor.pblk_address;
+		pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset;
+		pr->throttling.duty_width = acpi_gbl_FADT.duty_width;
+
+		pr->pblk = object.processor.pblk_address;
+
+		/*
+		 * We don't care about error returns - we just try to mark
+		 * these reserved so that nobody else is confused into thinking
+		 * that this region might be unused..
+		 *
+		 * (In particular, allocating the IO range for Cardbus)
+		 */
+		request_region(pr->throttling.address, 6, "ACPI CPU throttle");
+	}
+
+	/*
+	 * If ACPI describes a slot number for this CPU, we can use it
+	 * If ACPI describes a slot number for this CPU, we can use it to
+	 * ensure we get the right value in the "physical id" field
+	 */
+	status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
+	if (ACPI_SUCCESS(status))
+		arch_fix_phys_package_id(pr->id, object.integer.value);
+
+	return 0;
+}
+
+static struct acpi_device *processor_device_array[XEN_MAX_ACPI_ID + 1];
+
+static int __cpuinit xen_acpi_processor_add(struct acpi_device *device)
+{
+	struct acpi_processor *pr = NULL;
+	int result = 0;
+	struct sys_device *sysdev;
+
+	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+	if (!pr)
+		return -ENOMEM;
+
+	if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
+		kfree(pr);
+		return -ENOMEM;
+	}
+
+	pr->handle = device->handle;
+	strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
+	strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
+	device->driver_data = pr;
+
+	result = xen_acpi_processor_get_info(device);
+	if (result) {
+		/* Processor is physically not present */
+		return 0;
+	}
+
+	/*
+	 * Buggy BIOS check
+	 * ACPI id of processors can be reported wrongly by the BIOS.
+	 * Don't trust it blindly
+	 */
+	if (pr->acpi_id > XEN_MAX_ACPI_ID ||
+			(processor_device_array[pr->acpi_id] != NULL &&
+			 processor_device_array[pr->acpi_id] != device)) {
+		printk(KERN_WARNING "BIOS reported wrong ACPI id "
+			"for the processor\n");
+		result = -ENODEV;
+		goto err_free_cpumask;
+	}
+
+	processor_device_array[pr->acpi_id] = device;
+
+	if (pr->id != -1) {
+		per_cpu(processors, pr->id) = pr;
+
+		result = acpi_processor_add_fs(device);
+		if (result)
+			goto err_free_cpumask;
+
+		sysdev = get_cpu_sysdev(pr->id);
+		if (sysdev != NULL && sysfs_create_link(&device->dev.kobj,
+					&sysdev->kobj, "sysdev")) {
+			result = -EFAULT;
+			goto err_remove_fs;
+		}
+	}
+
+	/* _PDC call should be done before doing anything else (if reqd.). */
+	xen_arch_acpi_processor_init_pdc(pr);
+	acpi_processor_set_pdc(pr);
+	arch_acpi_processor_cleanup_pdc(pr);
+
+#ifdef CONFIG_CPU_FREQ
+	xen_acpi_processor_ppc_has_changed(pr);
+	result = xen_acpi_processor_get_performance(pr);
+	if (result)
+		goto err_remove_fs;
+#endif
+
+	if (pr->id != -1) {
+		acpi_processor_get_throttling_info(pr);
+		acpi_processor_get_limit_info(pr);
+	}
+
+	xen_acpi_processor_power_init(pr, device);
+
+	if (pr->id != -1) {
+		pr->cdev = thermal_cooling_device_register("Processor", device,
+				&processor_cooling_ops);
+		if (IS_ERR(pr->cdev)) {
+			result = PTR_ERR(pr->cdev);
+			goto err_power_exit;
+		}
+
+		dev_info(&device->dev, "registered as cooling_device%d\n",
+				pr->cdev->id);
+
+		result = sysfs_create_link(&device->dev.kobj,
+				&pr->cdev->device.kobj,
+				"thermal_cooling");
+		if (result) {
+			printk(KERN_ERR PREFIX "Create sysfs link\n");
+			goto err_thermal_unregister;
+		}
+		result = sysfs_create_link(&pr->cdev->device.kobj,
+				&device->dev.kobj,
+				"device");
+		if (result) {
+			printk(KERN_ERR PREFIX "Create sysfs link\n");
+			goto err_remove_sysfs;
+		}
+	}
+
+	return 0;
+
+err_remove_sysfs:
+	sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
+err_thermal_unregister:
+	thermal_cooling_device_unregister(pr->cdev);
+err_power_exit:
+	acpi_processor_power_exit(pr, device);
+err_remove_fs:
+	acpi_processor_remove_fs(device);
+err_free_cpumask:
+	free_cpumask_var(pr->throttling.shared_cpu_map);
+
+	return result;
+}
+
+static void xen_acpi_processor_notify(struct acpi_device *device, u32 event)
+{
+	struct acpi_processor *pr = acpi_driver_data(device);
+	int saved;
+
+	if (!pr)
+		return;
+
+	switch (event) {
+	case ACPI_PROCESSOR_NOTIFY_PERFORMANCE:
+		saved = pr->performance_platform_limit;
+		xen_acpi_processor_ppc_has_changed(pr);
+		if (saved == pr->performance_platform_limit)
+			break;
+		acpi_bus_generate_proc_event(device, event,
+					pr->performance_platform_limit);
+		acpi_bus_generate_netlink_event(device->pnp.device_class,
+					dev_name(&device->dev), event,
+					pr->performance_platform_limit);
+		break;
+	case ACPI_PROCESSOR_NOTIFY_POWER:
+		xen_acpi_processor_cst_has_changed(pr);
+		acpi_bus_generate_proc_event(device, event, 0);
+		acpi_bus_generate_netlink_event(device->pnp.device_class,
+					dev_name(&device->dev), event, 0);
+		break;
+	case ACPI_PROCESSOR_NOTIFY_THROTTLING:
+		acpi_processor_tstate_has_changed(pr);
+		acpi_bus_generate_proc_event(device, event, 0);
+		acpi_bus_generate_netlink_event(device->pnp.device_class,
+					dev_name(&device->dev), event, 0);
+		break;
+	default:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "Unsupported event [0x%x]\n", event));
+		break;
+	}
+
+	return;
+}
+
+/* from processor_idle.c */
+
+static int xen_acpi_processor_get_power_info(struct acpi_processor *pr)
+{
+	int ret;
+	int invalid_pr_id = 0;
+
+	/*
+	 * acpi_processor_get_power_info need valid pr->id
+	 * so set pr->id=0 temporarily
+	 */
+	if (pr->id == -1) {
+		invalid_pr_id = 1;
+		pr->id = 0;
+	}
+
+	ret = acpi_processor_get_power_info(pr);
+
+	if (invalid_pr_id)
+		pr->id = -1;
+
+	return ret;
+}
+
+int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr)
+{
+	if (!pr)
+		return -EINVAL;
+
+	if (!pr->flags.power_setup_done)
+		return -ENODEV;
+
+	xen_acpi_processor_get_power_info(pr);
+
+	processor_cntl_xen_notify(pr,
+			PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
+
+	return 0;
+}
+
+
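+/*
+ * Gather the C-state information for this processor and hand it to Xen
+ * (PM_TYPE_IDLE) rather than registering a local cpuidle device.
+ */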
+int __cpuinit xen_acpi_processor_power_init(struct acpi_processor *pr,
+			      struct acpi_device *device)
+{
+	acpi_status status = 0;
+	unsigned int i;
+
+	if (!pr)
+		return -EINVAL;
+
+	if (acpi_gbl_FADT.cst_control) {
+		status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+				acpi_gbl_FADT.cst_control, 8);
+		if (ACPI_FAILURE(status)) {
+			ACPI_EXCEPTION((AE_INFO, status,
+				"Notifying BIOS of _CST ability failed"));
+		}
+	}
+
+	xen_acpi_processor_get_power_info(pr);
+
+	pr->flags.power_setup_done = 1;
+
+	if (pr->flags.power) {
+		processor_cntl_xen_notify(pr,
+				PROCESSOR_PM_INIT, PM_TYPE_IDLE);
+
+		printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
+		for (i = 1; i <= pr->power.count; i++)
+			if (pr->power.states[i].valid)
+				printk(" C%d[C%d]", i,
+				       pr->power.states[i].type);
+		printk(")\n");
+	}
+
+	return 0;
+}
+
+/* from processor_perflib.c */
+
+#ifdef CONFIG_CPU_FREQ
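+/*
+ * Write pstate_cnt to the FADT SMI command port to tell the BIOS that
+ * P-state control is being taken over; doing this once is sufficient.
+ */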
+static int xen_processor_notify_smm(void)
+{
+	acpi_status status;
+	static int is_done;
+
+	/* only need successfully notify BIOS once */
+	/* avoid double notification which may lead to unexpected result */
+	if (is_done)
+		return 0;
+
+	/* Can't write pstate_cnt to smi_cmd if either value is zero */
+	if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_cnt\n"));
+		return 0;
+	}
+
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+		"Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n",
+		acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+
+	status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+				    (u32) acpi_gbl_FADT.pstate_control, 8);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	is_done = 1;
+
+	return 0;
+}
+
+static int xen_acpi_processor_get_platform_limit(struct acpi_processor *pr)
+{
+	acpi_status status = 0;
+	unsigned long long ppc = 0;
+
+	if (!pr)
+		return -EINVAL;
+
+	/*
+	 * _PPC indicates the maximum state currently supported by the platform
+	 * (e.g. 0 = states 0..n; 1 = states 1..n; etc.)
+	 */
+	status = acpi_evaluate_integer(pr->handle, "_PPC", NULL, &ppc);
+
+	if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
+		ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PPC"));
+		return -ENODEV;
+	}
+
+	pr->performance_platform_limit = (int)ppc;
+
+	return 0;
+}
+
+int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
+{
+	int ret;
+
+	ret = xen_acpi_processor_get_platform_limit(pr);
+
+	if (ret < 0)
+		return ret;
+	else
+		return processor_cntl_xen_notify(pr,
+				PROCESSOR_PM_CHANGE, PM_TYPE_PERF);
+}
+
+/*
+ * The existing ACPI code parses performance states when the acpi-cpufreq
+ * driver is loaded, but we want to keep that driver out of the picture to
+ * avoid conflicts with Xen's PM logic.  So we collect the raw performance
+ * information here, when the ACPI processor object is found and started.
+ */
+int xen_acpi_processor_get_performance(struct acpi_processor *pr)
+{
+	int ret;
+	struct acpi_processor_performance *perf;
+	struct acpi_psd_package *pdomain;
+
+	if (pr->performance)
+		return -EBUSY;
+
+	perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+
+	pr->performance = perf;
+	/* Get basic performance state information */
+	ret = acpi_processor_get_performance_info(pr);
+	if (ret < 0)
+		goto err_out;
+
+	/*
+	 * Here we need to retrieve the performance dependency information
+	 * from the _PSD object.  The existing interface is not used because
+	 * it builds its bitmaps around Linux cpu ids, whereas we want to
+	 * decouple ACPI processor objects from Linux cpu id logic.  For
+	 * example, even when Linux is configured as UP we still want to
+	 * hand all ACPI processor objects to Xen, so it is preferable to
+	 * key off the ACPI ID instead.
+	 */
+	pdomain = &pr->performance->domain_info;
+	pdomain->num_processors = 0;
+	ret = acpi_processor_get_psd(pr);
+	if (ret < 0) {
+		/*
+		 * _PSD is optional - assume no coordination if absent (or
+		 * broken), matching native kernels' behavior.
+		 */
+		pdomain->num_entries = ACPI_PSD_REV0_ENTRIES;
+		pdomain->revision = ACPI_PSD_REV0_REVISION;
+		pdomain->domain = pr->acpi_id;
+		pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL;
+		pdomain->num_processors = 1;
+	}
+
+	/* Some sanity check */
+	if ((pdomain->revision != ACPI_PSD_REV0_REVISION) ||
+	    (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) ||
+	    ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) &&
+	     (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) &&
+	     (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	/* Last step is to notify BIOS that xen exists */
+	xen_processor_notify_smm();
+
+	processor_cntl_xen_notify(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF);
+
+	return 0;
+err_out:
+	pr->performance = NULL;
+	kfree(perf);
+	return ret;
+}
+#endif /* CONFIG_CPU_FREQ */
+
+/* init and exit */
+
+int xen_acpi_processor_init(void)
+{
+	return acpi_bus_register_driver(&xen_acpi_processor_driver);
+}
+
+void xen_acpi_processor_exit(void)
+{
+	acpi_bus_unregister_driver(&xen_acpi_processor_driver);
+}
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 0458094..85a1308 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -19,6 +19,8 @@
 
 #include <asm/io.h>
 
+#include <xen/acpi.h>
+
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 
@@ -200,6 +202,21 @@
 	return error;
 }
 
+static void do_suspend(void)
+{
+	if (!xen_pv_acpi()) {
+		do_suspend_lowlevel();
+		return;
+	}
+
+	/*
+	 * Xen will save and restore CPU context, so
+	 * we can skip that and just go straight to
+	 * the suspend.
+	 */
+	acpi_enter_sleep_state(ACPI_STATE_S3);
+}
+
 /**
  *	acpi_suspend_enter - Actually enter a sleep state.
  *	@pm_state: ignored
@@ -233,7 +250,7 @@
 		break;
 
 	case ACPI_STATE_S3:
-		do_suspend_lowlevel();
+		do_suspend();
 		break;
 	}
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e0..f4a2b10 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -462,6 +462,7 @@
 	tristate "Xen virtual block device support"
 	depends on XEN
 	default y
+	select XEN_XENBUS_FRONTEND
 	help
 	  This driver implements the front-end of the Xen virtual
 	  block device driver.  It communicates with a back-end driver
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a2e8977..6147a4e 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -42,10 +42,12 @@
 #include <linux/module.h>
 #include <linux/scatterlist.h>
 
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 #include <xen/grant_table.h>
 #include <xen/events.h>
 #include <xen/page.h>
+#include <xen/platform_pci.h>
 
 #include <xen/interface/grant_table.h>
 #include <xen/interface/io/blkif.h>
@@ -67,7 +69,7 @@
 
 static const struct block_device_operations xlvbd_block_fops;
 
-#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -76,6 +78,7 @@
  */
 struct blkfront_info
 {
+	struct mutex mutex;
 	struct xenbus_device *xbdev;
 	struct gendisk *gd;
 	int vdevice;
@@ -85,6 +88,7 @@
 	struct blkif_front_ring ring;
 	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int evtchn, irq;
+	struct tasklet_struct tasklet;
 	struct request_queue *rq;
 	struct work_struct work;
 	struct gnttab_free_callback callback;
@@ -93,14 +97,12 @@
 	int feature_barrier;
 	int is_ready;
 
-	/**
-	 * The number of people holding this device open.  We won't allow a
-	 * hot-unplug unless this is 0.
-	 */
-	int users;
+	spinlock_t io_lock;
 };
 
-static DEFINE_SPINLOCK(blkif_io_lock);
+static unsigned int nr_minors;
+static unsigned long *minors;
+static DEFINE_SPINLOCK(minor_lock);
 
 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
 	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
@@ -116,6 +118,10 @@
 #define EXTENDED (1<<EXT_SHIFT)
 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
@@ -136,6 +142,55 @@
 	info->shadow_free = id;
 }
 
+static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
+{
+	unsigned int end = minor + nr;
+	int rc;
+
+	if (end > nr_minors) {
+		unsigned long *bitmap, *old;
+
+		bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+				 GFP_KERNEL);
+		if (bitmap == NULL)
+			return -ENOMEM;
+
+		spin_lock(&minor_lock);
+		if (end > nr_minors) {
+			old = minors;
+			memcpy(bitmap, minors,
+			       BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
+			minors = bitmap;
+			nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
+		} else
+			old = bitmap;
+		spin_unlock(&minor_lock);
+		kfree(old);
+	}
+
+	spin_lock(&minor_lock);
+	if (find_next_bit(minors, end, minor) >= end) {
+		for (; minor < end; ++minor)
+			__set_bit(minor, minors);
+		rc = 0;
+	} else
+		rc = -EBUSY;
+	spin_unlock(&minor_lock);
+
+	return rc;
+}
+
+static void xlbd_release_minors(unsigned int minor, unsigned int nr)
+{
+	unsigned int end = minor + nr;
+
+	BUG_ON(end > nr_minors);
+	spin_lock(&minor_lock);
+	for (; minor < end; ++minor)
+		__clear_bit(minor, minors);
+	spin_unlock(&minor_lock);
+}
+
 static void blkif_restart_queue_callback(void *arg)
 {
 	struct blkfront_info *info = (struct blkfront_info *)arg;
@@ -333,11 +388,12 @@
 		flush_requests(info);
 }
 
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static int xlvbd_init_blk_queue(struct blkfront_info *info,
+				struct gendisk *gd, u16 sector_size)
 {
 	struct request_queue *rq;
 
-	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+	rq = blk_init_queue(do_blkif_request, &info->io_lock);
 	if (rq == NULL)
 		return -1;
 
@@ -370,20 +426,84 @@
 static int xlvbd_barrier(struct blkfront_info *info)
 {
 	int err;
+	const char *barrier;
 
-	err = blk_queue_ordered(info->rq,
-				info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
-				NULL);
+	switch (info->feature_barrier) {
+	case QUEUE_ORDERED_DRAIN:	barrier = "enabled (drain)"; break;
+	case QUEUE_ORDERED_TAG:		barrier = "enabled (tag)"; break;
+	case QUEUE_ORDERED_NONE:	barrier = "disabled"; break;
+	default:			return -EINVAL;
+	}
+
+	err = blk_queue_ordered(info->rq, info->feature_barrier, NULL);
 
 	if (err)
 		return err;
 
 	printk(KERN_INFO "blkfront: %s: barriers %s\n",
-	       info->gd->disk_name,
-	       info->feature_barrier ? "enabled" : "disabled");
+	       info->gd->disk_name, barrier);
 	return 0;
 }
 
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+	int major;
+	major = BLKIF_MAJOR(vdevice);
+	*minor = BLKIF_MINOR(vdevice);
+	switch (major) {
+		case XEN_IDE0_MAJOR:
+			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = ((*minor / 64) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_IDE1_MAJOR:
+			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK0_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK1_MAJOR:
+		case XEN_SCSI_DISK2_MAJOR:
+		case XEN_SCSI_DISK3_MAJOR:
+		case XEN_SCSI_DISK4_MAJOR:
+		case XEN_SCSI_DISK5_MAJOR:
+		case XEN_SCSI_DISK6_MAJOR:
+		case XEN_SCSI_DISK7_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK8_MAJOR:
+		case XEN_SCSI_DISK9_MAJOR:
+		case XEN_SCSI_DISK10_MAJOR:
+		case XEN_SCSI_DISK11_MAJOR:
+		case XEN_SCSI_DISK12_MAJOR:
+		case XEN_SCSI_DISK13_MAJOR:
+		case XEN_SCSI_DISK14_MAJOR:
+		case XEN_SCSI_DISK15_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XENVBD_MAJOR:
+			*offset = *minor / PARTS_PER_DISK;
+			break;
+		default:
+			printk(KERN_WARNING "blkfront: your disk configuration is "
+					"incorrect, please use an xvd device instead\n");
+			return -ENODEV;
+	}
+	return 0;
+}
 
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
@@ -391,7 +511,7 @@
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
-	int err = -ENODEV;
+	int err;
 	unsigned int offset;
 	int minor;
 	int nr_parts;
@@ -406,21 +526,32 @@
 	}
 
 	if (!VDEV_IS_EXTENDED(info->vdevice)) {
-		minor = BLKIF_MINOR(info->vdevice);
-		nr_parts = PARTS_PER_DISK;
+		err = xen_translate_vdev(info->vdevice, &minor, &offset);
+		if (err)
+			return err;
+		nr_parts = PARTS_PER_DISK;
 	} else {
 		minor = BLKIF_MINOR_EXT(info->vdevice);
 		nr_parts = PARTS_PER_EXT_DISK;
+		offset = minor / nr_parts;
+		if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+					"emulated IDE disks,\n\t choose an xvd device name "
+					"from xvde on\n", info->vdevice);
 	}
+	err = -ENODEV;
 
 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
 
+	err = xlbd_reserve_minors(minor, nr_minors);
+	if (err)
+		goto out;
+	err = -ENODEV;
+
 	gd = alloc_disk(nr_minors);
 	if (gd == NULL)
-		goto out;
-
-	offset = minor / nr_parts;
+		goto release;
 
 	if (nr_minors > 1) {
 		if (offset < 26)
@@ -447,16 +578,15 @@
 	gd->driverfs_dev = &(info->xbdev->dev);
 	set_capacity(gd, capacity);
 
-	if (xlvbd_init_blk_queue(gd, sector_size)) {
+	if (xlvbd_init_blk_queue(info, gd, sector_size)) {
 		del_gendisk(gd);
-		goto out;
+		goto release;
 	}
 
 	info->rq = gd->queue;
 	info->gd = gd;
 
-	if (info->feature_barrier)
-		xlvbd_barrier(info);
+	xlvbd_barrier(info);
 
 	if (vdisk_info & VDISK_READONLY)
 		set_disk_ro(gd, 1);
@@ -469,10 +599,45 @@
 
 	return 0;
 
+ release:
+	xlbd_release_minors(minor, nr_minors);
  out:
 	return err;
 }
 
+static void xlvbd_release_gendisk(struct blkfront_info *info)
+{
+	unsigned int minor, nr_minors;
+	unsigned long flags;
+
+	if (info->rq == NULL)
+		return;
+
+	spin_lock_irqsave(&info->io_lock, flags);
+
+	/* No more blkif_request(). */
+	blk_stop_queue(info->rq);
+
+	/* No more gnttab callback work. */
+	gnttab_cancel_free_callback(&info->callback);
+	spin_unlock_irqrestore(&info->io_lock, flags);
+
+	/* Flush gnttab callback work. Must be done with no locks held. */
+	flush_scheduled_work();
+
+	del_gendisk(info->gd);
+
+	minor = info->gd->first_minor;
+	nr_minors = info->gd->minors;
+	xlbd_release_minors(minor, nr_minors);
+
+	blk_cleanup_queue(info->rq);
+	info->rq = NULL;
+
+	put_disk(info->gd);
+	info->gd = NULL;
+}
+
 static void kick_pending_request_queues(struct blkfront_info *info)
 {
 	if (!RING_FULL(&info->ring)) {
@@ -487,16 +652,16 @@
 {
 	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	if (info->connected == BLKIF_STATE_CONNECTED)
 		kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 }
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
 	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 	/* No more blkif_request(). */
@@ -504,7 +669,7 @@
 		blk_stop_queue(info->rq);
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_scheduled_work();
@@ -529,21 +694,20 @@
 		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
 }
 
-static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+static void
+blkif_do_interrupt(unsigned long data)
 {
+	struct blkfront_info *info = (struct blkfront_info *)data;
 	struct request *req;
 	struct blkif_response *bret;
 	RING_IDX i, rp;
 	unsigned long flags;
-	struct blkfront_info *info = (struct blkfront_info *)dev_id;
 	int error;
 
-	spin_lock_irqsave(&blkif_io_lock, flags);
+	spin_lock_irqsave(&info->io_lock, flags);
 
-	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-		spin_unlock_irqrestore(&blkif_io_lock, flags);
-		return IRQ_HANDLED;
-	}
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+		goto out;
 
  again:
 	rp = info->ring.sring->rsp_prod;
@@ -567,7 +731,7 @@
 				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
 				       info->gd->disk_name);
 				error = -EOPNOTSUPP;
-				info->feature_barrier = 0;
+				info->feature_barrier = QUEUE_ORDERED_NONE;
 				xlvbd_barrier(info);
 			}
 			/* fall through */
@@ -596,7 +760,17 @@
 
 	kick_pending_request_queues(info);
 
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
+out:
+	spin_unlock_irqrestore(&info->io_lock, flags);
+}
+
+
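+/* Interrupt handler: defer ring processing to the per-device tasklet. */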
+static irqreturn_t
+blkif_interrupt(int irq, void *dev_id)
+{
+	struct blkfront_info *info = (struct blkfront_info *)dev_id;
+
+	tasklet_schedule(&info->tasklet);
 
 	return IRQ_HANDLED;
 }
@@ -650,7 +824,7 @@
 
 
 /* Common code used when first setting up, and when resuming. */
-static int talk_to_backend(struct xenbus_device *dev,
+static int talk_to_blkback(struct xenbus_device *dev,
 			   struct blkfront_info *info)
 {
 	const char *message = NULL;
@@ -710,7 +884,6 @@
 	return err;
 }
 
-
 /**
  * Entry point to this code when a new device is created.  Allocate the basic
  * structures and the ring buffer for communication with the backend, and
@@ -736,16 +909,48 @@
 		}
 	}
 
+	if (xen_hvm_domain()) {
+		char *type;
+		int len;
+		/* no unplug has been done: do not hook devices != xen vbds */
+		if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
+			int major;
+
+			if (!VDEV_IS_EXTENDED(vdevice))
+				major = BLKIF_MAJOR(vdevice);
+			else
+				major = XENVBD_MAJOR;
+
+			if (major != XENVBD_MAJOR) {
+				printk(KERN_INFO
+						"%s: HVM does not support vbd %d as xen block device\n",
+						__FUNCTION__, vdevice);
+				return -ENODEV;
+			}
+		}
+		/* do not create a PV cdrom device if we are an HVM guest */
+		type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
+		if (IS_ERR(type))
+			return -ENODEV;
+		if (strncmp(type, "cdrom", 5) == 0) {
+			kfree(type);
+			return -ENODEV;
+		}
+		kfree(type);
+	}
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
 		return -ENOMEM;
 	}
 
+	mutex_init(&info->mutex);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
+	spin_lock_init(&info->io_lock);
+	tasklet_init(&info->tasklet, blkif_do_interrupt, (unsigned long)info);
 
 	for (i = 0; i < BLK_RING_SIZE; i++)
 		info->shadow[i].req.id = i+1;
@@ -755,7 +960,7 @@
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
 	dev_set_drvdata(&dev->dev, info);
 
-	err = talk_to_backend(dev, info);
+	err = talk_to_blkback(dev, info);
 	if (err) {
 		kfree(info);
 		dev_set_drvdata(&dev->dev, NULL);
@@ -819,7 +1024,7 @@
 
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
@@ -830,7 +1035,7 @@
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	return 0;
 }
@@ -850,13 +1055,50 @@
 
 	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 
-	err = talk_to_backend(dev, info);
+	err = talk_to_blkback(dev, info);
 	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
 		err = blkif_recover(info);
 
 	return err;
 }
 
+static void
+blkfront_closing(struct blkfront_info *info)
+{
+	struct xenbus_device *xbdev = info->xbdev;
+	struct block_device *bdev = NULL;
+
+	mutex_lock(&info->mutex);
+
+	if (xbdev->state == XenbusStateClosing) {
+		mutex_unlock(&info->mutex);
+		return;
+	}
+
+	if (info->gd)
+		bdev = bdget_disk(info->gd, 0);
+
+	mutex_unlock(&info->mutex);
+
+	if (!bdev) {
+		xenbus_frontend_closed(xbdev);
+		return;
+	}
+
+	mutex_lock(&bdev->bd_mutex);
+
+	if (bdev->bd_openers) {
+		xenbus_dev_error(xbdev, -EBUSY,
+				 "Device in use; refusing to close");
+		xenbus_switch_state(xbdev, XenbusStateClosing);
+	} else {
+		xlvbd_release_gendisk(info);
+		xenbus_frontend_closed(xbdev);
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
+}
 
 /*
  * Invoked when the backend is finally 'ready' (and has told produced
@@ -868,11 +1110,31 @@
 	unsigned long sector_size;
 	unsigned int binfo;
 	int err;
+	int barrier;
 
-	if ((info->connected == BLKIF_STATE_CONNECTED) ||
-	    (info->connected == BLKIF_STATE_SUSPENDED) )
+	switch (info->connected) {
+	case BLKIF_STATE_CONNECTED:
+		/*
+		 * Potentially, the back-end may be signalling
+		 * a capacity change; update the capacity.
+		 */
+		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+				   "sectors", "%Lu", &sectors);
+		if (XENBUS_EXIST_ERR(err))
+			return;
+		printk(KERN_INFO "Setting capacity to %Lu\n",
+		       sectors);
+		set_capacity(info->gd, sectors);
+		revalidate_disk(info->gd);
+
+		/* fall through */
+	case BLKIF_STATE_SUSPENDED:
 		return;
 
+	default:
+		break;
+	}
+
 	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
 		__func__, info->xbdev->otherend);
 
@@ -889,10 +1151,26 @@
 	}
 
 	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "feature-barrier", "%d", &info->feature_barrier,
+			    "feature-barrier", "%d", &barrier,
 			    NULL);
+
+	/*
+	 * If there's no "feature-barrier" defined, then it means
+	 * we're dealing with a very old backend which writes
+	 * synchronously; draining will do what needs to get done.
+	 *
+	 * If there are barriers, then we can do full queued writes
+	 * with tagged barriers.
+	 *
+	 * If barriers are not supported, then there's not much we can
+	 * do, so just set ordering to NONE.
+	 */
 	if (err)
-		info->feature_barrier = 0;
+		info->feature_barrier = QUEUE_ORDERED_DRAIN;
+	else if (barrier)
+		info->feature_barrier = QUEUE_ORDERED_TAG;
+	else
+		info->feature_barrier = QUEUE_ORDERED_NONE;
 
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
@@ -904,10 +1182,10 @@
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	/* Kick pending requests. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
 	kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
 
@@ -915,57 +1193,21 @@
 }
 
 /**
- * Handle the change of state of the backend to Closing.  We must delete our
- * device-layer structures now, to ensure that writes are flushed through to
- * the backend.  Once is this done, we can switch to Closed in
- * acknowledgement.
- */
-static void blkfront_closing(struct xenbus_device *dev)
-{
-	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
-	unsigned long flags;
-
-	dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
-
-	if (info->rq == NULL)
-		goto out;
-
-	spin_lock_irqsave(&blkif_io_lock, flags);
-
-	/* No more blkif_request(). */
-	blk_stop_queue(info->rq);
-
-	/* No more gnttab callback work. */
-	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_scheduled_work();
-
-	blk_cleanup_queue(info->rq);
-	info->rq = NULL;
-
-	del_gendisk(info->gd);
-
- out:
-	xenbus_frontend_closed(dev);
-}
-
-/**
  * Callback received when the backend's state changes.
  */
-static void backend_changed(struct xenbus_device *dev,
+static void blkback_changed(struct xenbus_device *dev,
 			    enum xenbus_state backend_state)
 {
 	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
-	struct block_device *bd;
 
-	dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
+	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
 
 	switch (backend_state) {
 	case XenbusStateInitialising:
 	case XenbusStateInitWait:
 	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
 	case XenbusStateUnknown:
 	case XenbusStateClosed:
 		break;
@@ -975,35 +1217,56 @@
 		break;
 
 	case XenbusStateClosing:
-		if (info->gd == NULL) {
-			xenbus_frontend_closed(dev);
-			break;
-		}
-		bd = bdget_disk(info->gd, 0);
-		if (bd == NULL)
-			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-
-		mutex_lock(&bd->bd_mutex);
-		if (info->users > 0)
-			xenbus_dev_error(dev, -EBUSY,
-					 "Device in use; refusing to close");
-		else
-			blkfront_closing(dev);
-		mutex_unlock(&bd->bd_mutex);
-		bdput(bd);
+		blkfront_closing(info);
 		break;
 	}
 }
 
-static int blkfront_remove(struct xenbus_device *dev)
+static int blkfront_remove(struct xenbus_device *xbdev)
 {
-	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+	struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
+	struct block_device *bdev = NULL;
+	struct gendisk *disk;
 
-	dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
+	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
 
 	blkif_free(info, 0);
 
-	kfree(info);
+	mutex_lock(&info->mutex);
+
+	disk = info->gd;
+	if (disk)
+		bdev = bdget_disk(disk, 0);
+
+	info->xbdev = NULL;
+	mutex_unlock(&info->mutex);
+
+	if (!bdev) {
+		kfree(info);
+		return 0;
+	}
+
+	/*
+	 * The xbdev was removed before we reached the Closed
+	 * state. See if it's safe to remove the disk. If the bdev
+	 * isn't closed yet, we let release take care of it.
+	 */
+
+	mutex_lock(&bdev->bd_mutex);
+	info = disk->private_data;
+
+	dev_warn(disk_to_dev(disk),
+		 "%s was hot-unplugged, %d stale handles\n",
+		 xbdev->nodename, bdev->bd_openers);
+
+	if (info && !bdev->bd_openers) {
+		xlvbd_release_gendisk(info);
+		disk->private_data = NULL;
+		kfree(info);
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
 
 	return 0;
 }
@@ -1012,30 +1275,68 @@
 {
 	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
 
-	return info->is_ready;
+	return info->is_ready && info->xbdev;
 }
 
 static int blkif_open(struct block_device *bdev, fmode_t mode)
 {
-	struct blkfront_info *info = bdev->bd_disk->private_data;
-	info->users++;
-	return 0;
+	struct gendisk *disk = bdev->bd_disk;
+	struct blkfront_info *info;
+	int err = 0;
+
+	info = disk->private_data;
+	if (!info)
+		/* xbdev gone */
+		return -ERESTARTSYS;
+
+	mutex_lock(&info->mutex);
+
+	if (!info->gd)
+		/* xbdev is closed */
+		err = -ERESTARTSYS;
+
+	mutex_unlock(&info->mutex);
+
+	return err;
 }
 
 static int blkif_release(struct gendisk *disk, fmode_t mode)
 {
 	struct blkfront_info *info = disk->private_data;
-	info->users--;
-	if (info->users == 0) {
-		/* Check whether we have been instructed to close.  We will
-		   have ignored this request initially, as the device was
-		   still mounted. */
-		struct xenbus_device *dev = info->xbdev;
-		enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+	struct block_device *bdev;
+	struct xenbus_device *xbdev;
 
-		if (state == XenbusStateClosing && info->is_ready)
-			blkfront_closing(dev);
+	bdev = bdget_disk(disk, 0);
+	bdput(bdev);
+
+	if (bdev->bd_openers)
+		return 0;
+
+	/*
+	 * Check if we have been instructed to close. We will have
+	 * deferred this request, because the bdev was still open.
+	 */
+
+	mutex_lock(&info->mutex);
+	xbdev = info->xbdev;
+
+	if (xbdev && xbdev->state == XenbusStateClosing) {
+		/* pending switch to state closed */
+		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+		xlvbd_release_gendisk(info);
+		xenbus_frontend_closed(info->xbdev);
 	}
+
+	mutex_unlock(&info->mutex);
+
+	if (!xbdev) {
+		/* sudden device removal */
+		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+		xlvbd_release_gendisk(info);
+		disk->private_data = NULL;
+		kfree(info);
+	}
+
 	return 0;
 }
 
@@ -1061,7 +1362,7 @@
 	.probe = blkfront_probe,
 	.remove = blkfront_remove,
 	.resume = blkfront_resume,
-	.otherend_changed = backend_changed,
+	.otherend_changed = blkback_changed,
 	.is_ready = blkfront_is_ready,
 };
 
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index c496c8a..4064d95 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -18,6 +18,8 @@
 #include <asm/k8.h>
 #include <asm/gart.h>
 #include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
 
 /* NVIDIA K8 registers */
 #define NVIDIA_X86_64_0_APBASE		0x10
@@ -78,8 +80,21 @@
 	}
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+		phys_addr_t phys = page_to_phys(mem->pages[i]);
+		if (xen_pv_domain()) {
+			phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+					page_to_pfn(mem->pages[i])));
+			if (phys != xen_phys) {
+				printk(KERN_ERR "Fixing up GART: (0x%lx->0x%lx)." \
+					" CODE UNTESTED!\n",
+					(unsigned long)phys,
+					(unsigned long)xen_phys);
+				WARN_ON_ONCE(phys != xen_phys);
+				phys = xen_phys;
+			}
+		}
 		tmp = agp_bridge->driver->mask_memory(agp_bridge,
-						      page_to_phys(mem->pages[i]),
+						      phys,
 						      mask_type);
 
 		BUG_ON(tmp & 0xffffff0000000ffcULL);
@@ -181,6 +196,20 @@
 	unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
 	int i;
 
+	if (xen_pv_domain()) {
+		phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+				virt_to_pfn(agp_bridge->gatt_table_real)));
+		/* Future thoughts: Perhaps use the gatt_table_bus that
+	 * agp_generic_create_gatt_table has set up instead of
+		 * doing the virt_to_phys once more? */
+		if (gatt_bus != xen_phys) {
+			printk(KERN_ERR "Fixing up GATT: (0x%lx->0x%lx)." \
+					" CODE UNTESTED!\n", gatt_bus,
+					(unsigned long)xen_phys);
+			WARN_ON_ONCE(gatt_bus != xen_phys);
+			gatt_bus = xen_phys;
+		}
+	}
 	/* Configure AGP regs in each x86-64 host bridge. */
         for (i = 0; i < num_k8_northbridges; i++) {
 		agp_bridge->gart_bus_addr =
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index a56ca08..30fc4b6 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -38,6 +38,8 @@
 #include <linux/vmalloc.h>
 #include <asm/io.h>
 #include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
 
 /* Due to XFree86 brain-damage, we can't go to 1.0 until they
  * fix some real stupidity. It's only by chance we can bump
@@ -160,8 +162,13 @@
 			}
 		} else {
 			bridge->scratch_page_dma = page_to_phys(page);
+			if (xen_pv_domain()) {
+				phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+							page_to_pfn(page)));
+				if (bridge->scratch_page_dma != xen_phys)
+					bridge->scratch_page_dma = xen_phys;
+			}
 		}
-
 		bridge->scratch_page = bridge->driver->mask_memory(bridge,
 						   bridge->scratch_page_dma, 0);
 	}
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 0d8d60c..c20ef5c 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -42,6 +42,8 @@
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 #include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
 
 __u32 *agp_gatt_table;
 int agp_memory_reserved;
@@ -1008,6 +1010,14 @@
 		return -ENOMEM;
 	}
 	bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real);
+	/* KRW: virt_to_phys under Xen is not safe. */
+	if (xen_pv_domain()) {
+		/* Use back-door to get the "real" PFN. */
+		phys_addr_t pfn = virt_to_pfn(bridge->gatt_table_real);
+		phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(pfn));
+		if (bridge->gatt_bus_addr != xen_phys)
+			bridge->gatt_bus_addr = xen_phys;
+	}
 
 	/* AK: bogus, should encode addresses > 4GB */
 	for (i = 0; i < num_entries; i++) {
@@ -1147,8 +1157,17 @@
 	}
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+		phys_addr_t phys = page_to_phys(mem->pages[i]);
+
+		/* HACK: Via a back-door we get the bus address. */
+		if (xen_pv_domain()) {
+			phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+					page_to_pfn(mem->pages[i])));
+			if (phys != xen_phys)
+				phys = xen_phys;
+		}
 		writel(bridge->driver->mask_memory(bridge,
-						   page_to_phys(mem->pages[i]),
+						   phys,
 						   mask_type),
 		       bridge->gatt_table+j);
 	}
@@ -1246,7 +1265,16 @@
 	int i, ret = -ENOMEM;
 
 	for (i = 0; i < num_pages; i++) {
-		page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+		if (xen_pv_domain()) {
+			void *addr;
+			dma_addr_t _d;
+
+			addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
+			if (!addr)
+				goto out;
+			page = virt_to_page(addr);
+		} else
+			page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
 		/* agp_free_memory() needs gart address */
 		if (page == NULL)
 			goto out;
@@ -1274,7 +1302,17 @@
 {
 	struct page * page;
 
-	page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+	if (xen_pv_domain()) {
+		void *addr;
+		dma_addr_t _d;
+
+		addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
+		if (!addr)
+			return NULL;
+		page = virt_to_page(addr);
+	} else
+		page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+
 	if (page == NULL)
 		return NULL;
 
@@ -1305,7 +1343,12 @@
 		unmap_page_from_agp(page);
 #endif
 		put_page(page);
-		__free_page(page);
+		if (xen_pv_domain()) {
+			void *addr = page_address(page);
+			dma_free_coherent(NULL, PAGE_SIZE, addr,
+					  virt_to_bus(addr));
+		} else
+			__free_page(page);
 		atomic_dec(&agp_bridge->current_memory_agp);
 		mem->pages[i] = NULL;
 	}
@@ -1322,7 +1365,12 @@
 
 	if (flags & AGP_PAGE_DESTROY_FREE) {
 		put_page(page);
-		__free_page(page);
+		if (xen_pv_domain()) {
+			void *addr = page_address(page);
+			dma_free_coherent(NULL, PAGE_SIZE, addr,
+					  virt_to_bus(addr));
+		} else
+			__free_page(page);
 		atomic_dec(&agp_bridge->current_memory_agp);
 	}
 }
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index b8e0219..7a62c3c 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -10,14 +10,20 @@
 #include <linux/agp_backend.h>
 #include <asm/smp.h>
 #include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
 
 /*
  * If we have Intel graphics, we're not going to have anything other than
  * an Intel IOMMU. So make the correct use of the PCI DMA API contingent
  * on the Intel IOMMU support (CONFIG_DMAR).
  * Only newer chipsets need to bother with this, of course.
+ *
+ * Xen guests accessing graphics hardware also need proper translation
+ * between pseudo-physical addresses and real machine addresses, which
+ * is also achieved by using the DMA API.
  */
-#ifdef CONFIG_DMAR
+#if defined(CONFIG_DMAR) || defined(CONFIG_XEN)
 #define USE_PCI_DMA_API 1
 #endif
 
@@ -296,8 +302,21 @@
 	int i, j;
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+		phys_addr_t phys = page_to_phys(mem->pages[i]);
+		if (xen_pv_domain()) {
+			phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+					page_to_pfn(mem->pages[i])));
+			if (xen_phys != phys) {
+				printk(KERN_ERR "Compile kernel with " \
+					"CONFIG_DMAR to get rid of this " \
+					"warning!\n");
+				WARN_ON_ONCE(xen_phys != phys);
+				/* Fixup: */
+				phys = xen_phys;
+			}
+		}
 		writel(agp_bridge->driver->mask_memory(agp_bridge,
-				page_to_phys(mem->pages[i]), mask_type),
+				phys, mask_type),
 		       intel_private.gtt+j);
 	}
 
@@ -395,15 +413,19 @@
 /* Exists to support ARGB cursors */
 static struct page *i8xx_alloc_pages(void)
 {
+	void *addr;
+	dma_addr_t _d;
 	struct page *page;
 
-	page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
-	if (page == NULL)
+	addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
+	if (addr == NULL)
 		return NULL;
 
+	page = virt_to_page(addr);
+
 	if (set_pages_uc(page, 4) < 0) {
 		set_pages_wb(page, 4);
-		__free_pages(page, 2);
+		dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
 		return NULL;
 	}
 	get_page(page);
@@ -413,12 +435,17 @@
 
 static void i8xx_destroy_pages(struct page *page)
 {
+	void *addr;
+
 	if (page == NULL)
 		return;
 
 	set_pages_wb(page, 4);
 	put_page(page);
-	__free_pages(page, 2);
+
+	addr = page_address(page);
+
+	dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr));
 	atomic_dec(&agp_bridge->current_memory_agp);
 }
 
@@ -478,8 +505,16 @@
 		if (!mem->is_flushed)
 			global_cache_flush();
 		for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+			phys_addr_t phys = page_to_phys(mem->pages[i]);
+			if (xen_pv_domain()) {
+				phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+						page_to_pfn(mem->pages[i])));
+				/* Fixup: */
+				if (xen_phys != phys)
+					phys = xen_phys;
+			}
 			writel(agp_bridge->driver->mask_memory(agp_bridge,
-					page_to_phys(mem->pages[i]), mask_type),
+					phys, mask_type),
 			       intel_private.registers+I810_PTE_BASE+(j*4));
 		}
 		readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
@@ -552,6 +587,12 @@
 	new->num_scratch_pages = pg_count;
 	new->type = AGP_PHYS_MEMORY;
 	new->physical = page_to_phys(new->pages[0]);
+	if (xen_pv_domain()) {
+		phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+					page_to_pfn(new->pages[0])));
+		if (xen_phys != new->physical)
+			new->physical = xen_phys;
+	}
 	return new;
 }
 
@@ -992,8 +1033,16 @@
 		global_cache_flush();
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+		phys_addr_t phys = page_to_phys(mem->pages[i]);
+		if (xen_pv_domain()) {
+			phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+					page_to_pfn(mem->pages[i])));
+			/* Fixup: */
+			if (xen_phys != phys)
+				phys = xen_phys;
+		}
 		writel(agp_bridge->driver->mask_memory(agp_bridge,
-				page_to_phys(mem->pages[i]), mask_type),
+				phys, mask_type),
 		       intel_private.registers+I810_PTE_BASE+(j*4));
 	}
 	readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
index a6ee32b..5620de5 100644
--- a/drivers/char/hvc_xen.c
+++ b/drivers/char/hvc_xen.c
@@ -25,6 +25,8 @@
 #include <linux/types.h>
 
 #include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
 #include <xen/page.h>
 #include <xen/events.h>
 #include <xen/interface/io/console.h>
@@ -72,11 +74,12 @@
 	wmb();			/* write ring before updating pointer */
 	intf->out_prod = prod;
 
-	notify_daemon();
+	if (sent)
+		notify_daemon();
 	return sent;
 }
 
-static int write_console(uint32_t vtermno, const char *data, int len)
+static int domU_write_console(uint32_t vtermno, const char *data, int len)
 {
 	int ret = len;
 
@@ -99,7 +102,7 @@
 	return ret;
 }
 
-static int read_console(uint32_t vtermno, char *buf, int len)
+static int domU_read_console(uint32_t vtermno, char *buf, int len)
 {
 	struct xencons_interface *intf = xencons_interface();
 	XENCONS_RING_IDX cons, prod;
@@ -120,28 +123,65 @@
 	return recv;
 }
 
-static struct hv_ops hvc_ops = {
-	.get_chars = read_console,
-	.put_chars = write_console,
+static struct hv_ops domU_hvc_ops = {
+	.get_chars = domU_read_console,
+	.put_chars = domU_write_console,
 	.notifier_add = notifier_add_irq,
 	.notifier_del = notifier_del_irq,
 	.notifier_hangup = notifier_hangup_irq,
 };
 
-static int __init xen_init(void)
+static int dom0_read_console(uint32_t vtermno, char *buf, int len)
+{
+	return HYPERVISOR_console_io(CONSOLEIO_read, len, buf);
+}
+
+/*
+ * Either for a dom0 to write to the system console, or a domU with a
+ * debug version of Xen
+ */
+static int dom0_write_console(uint32_t vtermno, const char *str, int len)
+{
+	int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
+	if (rc < 0)
+		return 0;
+
+	return len;
+}
+
+static struct hv_ops dom0_hvc_ops = {
+	.get_chars = dom0_read_console,
+	.put_chars = dom0_write_console,
+	.notifier_add = notifier_add_irq,
+	.notifier_del = notifier_del_irq,
+	.notifier_hangup = notifier_hangup_irq,
+};
+
+static int __init xen_hvc_init(void)
 {
 	struct hvc_struct *hp;
+	struct hv_ops *ops;
 
-	if (!xen_pv_domain() ||
-	    xen_initial_domain() ||
-	    !xen_start_info->console.domU.evtchn)
+	if (!xen_pv_domain())
 		return -ENODEV;
 
-	xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
+	if (xen_initial_domain()) {
+		ops = &dom0_hvc_ops;
+		xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
+	} else {
+		if (!xen_start_info->console.domU.evtchn)
+			return -ENODEV;
+
+		ops = &domU_hvc_ops;
+		xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
+	}
+
 	if (xencons_irq < 0)
 		xencons_irq = 0; /* NO_IRQ */
+	else
+		set_irq_noprobe(xencons_irq);
 
-	hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
+	hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256);
 	if (IS_ERR(hp))
 		return PTR_ERR(hp);
 
@@ -158,7 +198,7 @@
 		rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq);
 }
 
-static void __exit xen_fini(void)
+static void __exit xen_hvc_fini(void)
 {
 	if (hvc)
 		hvc_remove(hvc);
@@ -166,29 +206,24 @@
 
 static int xen_cons_init(void)
 {
+	struct hv_ops *ops;
+
 	if (!xen_pv_domain())
 		return 0;
 
-	hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
+	ops = &domU_hvc_ops;
+	if (xen_initial_domain())
+		ops = &dom0_hvc_ops;
+
+	hvc_instantiate(HVC_COOKIE, 0, ops);
+
 	return 0;
 }
 
-module_init(xen_init);
-module_exit(xen_fini);
+module_init(xen_hvc_init);
+module_exit(xen_hvc_fini);
 console_initcall(xen_cons_init);
 
-static void raw_console_write(const char *str, int len)
-{
-	while(len > 0) {
-		int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
-		if (rc <= 0)
-			break;
-
-		str += rc;
-		len -= rc;
-	}
-}
-
 #ifdef CONFIG_EARLY_PRINTK
 static void xenboot_write_console(struct console *console, const char *string,
 				  unsigned len)
@@ -196,19 +231,22 @@
 	unsigned int linelen, off = 0;
 	const char *pos;
 
-	raw_console_write(string, len);
+	dom0_write_console(0, string, len);
 
-	write_console(0, "(early) ", 8);
+	if (xen_initial_domain())
+		return;
+
+	domU_write_console(0, "(early) ", 8);
 	while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
 		linelen = pos-string+off;
 		if (off + linelen > len)
 			break;
-		write_console(0, string+off, linelen);
-		write_console(0, "\r\n", 2);
+		domU_write_console(0, string+off, linelen);
+		domU_write_console(0, "\r\n", 2);
 		off += linelen + 1;
 	}
 	if (off < len)
-		write_console(0, string+off, len-off);
+		domU_write_console(0, string+off, len-off);
 }
 
 struct console xenboot_console = {
@@ -220,7 +258,7 @@
 
 void xen_raw_console_write(const char *str)
 {
-	raw_console_write(str, strlen(str));
+	dom0_write_console(0, str, strlen(str));
 }
 
 void xen_raw_printk(const char *fmt, ...)
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index cbaf420..163459d 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -8,7 +8,6 @@
 
 #include <linux/bug.h>
 #include <linux/device.h>
-#include <linux/ethtool.h>
 #include <linux/firewire.h>
 #include <linux/firewire-constants.h>
 #include <linux/highmem.h>
@@ -1333,17 +1332,6 @@
 	return 0;
 }
 
-static void fwnet_get_drvinfo(struct net_device *net,
-			      struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, KBUILD_MODNAME);
-	strcpy(info->bus_info, "ieee1394");
-}
-
-static const struct ethtool_ops fwnet_ethtool_ops = {
-	.get_drvinfo = fwnet_get_drvinfo,
-};
-
 static const struct net_device_ops fwnet_netdev_ops = {
 	.ndo_open       = fwnet_open,
 	.ndo_stop	= fwnet_stop,
@@ -1362,7 +1350,6 @@
 	net->hard_header_len	= FWNET_HLEN;
 	net->type		= ARPHRD_IEEE1394;
 	net->tx_queue_len	= 10;
-	SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops);
 }
 
 /* caller must hold fwnet_device_mutex */
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 0e27d98..f5e2572 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -201,7 +201,7 @@
 	}
 	if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
 	    !drm_core_check_feature(dev, DRIVER_MODESET)) {
-		drm_sg_cleanup(dev->sg);
+		drm_sg_cleanup(dev, dev->sg);
 		dev->sg = NULL;
 	}
 
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 8bf3770..dde5f66 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -539,7 +539,7 @@
 	vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
 	vma->vm_ops = obj->dev->driver->gem_vm_ops;
 	vma->vm_private_data = map->handle;
-	vma->vm_page_prot =  pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+	vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
 
 	/* Take a ref for this mapping of the object, so that the fault
 	 * handler can dereference the mmap offset's pointer to the object.
diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
index c7823c8..95ffb8a 100644
--- a/drivers/gpu/drm/drm_scatter.c
+++ b/drivers/gpu/drm/drm_scatter.c
@@ -32,20 +32,73 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "drmP.h"
 
 #define DEBUG_SCATTER 0
 
-static inline void *drm_vmalloc_dma(unsigned long size)
+static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size)
 {
 #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
 	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE);
 #else
-	return vmalloc_32(size);
+	struct device *dev = &drmdev->pdev->dev;
+	struct page **pages;
+	void *addr;
+	const int npages = PFN_UP(size);
+	int i;
+
+	pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		goto fail;
+
+	for (i = 0; i < npages; i++) {
+		dma_addr_t phys;
+		void *addr;
+		addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL);
+		if (addr == NULL)
+			goto out_free_pages;
+
+		pages[i] = virt_to_page(addr);
+	}
+
+	addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
+
+	kfree(pages);
+
+	return addr;
+
+out_free_pages:
+	while (i > 0) {
+		void *addr = page_address(pages[--i]);
+		dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
+	}
+
+	kfree(pages);
+
+fail:
+	return NULL;
 #endif
 }
 
-void drm_sg_cleanup(struct drm_sg_mem * entry)
+static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages,
+			  struct page **pages)
+{
+#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
+	vfree(addr);
+#else
+	struct device *dev = &drmdev->pdev->dev;
+	int i;
+
+	for (i = 0; i < npages; i++) {
+		void *addr = page_address(pages[i]);
+		dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
+	}
+	vunmap(addr);
+#endif
+}
+
+void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry)
 {
 	struct page *page;
 	int i;
@@ -56,7 +109,7 @@
 			ClearPageReserved(page);
 	}
 
-	vfree(entry->virtual);
+	drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist);
 
 	kfree(entry->busaddr);
 	kfree(entry->pagelist);
@@ -107,7 +160,7 @@
 	}
 	memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr));
 
-	entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
+	entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT);
 	if (!entry->virtual) {
 		kfree(entry->busaddr);
 		kfree(entry->pagelist);
@@ -180,7 +233,7 @@
 	return 0;
 
       failed:
-	drm_sg_cleanup(entry);
+	drm_sg_cleanup(dev, entry);
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(drm_sg_alloc);
@@ -212,7 +265,7 @@
 
 	DRM_DEBUG("virtual  = %p\n", entry->virtual);
 
-	drm_sg_cleanup(entry);
+	drm_sg_cleanup(dev, entry);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 1c040d0..e3555bf 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -87,6 +87,9 @@
 	bool is_iomem;
 	unsigned long address = (unsigned long)vmf->virtual_address;
 	int retval = VM_FAULT_NOPAGE;
+	bool vm_io = (vma->vm_flags & VM_IO) && VM_IO;
+	bool pte_iomap = (pgprot_val(vma->vm_page_prot) & _PAGE_IOMAP)
+			&& _PAGE_IOMAP;
 
 	/*
 	 * Work around locking order reversal in fault / nopfn
@@ -158,11 +161,30 @@
 	if (is_iomem) {
 		vma->vm_page_prot = ttm_io_prot(bo->mem.placement,
 						vma->vm_page_prot);
+		if (!vm_io || !pte_iomap) {
+			vma->vm_flags |= VM_IO;
+			pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
+		}
 	} else {
 		ttm = bo->ttm;
 		vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
 		    vm_get_page_prot(vma->vm_flags) :
 		    ttm_io_prot(bo->mem.placement, vma->vm_page_prot);
+		/*
+		 * During PCI suspend the graphics cards purge their VRAM and
+		 * move their graphics objects to the TT. They also unmap all
+		 * of the objects, meaning that when a user application is
+		 * unfrozen it will re-fault and call here.
+		 *
+		 * What this means is that the VMA for the graphics object might
+		 * have been set for VRAM TTM but now it is with the TT
+		 * (normal RAM), meaning that the vma->vm_flags could be
+		 * inappropriate (say, VM_IO on TT - no good).
+		 */
+		if (vm_io || pte_iomap) {
+			vma->vm_flags &= ~VM_IO;
+			pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP;
+		}
 	}
 
 	/*
@@ -239,6 +261,7 @@
 {
 	struct ttm_bo_driver *driver;
 	struct ttm_buffer_object *bo;
+	struct ttm_mem_type_manager *man;
 	int ret;
 
 	read_lock(&bdev->vm_lock);
@@ -271,7 +294,11 @@
 	 */
 
 	vma->vm_private_data = bo;
-	vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
+	vma->vm_flags |= VM_RESERVED | VM_MIXEDMAP | VM_DONTEXPAND;
+	man = &bdev->man[bo->mem.mem_type];
+	if (man->flags & TTM_MEMTYPE_FLAG_NEEDS_IOREMAP)
+		vma->vm_flags |= VM_IO;
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	return 0;
 out_unref:
 	ttm_bo_unref(&bo);
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 3d5b8b0..8b05e38 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -38,7 +38,8 @@
 #include "ttm/ttm_module.h"
 #include "ttm/ttm_bo_driver.h"
 #include "ttm/ttm_placement.h"
-
+#include <linux/dma-mapping.h>
+#include <xen/xen.h>
 static int ttm_tt_swapin(struct ttm_tt *ttm);
 
 /**
@@ -84,6 +85,16 @@
 	else
 		gfp_flags |= __GFP_HIGHMEM;
 
+	if ((page_flags & TTM_PAGE_FLAG_DMA32) && xen_pv_domain())
+	{
+		void *addr;
+		dma_addr_t _d;
+
+		addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
+		if (addr == NULL)
+			return NULL;
+		return virt_to_page(addr);
+	}
 	return alloc_page(gfp_flags);
 }
 
@@ -286,6 +297,7 @@
 	int i;
 	struct page *cur_page;
 	struct ttm_backend *be = ttm->be;
+	void *addr;
 
 	if (be)
 		be->func->clear(be);
@@ -300,7 +312,16 @@
 				       "Leaking pages.\n");
 			ttm_mem_global_free_page(ttm->glob->mem_glob,
 						 cur_page);
-			__free_page(cur_page);
+
+			if ((ttm->page_flags & TTM_PAGE_FLAG_DMA32) &&
+				xen_pv_domain()) {
+				addr = page_address(cur_page);
+				WARN_ON(!addr);
+				if (addr)
+					dma_free_coherent(NULL, PAGE_SIZE, addr,
+						  virt_to_bus(addr));
+			} else
+				__free_page(cur_page);
 		}
 	}
 	ttm->state = tt_unpopulated;
diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index a4e9dcb..62ab09e 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -58,7 +58,6 @@
 #include <linux/tcp.h>
 #include <linux/skbuff.h>
 #include <linux/bitops.h>
-#include <linux/ethtool.h>
 #include <asm/uaccess.h>
 #include <asm/delay.h>
 #include <asm/unaligned.h>
@@ -173,8 +172,6 @@
 				struct net_device *dev);
 static void ether1394_iso(struct hpsb_iso *iso);
 
-static const struct ethtool_ops ethtool_ops;
-
 static int ether1394_write(struct hpsb_host *host, int srcid, int destid,
 			   quadlet_t *data, u64 addr, size_t len, u16 flags);
 static void ether1394_add_host(struct hpsb_host *host);
@@ -525,8 +522,6 @@
 	dev->header_ops		= &ether1394_header_ops;
 	dev->netdev_ops		= &ether1394_netdev_ops;
 
-	SET_ETHTOOL_OPS(dev, &ethtool_ops);
-
 	dev->watchdog_timeo	= ETHER1394_TIMEOUT;
 	dev->flags		= IFF_BROADCAST | IFF_MULTICAST;
 	dev->features		= NETIF_F_HIGHDMA;
@@ -1698,17 +1693,6 @@
 	return NETDEV_TX_OK;
 }
 
-static void ether1394_get_drvinfo(struct net_device *dev,
-				  struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, driver_name);
-	strcpy(info->bus_info, "ieee1394"); /* FIXME provide more detail? */
-}
-
-static const struct ethtool_ops ethtool_ops = {
-	.get_drvinfo = ether1394_get_drvinfo
-};
-
 static int __init ether1394_init_module(void)
 {
 	int err;
diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
index 480d5d2..546838f 100644
--- a/drivers/input/xen-kbdfront.c
+++ b/drivers/input/xen-kbdfront.c
@@ -21,7 +21,10 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/input.h>
+
 #include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
 #include <xen/events.h>
 #include <xen/page.h>
 #include <xen/interface/io/fbif.h>
@@ -286,6 +289,8 @@
 	switch (backend_state) {
 	case XenbusStateInitialising:
 	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
 	case XenbusStateUnknown:
 	case XenbusStateClosed:
 		break;
@@ -348,7 +353,7 @@
 
 static int __init xenkbd_init(void)
 {
-	if (!xen_domain())
+	if (!xen_domain() || xen_hvm_domain())
 		return -ENODEV;
 
 	/* Nothing to do if running in dom0. */
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b2f71f7..b7feb84 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2787,6 +2787,7 @@
 config XEN_NETDEV_FRONTEND
 	tristate "Xen network device frontend driver"
 	depends on XEN
+	select XEN_XENBUS_FRONTEND
 	default y
 	help
 	  The network device frontend driver allows the kernel to
diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c
index 406f064..c063b53 100644
--- a/drivers/net/bmac.c
+++ b/drivers/net/bmac.c
@@ -1236,15 +1236,8 @@
 	}
 	spin_unlock_irqrestore(&bp->lock, flags);
 }
-static void bmac_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
-{
-	struct bmac_data *bp = netdev_priv(dev);
-	strcpy(info->driver, "bmac");
-	strcpy(info->bus_info, dev_name(&bp->mdev->ofdev.dev));
-}
 
 static const struct ethtool_ops bmac_ethtool_ops = {
-	.get_drvinfo		= bmac_get_drvinfo,
 	.get_link		= ethtool_op_get_link,
 };
 
diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c
index 66dace6..8238fa2 100644
--- a/drivers/net/fec_mpc52xx.c
+++ b/drivers/net/fec_mpc52xx.c
@@ -772,11 +772,6 @@
 
 
 /* ethtool interface */
-static void mpc52xx_fec_get_drvinfo(struct net_device *dev,
-		struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, DRIVER_NAME);
-}
 
 static int mpc52xx_fec_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
@@ -811,7 +806,6 @@
 }
 
 static const struct ethtool_ops mpc52xx_fec_ethtool_ops = {
-	.get_drvinfo = mpc52xx_fec_get_drvinfo,
 	.get_settings = mpc52xx_fec_get_settings,
 	.set_settings = mpc52xx_fec_set_settings,
 	.get_link = ethtool_op_get_link,
diff --git a/drivers/net/pasemi_mac_ethtool.c b/drivers/net/pasemi_mac_ethtool.c
index 28a8622..29ff9ad 100644
--- a/drivers/net/pasemi_mac_ethtool.c
+++ b/drivers/net/pasemi_mac_ethtool.c
@@ -77,21 +77,6 @@
 	return phy_ethtool_gset(phydev, cmd);
 }
 
-static void
-pasemi_mac_ethtool_get_drvinfo(struct net_device *netdev,
-			       struct ethtool_drvinfo *drvinfo)
-{
-	struct pasemi_mac *mac;
-	mac = netdev_priv(netdev);
-
-	/* clear and fill out info */
-	memset(drvinfo, 0, sizeof(struct ethtool_drvinfo));
-	strncpy(drvinfo->driver, "pasemi_mac", 12);
-	strcpy(drvinfo->version, "N/A");
-	strcpy(drvinfo->fw_version, "N/A");
-	strncpy(drvinfo->bus_info, pci_name(mac->pdev), 32);
-}
-
 static u32
 pasemi_mac_ethtool_get_msglevel(struct net_device *netdev)
 {
@@ -150,7 +135,6 @@
 
 const struct ethtool_ops pasemi_mac_ethtool_ops = {
 	.get_settings		= pasemi_mac_ethtool_get_settings,
-	.get_drvinfo		= pasemi_mac_ethtool_get_drvinfo,
 	.get_msglevel		= pasemi_mac_ethtool_get_msglevel,
 	.set_msglevel		= pasemi_mac_ethtool_set_msglevel,
 	.get_link		= ethtool_op_get_link,
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index b58965a..7f9a4f4 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -83,7 +83,6 @@
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
 #include <linux/ioport.h>
-#include <linux/ethtool.h>
 #include <linux/bitops.h>
 #include <linux/mii.h>
 
@@ -249,7 +248,6 @@
 static int el3_close(struct net_device *dev);
 static void el3_tx_timeout(struct net_device *dev);
 static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-static const struct ethtool_ops netdev_ethtool_ops;
 static void set_rx_mode(struct net_device *dev);
 static void set_multicast_list(struct net_device *dev);
 
@@ -300,7 +298,6 @@
 	link->conf.ConfigIndex = 1;
 
 	dev->netdev_ops = &el3_netdev_ops;
-	SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
 	dev->watchdog_timeo = TX_TIMEOUT;
 
 	return tc574_config(link);
@@ -1083,16 +1080,6 @@
 	return worklimit;
 }
 
-static void netdev_get_drvinfo(struct net_device *dev,
-			       struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, "3c574_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo		= netdev_get_drvinfo,
-};
-
 /* Provide ioctl() calls to examine the MII xcvr state. */
 static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 3131a59..40e5e7c 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -33,7 +33,6 @@
 #include <linux/timer.h>
 #include <linux/delay.h>
 #include <linux/spinlock.h>
-#include <linux/ethtool.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
@@ -98,7 +97,6 @@
 static struct net_device_stats *get_stats(struct net_device *dev);
 static void set_multicast_list(struct net_device *dev);
 static void axnet_tx_timeout(struct net_device *dev);
-static const struct ethtool_ops netdev_ethtool_ops;
 static irqreturn_t ei_irq_wrapper(int irq, void *dev_id);
 static void ei_watchdog(u_long arg);
 static void axnet_reset_8390(struct net_device *dev);
@@ -186,7 +184,6 @@
 
     dev->netdev_ops = &axnet_netdev_ops;
 
-    SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
     dev->watchdog_timeo = TX_TIMEOUT;
 
     return axnet_config(link);
@@ -683,16 +680,6 @@
     add_timer(&info->watchdog);
 }
 
-static void netdev_get_drvinfo(struct net_device *dev,
-			       struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, "axnet_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo		= netdev_get_drvinfo,
-};
-
 /*====================================================================*/
 
 static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c
index 06618af..db0c890 100644
--- a/drivers/net/pcmcia/ibmtr_cs.c
+++ b/drivers/net/pcmcia/ibmtr_cs.c
@@ -52,7 +52,6 @@
 #include <linux/string.h>
 #include <linux/timer.h>
 #include <linux/module.h>
-#include <linux/ethtool.h>
 #include <linux/netdevice.h>
 #include <linux/trdevice.h>
 #include <linux/ibmtr.h>
@@ -120,16 +119,6 @@
     struct tok_info	*ti;
 } ibmtr_dev_t;
 
-static void netdev_get_drvinfo(struct net_device *dev,
-			       struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, "ibmtr_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo		= netdev_get_drvinfo,
-};
-
 /*======================================================================
 
     ibmtr_attach() creates an "instance" of the driver, allocating
@@ -170,8 +159,6 @@
 
     link->irq.Instance = info->dev = dev;
 
-    SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
-
     return ibmtr_config(link);
 } /* ibmtr_attach */
 
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 4696844..02e2d05 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -36,7 +36,6 @@
 #include <linux/string.h>
 #include <linux/timer.h>
 #include <linux/delay.h>
-#include <linux/ethtool.h>
 #include <linux/netdevice.h>
 #include <linux/log2.h>
 #include <linux/etherdevice.h>
@@ -111,7 +110,6 @@
 static int pcnet_open(struct net_device *dev);
 static int pcnet_close(struct net_device *dev);
 static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-static const struct ethtool_ops netdev_ethtool_ops;
 static irqreturn_t ei_irq_wrapper(int irq, void *dev_id);
 static void ei_watchdog(u_long arg);
 static void pcnet_reset_8390(struct net_device *dev);
@@ -654,8 +652,6 @@
     ei_status.word16 = 1;
     ei_status.reset_8390 = &pcnet_reset_8390;
 
-    SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
-
     if (info->flags & (IS_DL10019|IS_DL10022))
 	mii_phy_probe(dev);
 
@@ -1175,18 +1171,6 @@
 
 /*====================================================================*/
 
-static void netdev_get_drvinfo(struct net_device *dev,
-			       struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, "pcnet_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo		= netdev_get_drvinfo,
-};
-
-/*====================================================================*/
-
 
 static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c
index 8d60300..0926832 100644
--- a/drivers/net/sc92031.c
+++ b/drivers/net/sc92031.c
@@ -1255,16 +1255,6 @@
 	return 0;
 }
 
-static void sc92031_ethtool_get_drvinfo(struct net_device *dev,
-		struct ethtool_drvinfo *drvinfo)
-{
-	struct sc92031_priv *priv = netdev_priv(dev);
-	struct pci_dev *pdev = priv->pdev;
-
-	strcpy(drvinfo->driver, SC92031_NAME);
-	strcpy(drvinfo->bus_info, pci_name(pdev));
-}
-
 static void sc92031_ethtool_get_wol(struct net_device *dev,
 		struct ethtool_wolinfo *wolinfo)
 {
@@ -1386,7 +1376,6 @@
 static const struct ethtool_ops sc92031_ethtool_ops = {
 	.get_settings		= sc92031_ethtool_get_settings,
 	.set_settings		= sc92031_ethtool_set_settings,
-	.get_drvinfo		= sc92031_ethtool_get_drvinfo,
 	.get_wol		= sc92031_ethtool_get_wol,
 	.set_wol		= sc92031_ethtool_set_wol,
 	.nway_reset		= sc92031_ethtool_nway_reset,
diff --git a/drivers/net/tulip/xircom_cb.c b/drivers/net/tulip/xircom_cb.c
index 0f2ca598..44159be 100644
--- a/drivers/net/tulip/xircom_cb.c
+++ b/drivers/net/tulip/xircom_cb.c
@@ -27,7 +27,6 @@
 #include <linux/skbuff.h>
 #include <linux/delay.h>
 #include <linux/init.h>
-#include <linux/ethtool.h>
 #include <linux/bitops.h>
 
 #include <asm/uaccess.h>
@@ -179,19 +178,6 @@
 }
 #endif
 
-static void netdev_get_drvinfo(struct net_device *dev,
-			       struct ethtool_drvinfo *info)
-{
-	struct xircom_private *private = netdev_priv(dev);
-
-	strcpy(info->driver, "xircom_cb");
-	strcpy(info->bus_info, pci_name(private->pdev));
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo		= netdev_get_drvinfo,
-};
-
 static const struct net_device_ops netdev_ops = {
 	.ndo_open		= xircom_open,
 	.ndo_stop		= xircom_close,
@@ -277,7 +263,6 @@
 	setup_descriptors(private);
 
 	dev->netdev_ops = &netdev_ops;
-	SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
 	pci_set_drvdata(pdev, dev);
 
 	if (register_netdev(dev)) {
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index f450bc9..2109514 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -820,17 +820,7 @@
 	return NETDEV_TX_OK;
 }
 
-static void hso_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info)
-{
-	struct hso_net *odev = netdev_priv(net);
-
-	strncpy(info->driver, driver_name, ETHTOOL_BUSINFO_LEN);
-	strncpy(info->version, DRIVER_VERSION, ETHTOOL_BUSINFO_LEN);
-	usb_make_path(odev->parent->usb, info->bus_info, sizeof info->bus_info);
-}
-
 static const struct ethtool_ops ops = {
-	.get_drvinfo = hso_get_drvinfo,
 	.get_link = ethtool_op_get_link
 };
 
diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c
index e391ef9..47d1926 100644
--- a/drivers/net/usb/kaweth.c
+++ b/drivers/net/usb/kaweth.c
@@ -767,14 +767,6 @@
 	return 0;
 }
 
-static void kaweth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
-{
-	struct kaweth_device *kaweth = netdev_priv(dev);
-
-	strlcpy(info->driver, driver_name, sizeof(info->driver));
-	usb_make_path(kaweth->dev, info->bus_info, sizeof (info->bus_info));
-}
-
 static u32 kaweth_get_link(struct net_device *dev)
 {
 	struct kaweth_device *kaweth = netdev_priv(dev);
@@ -783,7 +775,6 @@
 }
 
 static const struct ethtool_ops ops = {
-	.get_drvinfo	= kaweth_get_drvinfo,
 	.get_link	= kaweth_get_link
 };
 
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 1c88c2e..2e65100 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -44,7 +44,6 @@
 #include <linux/if_arp.h>
 #include <linux/ioport.h>
 #include <linux/skbuff.h>
-#include <linux/ethtool.h>
 #include <linux/ieee80211.h>
 
 #include <pcmcia/cs_types.h>
@@ -101,8 +100,6 @@
 static struct net_device_stats *ray_get_stats(struct net_device *dev);
 static int ray_dev_init(struct net_device *dev);
 
-static const struct ethtool_ops netdev_ethtool_ops;
-
 static int ray_open(struct net_device *dev);
 static netdev_tx_t ray_dev_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev);
@@ -362,7 +359,6 @@
 
 	/* Raylink entries in the device structure */
 	dev->netdev_ops = &ray_netdev_ops;
-	SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
 	dev->wireless_handlers = &ray_handler_def;
 #ifdef WIRELESS_SPY
 	local->wireless_data.spy_data = &local->spy_data;
@@ -1106,18 +1102,6 @@
 	}
 } /* end encapsulate_frame */
 
-/*===========================================================================*/
-
-static void netdev_get_drvinfo(struct net_device *dev,
-			       struct ethtool_drvinfo *info)
-{
-	strcpy(info->driver, "ray_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo = netdev_get_drvinfo,
-};
-
 /*====================================================================*/
 
 /*------------------------------------------------------------------*/
diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c
index 4f1e0cf..22b2b43 100644
--- a/drivers/net/wireless/wl3501_cs.c
+++ b/drivers/net/wireless/wl3501_cs.c
@@ -29,7 +29,6 @@
 
 #include <linux/delay.h>
 #include <linux/types.h>
-#include <linux/ethtool.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/in.h>
@@ -1436,15 +1435,6 @@
 	return wstats;
 }
 
-static void wl3501_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
-{
-	strlcpy(info->driver, wl3501_dev_info, sizeof(info->driver));
-}
-
-static const struct ethtool_ops ops = {
-	.get_drvinfo = wl3501_get_drvinfo
-};
-
 /**
  * wl3501_detach - deletes a driver "instance"
  * @link - FILL_IN
@@ -1936,7 +1926,6 @@
 	this->p_dev = p_dev;
 	dev->wireless_data	= &this->wireless_data;
 	dev->wireless_handlers	= &wl3501_handler_def;
-	SET_ETHTOOL_OPS(dev, &ops);
 	netif_stop_queue(dev);
 	p_dev->priv = p_dev->irq.Instance = dev;
 
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 1a11d95..3f71199 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -42,6 +42,7 @@
 #include <linux/mm.h>
 #include <net/ip.h>
 
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 #include <xen/events.h>
 #include <xen/page.h>
@@ -53,19 +54,36 @@
 
 static const struct ethtool_ops xennet_ethtool_ops;
 
+static int use_smartpoll = 0;
+module_param(use_smartpoll, int, 0600);
+MODULE_PARM_DESC(use_smartpoll, "Use smartpoll mechanism if available");
+
 struct netfront_cb {
 	struct page *page;
 	unsigned offset;
 };
 
+#define MICRO_SECOND 1000000UL
+#define NANO_SECOND 1000000000UL
+#define DEFAULT_SMART_POLL_FREQ   1000UL
+
+struct netfront_smart_poll {
+	struct hrtimer timer;
+	struct net_device *netdev;
+	unsigned int smart_poll_freq;
+	unsigned int feature_smart_poll;
+	unsigned int active;
+	unsigned long counter;
+};
+
 #define NETFRONT_SKB_CB(skb)	((struct netfront_cb *)((skb)->cb))
 
 #define RX_COPY_THRESHOLD 256
 
 #define GRANT_INVALID_REF	0
 
-#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
-#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
+#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
 #define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
 
 struct netfront_info {
@@ -104,7 +122,7 @@
 
 	/* Receive-ring batched refills. */
 #define RX_MIN_TARGET 8
-#define RX_DFL_MIN_TARGET 64
+#define RX_DFL_MIN_TARGET 80
 #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
 	unsigned rx_min_target, rx_max_target, rx_target;
 	struct sk_buff_head rx_batch;
@@ -118,6 +136,8 @@
 	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
 	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
 	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+
+	struct netfront_smart_poll smart_poll;
 };
 
 struct netfront_rx_info {
@@ -337,15 +357,17 @@
 	return 0;
 }
 
-static void xennet_tx_buf_gc(struct net_device *dev)
+static int xennet_tx_buf_gc(struct net_device *dev)
 {
 	RING_IDX cons, prod;
+	RING_IDX cons_begin, cons_end;
 	unsigned short id;
 	struct netfront_info *np = netdev_priv(dev);
 	struct sk_buff *skb;
 
 	BUG_ON(!netif_carrier_ok(dev));
 
+	cons_begin = np->tx.rsp_cons;
 	do {
 		prod = np->tx.sring->rsp_prod;
 		rmb(); /* Ensure we see responses up to 'rp'. */
@@ -390,7 +412,11 @@
 		mb();		/* update shared area */
 	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
 
+	cons_end = np->tx.rsp_cons;
+
 	xennet_maybe_wake_tx(dev);
+
+	return (cons_begin == cons_end);
 }
 
 static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
@@ -1267,6 +1293,14 @@
 	info->rx.sring = NULL;
 }
 
+static int netfront_suspend(struct xenbus_device *dev, pm_message_t state)
+{
+	struct netfront_info *info = dev_get_drvdata(&dev->dev);
+	struct hrtimer *timer = &info->smart_poll.timer;
+	hrtimer_cancel(timer);
+	return 0;
+}
+
 /**
  * We are reconnecting to the backend, due to a suspend/resume, or a backend
  * driver restart.  We tear down our netif structure and recreate it, but
@@ -1305,6 +1339,59 @@
 	return 0;
 }
 
+static enum hrtimer_restart smart_poll_function(struct hrtimer *timer)
+{
+	struct netfront_smart_poll *psmart_poll;
+	struct net_device *dev;
+	struct netfront_info *np;
+	unsigned long flags;
+	unsigned int tx_active = 0, rx_active = 0;
+
+	psmart_poll = container_of(timer, struct netfront_smart_poll, timer);
+	dev = psmart_poll->netdev;
+	np = netdev_priv(dev);
+
+	spin_lock_irqsave(&np->tx_lock, flags);
+
+	if (!np->rx.sring)
+		goto end;
+
+	np->smart_poll.counter++;
+
+	if (likely(netif_carrier_ok(dev))) {
+		tx_active = !(xennet_tx_buf_gc(dev));
+		/* Under tx_lock: protects access to rx shared-ring indexes. */
+		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
+			rx_active = 1;
+			napi_schedule(&np->napi);
+		}
+	}
+
+	np->smart_poll.active |= (tx_active || rx_active);
+	if (np->smart_poll.counter %
+			(np->smart_poll.smart_poll_freq / 10) == 0) {
+		if (!np->smart_poll.active) {
+			np->rx.sring->private.netif.smartpoll_active = 0;
+			goto end;
+		}
+		np->smart_poll.active = 0;
+	}
+
+	if (np->rx.sring->private.netif.smartpoll_active) {
+		if (hrtimer_start(timer,
+			ktime_set(0, NANO_SECOND/psmart_poll->smart_poll_freq),
+			HRTIMER_MODE_REL)) {
+			printk(KERN_DEBUG "Failed to start hrtimer, "
+					"using interrupt mode for this packet\n");
+			np->rx.sring->private.netif.smartpoll_active = 0;
+		}
+	}
+
+end:
+	spin_unlock_irqrestore(&np->tx_lock, flags);
+	return HRTIMER_NORESTART;
+}
+
 static irqreturn_t xennet_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = dev_id;
@@ -1320,6 +1407,16 @@
 			napi_schedule(&np->napi);
 	}
 
+	if (np->smart_poll.feature_smart_poll) {
+		if (hrtimer_start(&np->smart_poll.timer,
+			ktime_set(0, NANO_SECOND/np->smart_poll.smart_poll_freq),
+			HRTIMER_MODE_REL)) {
+			printk(KERN_DEBUG "Failed to start hrtimer, "
+					"using interrupt mode for this packet\n");
+			np->rx.sring->private.netif.smartpoll_active = 0;
+		}
+	}
+
 	spin_unlock_irqrestore(&np->tx_lock, flags);
 
 	return IRQ_HANDLED;
@@ -1393,7 +1490,7 @@
 }
 
 /* Common code used when first setting up, and when resuming. */
-static int talk_to_backend(struct xenbus_device *dev,
+static int talk_to_netback(struct xenbus_device *dev,
 			   struct netfront_info *info)
 {
 	const char *message;
@@ -1456,6 +1553,12 @@
 		goto abort_transaction;
 	}
 
+	err = xenbus_printf(xbt, dev->nodename, "feature-smart-poll", "%d", use_smartpoll);
+	if (err) {
+		message = "writing feature-smart-poll";
+		goto abort_transaction;
+	}
+
 	err = xenbus_transaction_end(xbt, 0);
 	if (err) {
 		if (err == -EAGAIN)
@@ -1543,7 +1646,26 @@
 		return -ENODEV;
 	}
 
-	err = talk_to_backend(np->xbdev, np);
+	np->smart_poll.feature_smart_poll = 0;
+	if (use_smartpoll) {
+		err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+				   "feature-smart-poll", "%u",
+				   &np->smart_poll.feature_smart_poll);
+		if (err != 1)
+			np->smart_poll.feature_smart_poll = 0;
+	}
+
+	hrtimer_init(&np->smart_poll.timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	if (np->smart_poll.feature_smart_poll) {
+		np->smart_poll.timer.function = smart_poll_function;
+		np->smart_poll.netdev = dev;
+		np->smart_poll.smart_poll_freq = DEFAULT_SMART_POLL_FREQ;
+		np->smart_poll.active = 0;
+		np->smart_poll.counter = 0;
+	}
+
+	err = talk_to_netback(np->xbdev, np);
 	if (err)
 		return err;
 
@@ -1597,7 +1719,7 @@
 /**
  * Callback received when the backend's state changes.
  */
-static void backend_changed(struct xenbus_device *dev,
+static void netback_changed(struct xenbus_device *dev,
 			    enum xenbus_state backend_state)
 {
 	struct netfront_info *np = dev_get_drvdata(&dev->dev);
@@ -1608,6 +1730,8 @@
 	switch (backend_state) {
 	case XenbusStateInitialising:
 	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
 	case XenbusStateConnected:
 	case XenbusStateUnknown:
 	case XenbusStateClosed:
@@ -1628,12 +1752,30 @@
 	}
 }
 
+static int xennet_get_coalesce(struct net_device *netdev,
+			       struct ethtool_coalesce *ec)
+{
+	struct netfront_info *np = netdev_priv(netdev);
+	ec->rx_coalesce_usecs = MICRO_SECOND / np->smart_poll.smart_poll_freq;
+	return 0;
+}
+
+static int xennet_set_coalesce(struct net_device *netdev,
+		struct ethtool_coalesce *ec)
+{
+	struct netfront_info *np = netdev_priv(netdev);
+	np->smart_poll.smart_poll_freq = MICRO_SECOND / ec->rx_coalesce_usecs;
+	return 0;
+}
+
 static const struct ethtool_ops xennet_ethtool_ops =
 {
 	.set_tx_csum = ethtool_op_set_tx_csum,
 	.set_sg = xennet_set_sg,
 	.set_tso = xennet_set_tso,
 	.get_link = ethtool_op_get_link,
+	.get_coalesce = xennet_get_coalesce,
+	.set_coalesce = xennet_set_coalesce,
 };
 
 #ifdef CONFIG_SYSFS
@@ -1798,8 +1940,9 @@
 	.ids = netfront_ids,
 	.probe = netfront_probe,
 	.remove = __devexit_p(xennet_remove),
+	.suspend = netfront_suspend,
 	.resume = netfront_resume,
-	.otherend_changed = backend_changed,
+	.otherend_changed = netback_changed,
 };
 
 static int __init netif_init(void)
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index fdc864f..7802fcd 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -51,6 +51,16 @@
 
 	  When in doubt, say N.
 
+config XEN_PCIDEV_FRONTEND
+	tristate "Xen PCI Frontend"
+	depends on XEN && PCI && X86
+	select HOTPLUG
+	select XEN_XENBUS_FRONTEND
+	default y
+	help
+	  The PCI device frontend driver allows the kernel to import arbitrary
+	  PCI devices from a PCI backend to support PCI driver domains.
+
 config HT_IRQ
 	bool "Interrupts on hypertransport devices"
 	default y
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 4a7f11d..b70aa4d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,8 @@
 # Build Intel IOMMU support
 obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
 
+# Build Xen IOMMU support
+obj-$(CONFIG_PCI_XEN) += xen-iommu.o
 obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
 
 obj-$(CONFIG_PCI_IOV) += iov.o
@@ -60,6 +62,8 @@
 
 obj-$(CONFIG_PCI_STUB) += pci-stub.o
 
+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
+
 ifeq ($(CONFIG_PCI_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
 endif
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index cef28a7..1940183 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -249,6 +249,7 @@
 	up_read(&pci_bus_sem);
 }
 
+EXPORT_SYMBOL_GPL(pci_walk_bus);
 EXPORT_SYMBOL(pci_bus_alloc_resource);
 EXPORT_SYMBOL_GPL(pci_bus_add_device);
 EXPORT_SYMBOL(pci_bus_add_devices);
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 91d0390..24f6f28 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -673,10 +673,13 @@
 			       "x2apic and Intr-remapping.\n");
 #endif
 #ifdef CONFIG_DMAR
-		if (ret && !no_iommu && !iommu_detected && !swiotlb &&
-		    !dmar_disabled)
+		if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
 			iommu_detected = 1;
 #endif
+#ifdef CONFIG_X86
+		if (ret)
+			x86_init.iommu.iommu_init = intel_iommu_init;
+#endif
 	}
 	early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
 	dmar_tbl = NULL;
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 73e7d8e..1ebd547 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -1018,6 +1018,13 @@
 	list_for_each_entry(func, &slot->funcs, sibling)
 		acpiphp_configure_ioapics(func->handle);
 	pci_enable_bridges(bus);
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		/* Assume that newly added devices are powered on already. */
+		if (!dev->is_added)
+			dev->current_state = PCI_D0;
+	}
+
 	pci_bus_add_devices(bus);
 
 	list_for_each (l, &slot->funcs) {
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 5b680df..02bf548 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3284,7 +3284,7 @@
 	 * Check the need for DMA-remapping initialization now.
 	 * Above initialization will also be used by Interrupt-remapping.
 	 */
-	if (no_iommu || swiotlb || dmar_disabled)
+	if (no_iommu || dmar_disabled)
 		return -ENODEV;
 
 	iommu_init_mempool();
@@ -3305,7 +3305,9 @@
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
 	init_timer(&unmap_timer);
-	force_iommu = 1;
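+	/* With the hardware IOMMU active the software bounce buffer is
+	 * no longer needed, so disable it. */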
+#ifdef CONFIG_SWIOTLB
+	swiotlb = 0;
+#endif
 	dma_ops = &intel_dma_ops;
 
 	init_iommu_sysfs();
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index e03fe98..f9db891 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -706,6 +706,21 @@
 }
 EXPORT_SYMBOL_GPL(pci_sriov_migration);
 
+/**
+ * pci_num_vf - return number of VFs associated with a PF
+ * @dev: the PCI device
+ *
+ * Returns number of VFs, or 0 if SR-IOV is not enabled.
+ */
+int pci_num_vf(struct pci_dev *dev)
+{
+	if (!dev || !dev->is_physfn)
+		return 0;
+	else
+		return dev->sriov->nr_virtfn;
+}
+EXPORT_SYMBOL_GPL(pci_num_vf);
+
 static int ats_alloc_one(struct pci_dev *dev, int ps)
 {
 	int pos;
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 0fb1d05..c7e8a69 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -19,6 +19,9 @@
 #include <linux/errno.h>
 #include <linux/io.h>
 
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
 #include "pci.h"
 #include "msi.h"
 
@@ -391,6 +394,20 @@
 
 void pci_restore_msi_state(struct pci_dev *dev)
 {
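+	/* In a Xen initial domain the hypervisor owns the MSI setup, so
+	 * ask it to restore the state instead of rewriting config space
+	 * ourselves. */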
+	if (xen_initial_domain()) {
+		struct physdev_restore_msi physdev;
+
+		if (!dev->msi_enabled && !dev->msix_enabled)
+			return;
+
+		pci_intx_for_msi(dev, 0);
+
+		physdev.bus = dev->bus->number;
+		physdev.devfn = dev->devfn;
+		HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
+
+		return;
+	}
 	__pci_restore_msi_state(dev);
 	__pci_restore_msix_state(dev);
 }
diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
new file mode 100644
index 0000000..ac6bcdb
--- /dev/null
+++ b/drivers/pci/xen-iommu.c
@@ -0,0 +1,271 @@
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/scatterlist.h>
+#include <linux/io.h>
+#include <linux/bug.h>
+
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include <asm/iommu.h>
+#include <asm/swiotlb.h>
+#include <asm/tlbflush.h>
+
+#define IOMMU_BUG_ON(test)				\
+do {							\
+	if (unlikely(test)) {				\
+		printk(KERN_ALERT "Fatal DMA error! "	\
+		       "Please use 'swiotlb=force'\n");	\
+		BUG();					\
+	}						\
+} while (0)
+
+/* Print address range with message */
+#define PAR(msg, addr, size)					\
+do {							\
+	printk(msg "[%#llx - %#llx]\n",			\
+	(unsigned long long)addr,			\
+	(unsigned long long)addr + size);		\
+} while (0)
+
+static inline int address_needs_mapping(struct device *hwdev,
+						dma_addr_t addr)
+{
+	dma_addr_t mask = DMA_BIT_MASK(32);
+	int ret;
+
+	/* If the device has a mask, use it, otherwise default to 32 bits */
+	if (hwdev)
+		mask = *hwdev->dma_mask;
+
+	ret = (addr & ~mask) != 0;
+
+	if (ret) {
+		printk(KERN_ERR "dma address needs mapping\n");
+		printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
+	}
+	return ret;
+}
+
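+/* Return 1 if the machine frames backing the pfn range that covers
+ * offset..offset+length are contiguous, so the buffer can be handed to
+ * a device as a single DMA range. */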
+static int check_pages_physically_contiguous(unsigned long pfn,
+					     unsigned int offset,
+					     size_t length)
+{
+	unsigned long next_mfn;
+	int i;
+	int nr_pages;
+
+	next_mfn = pfn_to_mfn(pfn);
+	nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+	for (i = 1; i < nr_pages; i++) {
+		if (pfn_to_mfn(++pfn) != ++next_mfn)
+			return 0;
+	}
+	return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+	unsigned long pfn = PFN_DOWN(p);
+	unsigned int offset = p & ~PAGE_MASK;
+
+	if (offset + size <= PAGE_SIZE)
+		return 0;
+	if (check_pages_physically_contiguous(pfn, offset, size))
+		return 0;
+	return 1;
+}
+
+static inline void xen_dma_unmap_page(struct page *page)
+{
+	/* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
+	 * to deal with foreign pages.  We'll need similar logic here at
+	 * some point.
+	 */
+}
+
+/* Gets dma address of a page */
+static inline dma_addr_t xen_dma_map_page(struct page *page)
+{
+	/* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal
+	 * with foreign pages.  We'll need similar logic here at some
+	 * point.
+	 */
+	return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT;
+}
+
+static int xen_map_sg(struct device *hwdev, struct scatterlist *sg,
+		      int nents,
+		      enum dma_data_direction direction,
+		      struct dma_attrs *attrs)
+{
+	struct scatterlist *s;
+	struct page *page;
+	int i, rc;
+
+	BUG_ON(direction == DMA_NONE);
+	WARN_ON(nents == 0 || sg[0].length == 0);
+
+	for_each_sg(sg, s, nents, i) {
+		BUG_ON(!sg_page(s));
+		page = sg_page(s);
+		s->dma_address = xen_dma_map_page(page) + s->offset;
+		s->dma_length = s->length;
+		IOMMU_BUG_ON(range_straddles_page_boundary(
+				page_to_phys(page), s->length));
+	}
+
+	rc = nents;
+
+	flush_write_buffers();
+	return rc;
+}
+
+static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+			 int nents,
+			 enum dma_data_direction direction,
+			 struct dma_attrs *attrs)
+{
+	struct scatterlist *s;
+	struct page *page;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address)));
+		xen_dma_unmap_page(page);
+	}
+}
+
+static void *xen_alloc_coherent(struct device *dev, size_t size,
+				dma_addr_t *dma_handle, gfp_t gfp)
+{
+	void *ret;
+	unsigned int order = get_order(size);
+	unsigned long vstart;
+	u64 mask;
+
+	/* ignore region specifiers */
+	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
+		return ret;
+
+	if (dev == NULL || (dev->coherent_dma_mask < DMA_BIT_MASK(32)))
+		gfp |= GFP_DMA;
+
+	vstart = __get_free_pages(gfp, order);
+	ret = (void *)vstart;
+
+	if (dev != NULL && dev->coherent_dma_mask)
+		mask = dev->coherent_dma_mask;
+	else
+		mask = DMA_BIT_MASK(32);
+
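+	/* Exchange the allocation for memory that is machine-contiguous
+	 * and addressable within the device's coherent DMA mask. */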
+	if (ret != NULL) {
+		if (xen_create_contiguous_region(vstart, order,
+						 fls64(mask)) != 0) {
+			free_pages(vstart, order);
+			return NULL;
+		}
+		memset(ret, 0, size);
+		*dma_handle = virt_to_machine(ret).maddr;
+	}
+	return ret;
+}
+
+static void xen_free_coherent(struct device *dev, size_t size,
+			      void *vaddr, dma_addr_t dma_addr)
+{
+	int order = get_order(size);
+
+	if (dma_release_from_coherent(dev, order, vaddr))
+		return;
+
+	xen_destroy_contiguous_region((unsigned long)vaddr, order);
+	free_pages((unsigned long)vaddr, order);
+}
+
+static dma_addr_t xen_map_page(struct device *dev, struct page *page,
+			       unsigned long offset, size_t size,
+			       enum dma_data_direction direction,
+			       struct dma_attrs *attrs)
+{
+	dma_addr_t dma;
+
+	BUG_ON(direction == DMA_NONE);
+
+	WARN_ON(size == 0);
+
+	dma = xen_dma_map_page(page) + offset;
+
+	IOMMU_BUG_ON(address_needs_mapping(dev, dma));
+	flush_write_buffers();
+	return dma;
+}
+
+static void xen_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			   size_t size,
+			   enum dma_data_direction direction,
+			   struct dma_attrs *attrs)
+{
+	BUG_ON(direction == DMA_NONE);
+	xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr))));
+}
+
+static struct dma_map_ops xen_dma_ops = {
+	.dma_supported = NULL,
+
+	.alloc_coherent = xen_alloc_coherent,
+	.free_coherent = xen_free_coherent,
+
+	.map_page = xen_map_page,
+	.unmap_page = xen_unmap_page,
+
+	.map_sg = xen_map_sg,
+	.unmap_sg = xen_unmap_sg,
+
+	.mapping_error = NULL,
+
+	.is_phys = 0,
+};
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+	.dma_supported = swiotlb_dma_supported,
+
+	.alloc_coherent = xen_alloc_coherent,
+	.free_coherent = xen_free_coherent,
+
+	.map_page = swiotlb_map_page,
+	.unmap_page = swiotlb_unmap_page,
+
+	.map_sg = swiotlb_map_sg_attrs,
+	.unmap_sg = swiotlb_unmap_sg_attrs,
+
+	.mapping_error = swiotlb_dma_mapping_error,
+
+	.is_phys = 0,
+};
+
+void __init xen_iommu_init(void)
+{
+	if (!xen_pv_domain())
+		return;
+
+	printk(KERN_INFO "Xen: Initializing Xen DMA ops\n");
+
+	force_iommu = 0;
+	dma_ops = &xen_dma_ops;
+
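+	/* If a software IO TLB was set up, prefer it so that DMA to
+	 * machine addresses the device cannot reach gets bounced. */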
+	if (swiotlb) {
+		printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n");
+		dma_ops = &xen_swiotlb_dma_ops;
+	}
+}
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
new file mode 100644
index 0000000..76d0bdd
--- /dev/null
+++ b/drivers/pci/xen-pcifront.c
@@ -0,0 +1,1157 @@
+/*
+ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/pciif.h>
+#include <asm/xen/pci.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/time.h>
+
+
+#ifndef __init_refok
+#define __init_refok
+#endif
+
+#define INVALID_GRANT_REF (0)
+#define INVALID_EVTCHN    (-1)
+
+
+struct pci_bus_entry {
+	struct list_head list;
+	struct pci_bus *bus;
+};
+
+#define _PDEVB_op_active		(0)
+#define PDEVB_op_active 		(1 << (_PDEVB_op_active))
+
+struct pcifront_device {
+	struct xenbus_device *xdev;
+	struct list_head root_buses;
+
+	int evtchn;
+	int gnt_ref;
+
+	int irq;
+
+	/* Lock this when doing any operations in sh_info */
+	spinlock_t sh_info_lock;
+	struct xen_pci_sharedinfo *sh_info;
+	struct work_struct op_work;
+	unsigned long flags;
+
+};
+
+struct pcifront_sd {
+	int domain;
+	struct pcifront_device *pdev;
+};
+
+static inline struct pcifront_device *
+pcifront_get_pdev(struct pcifront_sd *sd)
+{
+	return sd->pdev;
+}
+
+static inline void pcifront_init_sd(struct pcifront_sd *sd,
+				    unsigned int domain, unsigned int bus,
+				    struct pcifront_device *pdev)
+{
+	sd->domain = domain;
+	sd->pdev = pdev;
+}
+
+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
+						 struct pcifront_sd *sd)
+{
+}
+
+
+static DEFINE_SPINLOCK(pcifront_dev_lock);
+static struct pcifront_device *pcifront_dev;
+
+static int verbose_request;
+module_param(verbose_request, int, 0644);
+
+static int errno_to_pcibios_err(int errno)
+{
+	switch (errno) {
+	case XEN_PCI_ERR_success:
+		return PCIBIOS_SUCCESSFUL;
+
+	case XEN_PCI_ERR_dev_not_found:
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	case XEN_PCI_ERR_invalid_offset:
+	case XEN_PCI_ERR_op_failed:
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+
+	case XEN_PCI_ERR_not_implemented:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+
+	case XEN_PCI_ERR_access_denied:
+		return PCIBIOS_SET_FAILED;
+	}
+	return errno;
+}
+
+static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev)
+{
+	if (test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+		&& !test_and_set_bit(_PDEVB_op_active, &pdev->flags)) {
+		dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n");
+		schedule_work(&pdev->op_work);
+	}
+}
+
+static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
+{
+	int err = 0;
+	struct xen_pci_op *active_op = &pdev->sh_info->op;
+	unsigned long irq_flags;
+	evtchn_port_t port = pdev->evtchn;
+	unsigned irq = pdev->irq;
+	s64 ns, ns_timeout;
+	struct timeval tv;
+
+	spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
+
+	memcpy(active_op, op, sizeof(struct xen_pci_op));
+
+	/* Go */
+	wmb();
+	set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+	notify_remote_via_evtchn(port);
+
+	/*
+	 * We set a poll timeout of 3 seconds but give up on return after
+	 * 2 seconds. It is better to time out too late rather than too early
+	 * (in the latter case we end up continually re-executing poll() with a
+	 * timeout in the past). 1s difference gives plenty of slack for error.
+	 */
+	do_gettimeofday(&tv);
+	ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+
+	xen_clear_irq_pending(irq);
+
+	while (test_bit(_XEN_PCIF_active,
+			(unsigned long *)&pdev->sh_info->flags)) {
+		xen_poll_irq_timeout(irq, jiffies + 3*HZ);
+		xen_clear_irq_pending(irq);
+		do_gettimeofday(&tv);
+		ns = timeval_to_ns(&tv);
+		if (ns > ns_timeout) {
+			dev_err(&pdev->xdev->dev,
+				"pciback not responding!!!\n");
+			clear_bit(_XEN_PCIF_active,
+				  (unsigned long *)&pdev->sh_info->flags);
+			err = XEN_PCI_ERR_dev_not_found;
+			goto out;
+		}
+	}
+
+	/*
+	* We might lose backend service request since we
+	* reuse same evtchn with pci_conf backend response. So re-schedule
+	* aer pcifront service.
+	*/
+	if (test_bit(_XEN_PCIB_active,
+			(unsigned long *)&pdev->sh_info->flags)) {
+		dev_err(&pdev->xdev->dev,
+			"schedule aer pcifront service\n");
+		schedule_pcifront_aer_op(pdev);
+	}
+
+	memcpy(op, active_op, sizeof(struct xen_pci_op));
+
+	err = op->err;
+out:
+	spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
+	return err;
+}
+
+/* Access to this function is spinlocked in drivers/pci/access.c */
+static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
+			     int where, int size, u32 *val)
+{
+	int err = 0;
+	struct xen_pci_op op = {
+		.cmd    = XEN_PCI_OP_conf_read,
+		.domain = pci_domain_nr(bus),
+		.bus    = bus->number,
+		.devfn  = devfn,
+		.offset = where,
+		.size   = size,
+	};
+	struct pcifront_sd *sd = bus->sysdata;
+	struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+	if (verbose_request)
+		dev_info(&pdev->xdev->dev,
+			 "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
+			 pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
+			 PCI_FUNC(devfn), where, size);
+
+	err = do_pci_op(pdev, &op);
+
+	if (likely(!err)) {
+		if (verbose_request)
+			dev_info(&pdev->xdev->dev, "read got back value %x\n",
+				 op.value);
+
+		*val = op.value;
+	} else if (err == -ENODEV) {
+		/* No device here, pretend that it just returned 0 */
+		err = 0;
+		*val = 0;
+	}
+
+	return errno_to_pcibios_err(err);
+}
+
+/* Access to this function is spinlocked in drivers/pci/access.c */
+static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
+			      int where, int size, u32 val)
+{
+	struct xen_pci_op op = {
+		.cmd    = XEN_PCI_OP_conf_write,
+		.domain = pci_domain_nr(bus),
+		.bus    = bus->number,
+		.devfn  = devfn,
+		.offset = where,
+		.size   = size,
+		.value  = val,
+	};
+	struct pcifront_sd *sd = bus->sysdata;
+	struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+	if (verbose_request)
+		dev_info(&pdev->xdev->dev,
+			 "write dev=%04x:%02x:%02x.%01x - "
+			 "offset %x size %d val %x\n",
+			 pci_domain_nr(bus), bus->number,
+			 PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
+
+	return errno_to_pcibios_err(do_pci_op(pdev, &op));
+}
+
+struct pci_ops pcifront_bus_ops = {
+	.read = pcifront_bus_read,
+	.write = pcifront_bus_write,
+};
+
+#ifdef CONFIG_PCI_MSI
+static int pci_frontend_enable_msix(struct pci_dev *dev,
+				    int **vector, int nvec)
+{
+	int err;
+	int i;
+	struct xen_pci_op op = {
+		.cmd    = XEN_PCI_OP_enable_msix,
+		.domain = pci_domain_nr(dev->bus),
+		.bus = dev->bus->number,
+		.devfn = dev->devfn,
+		.value = nvec,
+	};
+	struct pcifront_sd *sd = dev->bus->sysdata;
+	struct pcifront_device *pdev = pcifront_get_pdev(sd);
+	struct msi_desc *entry;
+
+	if (nvec > SH_INFO_MAX_VEC) {
+		dev_err(&dev->dev, "too much vector for pci frontend: %x."
+				   " Increase SH_INFO_MAX_VEC.\n", nvec);
+		return -EINVAL;
+	}
+
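+	/* Pass the requested MSI-X entries to the backend; it fills in
+	 * the vectors it allocated in op.msix_entries. */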
+	i = 0;
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		op.msix_entries[i].entry = entry->msi_attrib.entry_nr;
+		/* Vector is useless at this point. */
+		op.msix_entries[i].vector = -1;
+		i++;
+	}
+
+	err = do_pci_op(pdev, &op);
+
+	if (likely(!err)) {
+		if (likely(!op.value)) {
+			/* we get the result */
+			for (i = 0; i < nvec; i++)
+				*(*vector+i) = op.msix_entries[i].vector;
+			return 0;
+		} else {
+			printk(KERN_DEBUG "enable msix get value %x\n",
+				op.value);
+			return op.value;
+		}
+	} else {
+		dev_err(&dev->dev, "enable msix get err %x\n", err);
+		return err;
+	}
+}
+
+static void pci_frontend_disable_msix(struct pci_dev *dev)
+{
+	int err;
+	struct xen_pci_op op = {
+		.cmd    = XEN_PCI_OP_disable_msix,
+		.domain = pci_domain_nr(dev->bus),
+		.bus = dev->bus->number,
+		.devfn = dev->devfn,
+	};
+	struct pcifront_sd *sd = dev->bus->sysdata;
+	struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+	err = do_pci_op(pdev, &op);
+
+	/* What should do for error ? */
+	if (err)
+		dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
+}
+
+static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+{
+	int err;
+	struct xen_pci_op op = {
+		.cmd    = XEN_PCI_OP_enable_msi,
+		.domain = pci_domain_nr(dev->bus),
+		.bus = dev->bus->number,
+		.devfn = dev->devfn,
+	};
+	struct pcifront_sd *sd = dev->bus->sysdata;
+	struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+	err = do_pci_op(pdev, &op);
+	if (likely(!err)) {
+		*(*vector) = op.value;
+	} else {
+		dev_err(&dev->dev, "pci frontend enable msi failed for dev "
+				   "%x:%x \n", op.bus, op.devfn);
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static void pci_frontend_disable_msi(struct pci_dev *dev)
+{
+	int err;
+	struct xen_pci_op op = {
+		.cmd    = XEN_PCI_OP_disable_msi,
+		.domain = pci_domain_nr(dev->bus),
+		.bus = dev->bus->number,
+		.devfn = dev->devfn,
+	};
+	struct pcifront_sd *sd = dev->bus->sysdata;
+	struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+	err = do_pci_op(pdev, &op);
+	if (err == XEN_PCI_ERR_dev_not_found) {
+		/* XXX No response from backend, what shall we do? */
+		printk(KERN_DEBUG "get no response from backend for disable MSI\n");
+		return;
+	}
+	if (err)
+		/* How should pciback notify us of a failure? */
+		printk(KERN_DEBUG "got fake response from backend\n");
+}
+
+static struct xen_pci_frontend_ops pci_frontend_ops = {
+	.enable_msi = pci_frontend_enable_msi,
+	.disable_msi = pci_frontend_disable_msi,
+	.enable_msix = pci_frontend_enable_msix,
+	.disable_msix = pci_frontend_disable_msix,
+};
+
+static void pci_frontend_registrar(int enable)
+{
+	if (enable)
+		xen_pci_frontend = &pci_frontend_ops;
+	else
+		xen_pci_frontend = NULL;
+}
+#else
+static inline void pci_frontend_registrar(int enable) { }
+#endif /* CONFIG_PCI_MSI */
+
+/* Claim resources for the PCI frontend as-is, backend won't allow changes */
+static int pcifront_claim_resource(struct pci_dev *dev, void *data)
+{
+	struct pcifront_device *pdev = data;
+	int i;
+	struct resource *r;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		r = &dev->resource[i];
+
+		if (!r->parent && r->start && r->flags) {
+			dev_info(&pdev->xdev->dev, "claiming resource %s/%d\n",
+				pci_name(dev), i);
+			if (pci_claim_resource(dev, i)) {
+				dev_err(&pdev->xdev->dev, "Could not claim "
+					"resource %s/%d! Device offline. Try "
+					"giving less than 4GB to domain.\n",
+					pci_name(dev), i);
+			}
+		}
+	}
+
+	return 0;
+}
+
+int __devinit pcifront_scan_bus(struct pcifront_device *pdev,
+				unsigned int domain, unsigned int bus,
+				struct pci_bus *b)
+{
+	struct pci_dev *d;
+	unsigned int devfn;
+
+	/* Scan the bus for functions and add.
+	 * We omit handling of PCI bridge attachment because pciback prevents
+	 * bridges from being exported.
+	 */
+	for (devfn = 0; devfn < 0x100; devfn++) {
+		d = pci_get_slot(b, devfn);
+		if (d) {
+			/* Device is already known. */
+			pci_dev_put(d);
+			continue;
+		}
+
+		d = pci_scan_single_device(b, devfn);
+		if (d)
+			dev_info(&pdev->xdev->dev, "New device on "
+				 "%04x:%02x:%02x.%02x found.\n", domain, bus,
+				 PCI_SLOT(devfn), PCI_FUNC(devfn));
+	}
+
+	return 0;
+}
+
+int __devinit pcifront_scan_root(struct pcifront_device *pdev,
+				 unsigned int domain, unsigned int bus)
+{
+	struct pci_bus *b;
+	struct pcifront_sd *sd = NULL;
+	struct pci_bus_entry *bus_entry = NULL;
+	int err = 0;
+
+#ifndef CONFIG_PCI_DOMAINS
+	if (domain != 0) {
+		dev_err(&pdev->xdev->dev,
+			"PCI Root in non-zero PCI Domain! domain=%d\n", domain);
+		dev_err(&pdev->xdev->dev,
+			"Please compile with CONFIG_PCI_DOMAINS\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+#endif
+
+	dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
+		 domain, bus);
+
+	bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
+	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	if (!bus_entry || !sd) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	pcifront_init_sd(sd, domain, bus, pdev);
+
+	b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
+				  &pcifront_bus_ops, sd);
+	if (!b) {
+		dev_err(&pdev->xdev->dev,
+			"Error creating PCI Frontend Bus!\n");
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	pcifront_setup_root_resources(b, sd);
+	bus_entry->bus = b;
+
+	list_add(&bus_entry->list, &pdev->root_buses);
+
+	/* pci_scan_bus_parented skips devices which do not have a
+	 * devfn == 0. pcifront_scan_bus enumerates all devfns. */
+	err = pcifront_scan_bus(pdev, domain, bus, b);
+
+	/* Claim resources before going "live" with our devices */
+	pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+	/* Create SysFS and notify udev of the devices. Aka: "going live" */
+	pci_bus_add_devices(b);
+
+	return err;
+
+err_out:
+	kfree(bus_entry);
+	kfree(sd);
+
+	return err;
+}
+
+int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
+				   unsigned int domain, unsigned int bus)
+{
+	int err;
+	struct pci_bus *b;
+
+#ifndef CONFIG_PCI_DOMAINS
+	if (domain != 0) {
+		dev_err(&pdev->xdev->dev,
+			"PCI Root in non-zero PCI Domain! domain=%d\n", domain);
+		dev_err(&pdev->xdev->dev,
+			"Please compile with CONFIG_PCI_DOMAINS\n");
+		return -EINVAL;
+	}
+#endif
+
+	dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
+		 domain, bus);
+
+	b = pci_find_bus(domain, bus);
+	if (!b)
+		/* If the bus is unknown, create it. */
+		return pcifront_scan_root(pdev, domain, bus);
+
+	err = pcifront_scan_bus(pdev, domain, bus, b);
+
+	/* Claim resources before going "live" with our devices */
+	pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+	/* Create SysFS and notify udev of the devices. Aka: "going live" */
+	pci_bus_add_devices(b);
+
+	return err;
+}
+
+static void free_root_bus_devs(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	while (!list_empty(&bus->devices)) {
+		dev = container_of(bus->devices.next, struct pci_dev,
+				   bus_list);
+		dev_dbg(&dev->dev, "removing device\n");
+		pci_remove_bus_device(dev);
+	}
+}
+
+void pcifront_free_roots(struct pcifront_device *pdev)
+{
+	struct pci_bus_entry *bus_entry, *t;
+
+	dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
+
+	list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
+		list_del(&bus_entry->list);
+
+		free_root_bus_devs(bus_entry->bus);
+
+		kfree(bus_entry->bus->sysdata);
+
+		device_unregister(bus_entry->bus->bridge);
+		pci_remove_bus(bus_entry->bus);
+
+		kfree(bus_entry);
+	}
+}
+
+static pci_ers_result_t pcifront_common_process(int cmd,
+						struct pcifront_device *pdev,
+						pci_channel_state_t state)
+{
+	pci_ers_result_t result;
+	struct pci_driver *pdrv;
+	int bus = pdev->sh_info->aer_op.bus;
+	int devfn = pdev->sh_info->aer_op.devfn;
+	struct pci_dev *pcidev;
+	int flag = 0;
+
+	dev_dbg(&pdev->xdev->dev,
+		"pcifront AER process: cmd %x (bus:%x, devfn%x)",
+		cmd, bus, devfn);
+	result = PCI_ERS_RESULT_NONE;
+
+	pcidev = pci_get_bus_and_slot(bus, devfn);
+	if (!pcidev || !pcidev->driver) {
+		dev_err(&pdev->xdev->dev,
+			"device or driver is NULL\n");
+		return result;
+	}
+	pdrv = pcidev->driver;
+
+	if (get_driver(&pdrv->driver)) {
+		if (pdrv->err_handler && pdrv->err_handler->error_detected) {
+			dev_dbg(&pcidev->dev,
+				"trying to call AER service\n");
+			if (pcidev) {
+				flag = 1;
+				switch (cmd) {
+				case XEN_PCI_OP_aer_detected:
+					result = pdrv->err_handler->
+						 error_detected(pcidev, state);
+					break;
+				case XEN_PCI_OP_aer_mmio:
+					result = pdrv->err_handler->
+						 mmio_enabled(pcidev);
+					break;
+				case XEN_PCI_OP_aer_slotreset:
+					result = pdrv->err_handler->
+						 slot_reset(pcidev);
+					break;
+				case XEN_PCI_OP_aer_resume:
+					pdrv->err_handler->resume(pcidev);
+					break;
+				default:
+					dev_err(&pdev->xdev->dev,
+						"bad request in aer recovery "
+						"operation!\n");
+
+				}
+			}
+		}
+		put_driver(&pdrv->driver);
+	}
+	if (!flag)
+		result = PCI_ERS_RESULT_NONE;
+
+	return result;
+}
+
+
+void pcifront_do_aer(struct work_struct *data)
+{
+	struct pcifront_device *pdev =
+		container_of(data, struct pcifront_device, op_work);
+	int cmd = pdev->sh_info->aer_op.cmd;
+	pci_channel_state_t state =
+		(pci_channel_state_t)pdev->sh_info->aer_op.err;
+
+	/* If a pci_conf op is in progress, we have to wait until it is
+	 * done before servicing the AER op. */
+	dev_dbg(&pdev->xdev->dev,
+		"pcifront service aer bus %x devfn %x\n",
+		pdev->sh_info->aer_op.bus, pdev->sh_info->aer_op.devfn);
+
+	pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
+
+	wmb();
+	clear_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags);
+	notify_remote_via_evtchn(pdev->evtchn);
+
+	/* In case an AER request was lost in the short window above. */
+	smp_mb__before_clear_bit();
+	clear_bit(_PDEVB_op_active, &pdev->flags);
+	smp_mb__after_clear_bit();
+
+	schedule_pcifront_aer_op(pdev);
+
+}
+
+irqreturn_t pcifront_handler_aer(int irq, void *dev)
+{
+	struct pcifront_device *pdev = dev;
+	schedule_pcifront_aer_op(pdev);
+	return IRQ_HANDLED;
+}
+
+int pcifront_connect(struct pcifront_device *pdev)
+{
+	int err = 0;
+
+	spin_lock(&pcifront_dev_lock);
+
+	if (!pcifront_dev) {
+		dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
+		pcifront_dev = pdev;
+	} else {
+		dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
+		err = -EEXIST;
+	}
+
+	spin_unlock(&pcifront_dev_lock);
+
+	return err;
+}
+
+void pcifront_disconnect(struct pcifront_device *pdev)
+{
+	spin_lock(&pcifront_dev_lock);
+
+	if (pdev == pcifront_dev) {
+		dev_info(&pdev->xdev->dev,
+			 "Disconnecting PCI Frontend Buses\n");
+		pcifront_dev = NULL;
+	}
+
+	spin_unlock(&pcifront_dev_lock);
+}
+
+static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
+{
+	struct pcifront_device *pdev;
+
+	pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL);
+	if (pdev == NULL)
+		goto out;
+
+	pdev->sh_info =
+	    (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
+	if (pdev->sh_info == NULL) {
+		kfree(pdev);
+		pdev = NULL;
+		goto out;
+	}
+	pdev->sh_info->flags = 0;
+
+	/* Flag for registering the PV AER handler */
+	set_bit(_XEN_PCIB_AERHANDLER, (void *)&pdev->sh_info->flags);
+
+	dev_set_drvdata(&xdev->dev, pdev);
+	pdev->xdev = xdev;
+
+	INIT_LIST_HEAD(&pdev->root_buses);
+
+	spin_lock_init(&pdev->sh_info_lock);
+
+	pdev->evtchn = INVALID_EVTCHN;
+	pdev->gnt_ref = INVALID_GRANT_REF;
+	pdev->irq = -1;
+
+	INIT_WORK(&pdev->op_work, pcifront_do_aer);
+
+	dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
+		pdev, pdev->sh_info);
+out:
+	return pdev;
+}
+
+static void free_pdev(struct pcifront_device *pdev)
+{
+	dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
+
+	pcifront_free_roots(pdev);
+
+	/* For the PCIE_AER error handling job */
+	flush_scheduled_work();
+	unbind_from_irqhandler(pdev->irq, pdev);
+
+	if (pdev->evtchn != INVALID_EVTCHN)
+		xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
+
+	if (pdev->gnt_ref != INVALID_GRANT_REF)
+		gnttab_end_foreign_access(pdev->gnt_ref, 0 /* r/w page */,
+					  (unsigned long)pdev->sh_info);
+
+	dev_set_drvdata(&pdev->xdev->dev, NULL);
+	kfree(pdev);
+}
+
+static int pcifront_publish_info(struct pcifront_device *pdev)
+{
+	int err = 0;
+	struct xenbus_transaction trans;
+
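+	/* Grant the backend access to the shared info page and set up the
+	 * event channel used for config accesses and AER notifications. */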
+	err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
+	if (err < 0)
+		goto out;
+
+	pdev->gnt_ref = err;
+
+	err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
+	if (err)
+		goto out;
+
+	err = bind_evtchn_to_irqhandler(pdev->evtchn, pcifront_handler_aer,
+		0, "pcifront", pdev);
+	if (err < 0) {
+		xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
+		xenbus_dev_fatal(pdev->xdev, err, "Failed to bind evtchn to "
+				 "irqhandler.\n");
+		return err;
+	}
+	pdev->irq = err;
+
+do_publish:
+	err = xenbus_transaction_start(&trans);
+	if (err) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error writing configuration for backend "
+				 "(start transaction)");
+		goto out;
+	}
+
+	err = xenbus_printf(trans, pdev->xdev->nodename,
+			    "pci-op-ref", "%u", pdev->gnt_ref);
+	if (!err)
+		err = xenbus_printf(trans, pdev->xdev->nodename,
+				    "event-channel", "%u", pdev->evtchn);
+	if (!err)
+		err = xenbus_printf(trans, pdev->xdev->nodename,
+				    "magic", XEN_PCI_MAGIC);
+
+	if (err) {
+		xenbus_transaction_end(trans, 1);
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error writing configuration for backend");
+		goto out;
+	} else {
+		err = xenbus_transaction_end(trans, 0);
+		if (err == -EAGAIN)
+			goto do_publish;
+		else if (err) {
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error completing transaction "
+					 "for backend");
+			goto out;
+		}
+	}
+
+	xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+
+	dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
+
+out:
+	return err;
+}
+
+static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
+{
+	int err = -EFAULT;
+	int i, num_roots, len;
+	char str[64];
+	unsigned int domain, bus;
+
+
+	/* Only connect once */
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateInitialised)
+		goto out;
+
+	err = pcifront_connect(pdev);
+	if (err) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error connecting PCI Frontend");
+		goto out;
+	}
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+			   "root_num", "%d", &num_roots);
+	if (err == -ENOENT) {
+		xenbus_dev_error(pdev->xdev, err,
+				 "No PCI Roots found, trying 0000:00");
+		err = pcifront_scan_root(pdev, 0, 0);
+		num_roots = 0;
+	} else if (err != 1) {
+		if (err == 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of PCI roots");
+		goto out;
+	}
+
+	for (i = 0; i < num_roots; i++) {
+		len = snprintf(str, sizeof(str), "root-%d", i);
+		if (unlikely(len >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+				   "%x:%x", &domain, &bus);
+		if (err != 2) {
+			if (err >= 0)
+				err = -EINVAL;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error reading PCI root %d", i);
+			goto out;
+		}
+
+		err = pcifront_scan_root(pdev, domain, bus);
+		if (err) {
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error scanning PCI root %04x:%02x",
+					 domain, bus);
+			goto out;
+		}
+	}
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+out:
+	return err;
+}
+
+static int pcifront_try_disconnect(struct pcifront_device *pdev)
+{
+	int err = 0;
+	enum xenbus_state prev_state;
+
+
+	prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
+
+	if (prev_state >= XenbusStateClosing)
+		goto out;
+
+	if (prev_state == XenbusStateConnected) {
+		pcifront_free_roots(pdev);
+		pcifront_disconnect(pdev);
+	}
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateClosed);
+
+out:
+
+	return err;
+}
+
+static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
+{
+	int err = -EFAULT;
+	int i, num_roots, len;
+	unsigned int domain, bus;
+	char str[64];
+
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateReconfiguring)
+		goto out;
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+			   "root_num", "%d", &num_roots);
+	if (err == -ENOENT) {
+		xenbus_dev_error(pdev->xdev, err,
+				 "No PCI Roots found, trying 0000:00");
+		err = pcifront_rescan_root(pdev, 0, 0);
+		num_roots = 0;
+	} else if (err != 1) {
+		if (err == 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of PCI roots");
+		goto out;
+	}
+
+	for (i = 0; i < num_roots; i++) {
+		len = snprintf(str, sizeof(str), "root-%d", i);
+		if (unlikely(len >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+				   "%x:%x", &domain, &bus);
+		if (err != 2) {
+			if (err >= 0)
+				err = -EINVAL;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error reading PCI root %d", i);
+			goto out;
+		}
+
+		err = pcifront_rescan_root(pdev, domain, bus);
+		if (err) {
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error scanning PCI root %04x:%02x",
+					 domain, bus);
+			goto out;
+		}
+	}
+
+	xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+out:
+	return err;
+}
+
+static int pcifront_detach_devices(struct pcifront_device *pdev)
+{
+	int err = 0;
+	int i, num_devs;
+	unsigned int domain, bus, slot, func;
+	struct pci_bus *pci_bus;
+	struct pci_dev *pci_dev;
+	char str[64];
+
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateConnected)
+		goto out;
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
+			   &num_devs);
+	if (err != 1) {
+		if (err >= 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of PCI devices");
+		goto out;
+	}
+
+	/* Find devices being detached and remove them. */
+	for (i = 0; i < num_devs; i++) {
+		int l, state;
+		l = snprintf(str, sizeof(str), "state-%d", i);
+		if (unlikely(l >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d",
+				   &state);
+		if (err != 1)
+			state = XenbusStateUnknown;
+
+		if (state != XenbusStateClosing)
+			continue;
+
+		/* Remove device. */
+		l = snprintf(str, sizeof(str), "vdev-%d", i);
+		if (unlikely(l >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+				   "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+		if (err != 4) {
+			if (err >= 0)
+				err = -EINVAL;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error reading PCI device %d", i);
+			goto out;
+		}
+
+		pci_bus = pci_find_bus(domain, bus);
+		if (!pci_bus) {
+			dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n",
+				domain, bus);
+			continue;
+		}
+		pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func));
+		if (!pci_dev) {
+			dev_dbg(&pdev->xdev->dev,
+				"Cannot get PCI device %04x:%02x:%02x.%02x\n",
+				domain, bus, slot, func);
+			continue;
+		}
+		pci_remove_bus_device(pci_dev);
+		pci_dev_put(pci_dev);
+
+		dev_dbg(&pdev->xdev->dev,
+			"PCI device %04x:%02x:%02x.%02x removed.\n",
+			domain, bus, slot, func);
+	}
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
+
+out:
+	return err;
+}
+
+static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
+						  enum xenbus_state be_state)
+{
+	struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
+
+	switch (be_state) {
+	case XenbusStateUnknown:
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateConnected:
+		pcifront_try_connect(pdev);
+		break;
+
+	case XenbusStateClosing:
+		dev_warn(&xdev->dev, "backend going away!\n");
+		pcifront_try_disconnect(pdev);
+		break;
+
+	case XenbusStateReconfiguring:
+		pcifront_detach_devices(pdev);
+		break;
+
+	case XenbusStateReconfigured:
+		pcifront_attach_devices(pdev);
+		break;
+	}
+}
+
+static int pcifront_xenbus_probe(struct xenbus_device *xdev,
+				 const struct xenbus_device_id *id)
+{
+	int err = 0;
+	struct pcifront_device *pdev = alloc_pdev(xdev);
+
+	if (pdev == NULL) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(xdev, err,
+				 "Error allocating pcifront_device struct");
+		goto out;
+	}
+
+	err = pcifront_publish_info(pdev);
+
+out:
+	return err;
+}
+
+static int pcifront_xenbus_remove(struct xenbus_device *xdev)
+{
+	struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
+
+	if (pdev)
+		free_pdev(pdev);
+
+	return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+	{"pci"},
+	{""},
+};
+
+static struct xenbus_driver xenbus_pcifront_driver = {
+	.name 			= "pcifront",
+	.owner 			= THIS_MODULE,
+	.ids 			= xenpci_ids,
+	.probe 			= pcifront_xenbus_probe,
+	.remove 		= pcifront_xenbus_remove,
+	.otherend_changed 	= pcifront_backend_changed,
+};
+
+static int __init pcifront_init(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	pci_frontend_registrar(1 /* enable */);
+
+	return xenbus_register_frontend(&xenbus_pcifront_driver);
+}
+
+static void __exit pcifront_cleanup(void)
+{
+	xenbus_unregister_driver(&xenbus_pcifront_driver);
+	pci_frontend_registrar(0 /* disable */);
+}
+module_init(pcifront_init);
+module_exit(pcifront_cleanup);
+
+MODULE_DESCRIPTION("Xen PCI passthrough frontend.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xen:pci");
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 188e1ba..efac9e3 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -2063,6 +2063,7 @@
 	select FB_SYS_IMAGEBLIT
 	select FB_SYS_FOPS
 	select FB_DEFERRED_IO
+	select XEN_XENBUS_FRONTEND
 	default y
 	help
 	  This driver implements the front-end of the Xen virtual
diff --git a/drivers/video/broadsheetfb.c b/drivers/video/broadsheetfb.c
index 509cb92..df9ccb9 100644
--- a/drivers/video/broadsheetfb.c
+++ b/drivers/video/broadsheetfb.c
@@ -470,7 +470,7 @@
 	par->read_reg = broadsheet_read_reg;
 	init_waitqueue_head(&par->waitq);
 
-	info->flags = FBINFO_FLAG_DEFAULT;
+	info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
 
 	info->fbdefio = &broadsheetfb_defio;
 	fb_deferred_io_init(info);
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index c27ab1e..94414fc 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -144,7 +144,9 @@
 static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	vma->vm_ops = &fb_deferred_io_vm_ops;
-	vma->vm_flags |= ( VM_IO | VM_RESERVED | VM_DONTEXPAND );
+	vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND );
+	if (!(info->flags & FBINFO_VIRTFB))
+		vma->vm_flags |= VM_IO;
 	vma->vm_private_data = info;
 	return 0;
 }
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 99bbd28..057433a 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1362,6 +1362,7 @@
 	vma->vm_pgoff = off >> PAGE_SHIFT;
 	/* This is an IO map - tell maydump to skip this VMA */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	fb_pgprotect(file, vma, off);
 	if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
 			     vma->vm_end - vma->vm_start, vma->vm_page_prot))
diff --git a/drivers/video/hecubafb.c b/drivers/video/hecubafb.c
index 0b4bffb..f9d77ad 100644
--- a/drivers/video/hecubafb.c
+++ b/drivers/video/hecubafb.c
@@ -253,7 +253,7 @@
 	par->send_command = apollo_send_command;
 	par->send_data = apollo_send_data;
 
-	info->flags = FBINFO_FLAG_DEFAULT;
+	info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
 
 	info->fbdefio = &hecubafb_defio;
 	fb_deferred_io_init(info);
diff --git a/drivers/video/metronomefb.c b/drivers/video/metronomefb.c
index df1f757..661bfd2 100644
--- a/drivers/video/metronomefb.c
+++ b/drivers/video/metronomefb.c
@@ -700,7 +700,7 @@
 	if (retval < 0)
 		goto err_free_irq;
 
-	info->flags = FBINFO_FLAG_DEFAULT;
+	info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
 
 	info->fbdefio = &metronomefb_defio;
 	fb_deferred_io_init(info);
diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
index 54cd916..dc72563 100644
--- a/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@ -25,7 +25,10 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+
 #include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
 #include <xen/events.h>
 #include <xen/page.h>
 #include <xen/interface/io/fbif.h>
@@ -440,7 +443,7 @@
 	fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
 	fb_info->fix.accel = FB_ACCEL_NONE;
 
-	fb_info->flags = FBINFO_FLAG_DEFAULT;
+	fb_info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
 
 	ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
 	if (ret < 0) {
@@ -627,6 +630,8 @@
 	switch (backend_state) {
 	case XenbusStateInitialising:
 	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
 	case XenbusStateUnknown:
 	case XenbusStateClosed:
 		break;
@@ -680,7 +685,7 @@
 
 static int __init xenfb_init(void)
 {
-	if (!xen_domain())
+	if (!xen_domain() || xen_hvm_domain())
 		return -ENODEV;
 
 	/* Nothing to do if running in dom0. */
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 3711b88..4fcb4c5 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -975,6 +975,16 @@
 
 # XTENSA Architecture
 
+# Xen Architecture
+
+config XEN_WDT
+	tristate "Xen Watchdog support"
+	depends on XEN
+	help
+	  Say Y here to support the hypervisor watchdog capability provided
+	  by Xen 4.0 and newer.  The watchdog timeout period is normally one
+	  minute but can be changed with a boot-time parameter.
+
 #
 # ISA-based Watchdog Cards
 #
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 699199b..2f6739a 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -141,6 +141,9 @@
 
 # XTENSA Architecture
 
+# Xen
+obj-$(CONFIG_XEN_WDT) += xen_wdt.o
+
 # Architecture Independant
 obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o
 obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o
diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c
new file mode 100644
index 0000000..bcfaafb
--- /dev/null
+++ b/drivers/watchdog/xen_wdt.c
@@ -0,0 +1,359 @@
+/*
+ *	Xen Watchdog Driver
+ *
+ *	(c) Copyright 2010 Novell, Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#define DRV_NAME	"wdt"
+#define DRV_VERSION	"0.01"
+#define PFX		DRV_NAME ": "
+
+#include <linux/bug.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/watchdog.h>
+#include <xen/xen.h>
+#include <asm/xen/hypercall.h>
+#include <xen/interface/sched.h>
+
+static struct platform_device *platform_device;
+static DEFINE_SPINLOCK(wdt_lock);
+static struct sched_watchdog wdt;
+static __kernel_time_t wdt_expires;
+static bool is_active, expect_release;
+
+#define WATCHDOG_TIMEOUT 60 /* in seconds */
+static unsigned int timeout = WATCHDOG_TIMEOUT;
+module_param(timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds "
+	"(default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, S_IRUGO);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
+	"(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
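+/* Program the configured timeout and return the expected expiry time,
+ * used later for WDIOC_GETTIMELEFT. */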
+static inline __kernel_time_t set_timeout(void)
+{
+	wdt.timeout = timeout;
+	return ktime_to_timespec(ktime_get()).tv_sec + timeout;
+}
+
+static int xen_wdt_start(void)
+{
+	__kernel_time_t expires;
+	int err;
+
+	spin_lock(&wdt_lock);
+
+	expires = set_timeout();
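+	/* An id of zero means no watchdog is armed yet; the hypervisor
+	 * returns the id of the newly created watchdog on success. */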
+	if (!wdt.id)
+		err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
+	else
+		err = -EBUSY;
+	if (err > 0) {
+		wdt.id = err;
+		wdt_expires = expires;
+		err = 0;
+	} else
+		BUG_ON(!err);
+
+	spin_unlock(&wdt_lock);
+
+	return err;
+}
+
+static int xen_wdt_stop(void)
+{
+	int err = 0;
+
+	spin_lock(&wdt_lock);
+
+	wdt.timeout = 0;
+	if (wdt.id)
+		err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
+	if (!err)
+		wdt.id = 0;
+
+	spin_unlock(&wdt_lock);
+
+	return err;
+}
+
+static int xen_wdt_kick(void)
+{
+	__kernel_time_t expires;
+	int err;
+
+	spin_lock(&wdt_lock);
+
+	expires = set_timeout();
+	if (wdt.id)
+		err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
+	else
+		err = -ENXIO;
+	if (!err)
+		wdt_expires = expires;
+
+	spin_unlock(&wdt_lock);
+
+	return err;
+}
+
+static int xen_wdt_open(struct inode *inode, struct file *file)
+{
+	int err;
+
+	/* /dev/watchdog can only be opened once */
+	if (xchg(&is_active, true))
+		return -EBUSY;
+
+	err = xen_wdt_start();
+	if (err == -EBUSY)
+		err = xen_wdt_kick();
+	return err ?: nonseekable_open(inode, file);
+}
+
+static int xen_wdt_release(struct inode *inode, struct file *file)
+{
+	if (expect_release)
+		xen_wdt_stop();
+	else {
+		printk(KERN_CRIT PFX
+		       "unexpected close, not stopping watchdog!\n");
+		xen_wdt_kick();
+	}
+	is_active = false;
+	expect_release = false;
+	return 0;
+}
+
+static ssize_t xen_wdt_write(struct file *file, const char __user *data,
+			     size_t len, loff_t *ppos)
+{
+	/* See if we got the magic character 'V' and reload the timer */
+	if (len) {
+		if (!nowayout) {
+			size_t i;
+
+			/* in case it was set long ago */
+			expect_release = false;
+
+			/* scan to see whether or not we got the magic
+			   character */
+			for (i = 0; i != len; i++) {
+				char c;
+				if (get_user(c, data + i))
+					return -EFAULT;
+				if (c == 'V')
+					expect_release = true;
+			}
+		}
+
+		/* someone wrote to us, we should reload the timer */
+		xen_wdt_kick();
+	}
+	return len;
+}
+
+static long xen_wdt_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	int new_options, retval = -EINVAL;
+	int new_timeout;
+	int __user *argp = (void __user *)arg;
+	static const struct watchdog_info ident = {
+		.options =		WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
+		.firmware_version =	0,
+		.identity =		DRV_NAME,
+	};
+
+	switch (cmd) {
+	case WDIOC_GETSUPPORT:
+		return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0;
+
+	case WDIOC_GETSTATUS:
+	case WDIOC_GETBOOTSTATUS:
+		return put_user(0, argp);
+
+	case WDIOC_SETOPTIONS:
+		if (get_user(new_options, argp))
+			return -EFAULT;
+
+		if (new_options & WDIOS_DISABLECARD)
+			retval = xen_wdt_stop();
+		if (new_options & WDIOS_ENABLECARD) {
+			retval = xen_wdt_start();
+			if (retval == -EBUSY)
+				retval = xen_wdt_kick();
+		}
+		return retval;
+
+	case WDIOC_KEEPALIVE:
+		xen_wdt_kick();
+		return 0;
+
+	case WDIOC_SETTIMEOUT:
+		if (get_user(new_timeout, argp))
+			return -EFAULT;
+		if (!new_timeout)
+			return -EINVAL;
+		timeout = new_timeout;
+		xen_wdt_kick();
+		/* fall through */
+	case WDIOC_GETTIMEOUT:
+		return put_user(timeout, argp);
+
+	case WDIOC_GETTIMELEFT:
+		retval = wdt_expires - ktime_to_timespec(ktime_get()).tv_sec;
+		return put_user(retval, argp);
+	}
+
+	return -ENOTTY;
+}
+
+static const struct file_operations xen_wdt_fops = {
+	.owner =		THIS_MODULE,
+	.llseek =		no_llseek,
+	.write =		xen_wdt_write,
+	.unlocked_ioctl =	xen_wdt_ioctl,
+	.open =			xen_wdt_open,
+	.release =		xen_wdt_release,
+};
+
+static struct miscdevice xen_wdt_miscdev = {
+	.minor =	WATCHDOG_MINOR,
+	.name =		"watchdog",
+	.fops =		&xen_wdt_fops,
+};
+
+static int __devinit xen_wdt_probe(struct platform_device *dev)
+{
+	struct sched_watchdog wd = { .id = ~0 };
+	int ret = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);
+
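+	/* A bogus id probes for support: -EINVAL means SCHEDOP_watchdog is
+	 * implemented, -ENOSYS means the hypervisor lacks it. */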
+	switch (ret) {
+	case -EINVAL:
+		if (!timeout) {
+			timeout = WATCHDOG_TIMEOUT;
+			printk(KERN_INFO PFX
+			       "timeout value invalid, using %d\n", timeout);
+		}
+
+		ret = misc_register(&xen_wdt_miscdev);
+		if (ret) {
+			printk(KERN_ERR PFX
+			       "cannot register miscdev on minor=%d (%d)\n",
+			       WATCHDOG_MINOR, ret);
+			break;
+		}
+
+		printk(KERN_INFO PFX
+		       "initialized (timeout=%ds, nowayout=%d)\n",
+		       timeout, nowayout);
+		break;
+
+	case -ENOSYS:
+		printk(KERN_INFO PFX "not supported\n");
+		ret = -ENODEV;
+		break;
+
+	default:
+		printk(KERN_INFO PFX "bogus return value %d\n", ret);
+		break;
+	}
+
+	return ret;
+}
+
+static int __devexit xen_wdt_remove(struct platform_device *dev)
+{
+	/* Stop the timer before we leave */
+	if (!nowayout)
+		xen_wdt_stop();
+
+	misc_deregister(&xen_wdt_miscdev);
+
+	return 0;
+}
+
+static void xen_wdt_shutdown(struct platform_device *dev)
+{
+	xen_wdt_stop();
+}
+
+static int xen_wdt_suspend(struct platform_device *dev, pm_message_t state)
+{
+	return xen_wdt_stop();
+}
+
+static int xen_wdt_resume(struct platform_device *dev)
+{
+	return xen_wdt_start();
+}
+
+static struct platform_driver xen_wdt_driver = {
+	.probe          = xen_wdt_probe,
+	.remove         = __devexit_p(xen_wdt_remove),
+	.shutdown       = xen_wdt_shutdown,
+	.suspend        = xen_wdt_suspend,
+	.resume         = xen_wdt_resume,
+	.driver         = {
+		.owner  = THIS_MODULE,
+		.name   = DRV_NAME,
+	},
+};
+
+static int __init xen_wdt_init_module(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	printk(KERN_INFO PFX "Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
+
+	err = platform_driver_register(&xen_wdt_driver);
+	if (err)
+		return err;
+
+	platform_device = platform_device_register_simple(DRV_NAME,
+								  -1, NULL, 0);
+	if (IS_ERR(platform_device)) {
+		err = PTR_ERR(platform_device);
+		platform_driver_unregister(&xen_wdt_driver);
+	}
+
+	return err;
+}
+
+static void __exit xen_wdt_cleanup_module(void)
+{
+	platform_device_unregister(platform_device);
+	platform_driver_unregister(&xen_wdt_driver);
+	printk(KERN_INFO PFX "module unloaded\n");
+}
+
+module_init(xen_wdt_init_module);
+module_exit(xen_wdt_cleanup_module);
+
+MODULE_AUTHOR("Jen Beulich <jbeulich@novell.com>");
+MODULE_DESCRIPTION("Xen WatchDog Timer Driver");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index cab100a..fa9982e 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -28,6 +28,110 @@
 	  firing.
 	  If in doubt, say yes.
 
+config XEN_BACKEND
+	bool "Backend driver support"
+	depends on XEN_DOM0
+	default y
+	help
+	  Support for backend device drivers that provide I/O services
+	  to other virtual machines.
+
+config XEN_NETDEV_BACKEND
+	tristate "Xen backend network device"
+	depends on XEN_BACKEND && NET
+	help
+	  Implement the network backend driver, which passes packets
+	  from the guest domain's frontend drivers to the network.
+
+config XEN_BLKDEV_BACKEND
+	tristate "Block-device backend driver"
+	depends on XEN_BACKEND && BLOCK
+	help
+	  The block-device backend driver allows the kernel to export its
+	  block devices to other guests via a high-performance shared-memory
+	  interface.
+
+config XEN_BLKDEV_TAP
+	tristate "Block-device tap backend driver"
+	depends on XEN_BACKEND && BLOCK
+	help
+	  The block tap driver is an alternative to the block back driver
+	  and allows VM block requests to be redirected to userspace through
+	  a device interface.  The tap allows user-space development of
+	  high-performance block backends, where disk images may be implemented
+	  as files, in memory, or on other hosts across the network.  This
+	  driver can safely coexist with the existing blockback driver.
+
+config XEN_BLKBACK_PAGEMAP
+	tristate
+	depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n
+	default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP
+
+config XEN_PCIDEV_BACKEND
+	tristate "PCI-device backend driver"
+	depends on PCI && XEN_BACKEND
+	default XEN_BACKEND
+	help
+	  The PCI device backend driver allows the kernel to export arbitrary
+	  PCI devices to other guests. If you select this to be a module, you
+	  will need to make sure no other driver has bound to the device(s)
+	  you want to make visible to other guests.
+
+choice
+	prompt "PCI Backend Mode"
+	depends on XEN_PCIDEV_BACKEND
+	default XEN_PCIDEV_BACKEND_VPCI if !IA64
+	default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
+
+config XEN_PCIDEV_BACKEND_VPCI
+	bool "Virtual PCI"
+	---help---
+	  This PCI Backend hides the true PCI topology and makes the frontend
+	  think there is a single PCI bus with only the exported devices on it.
+	  For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
+	  second device at 02:1a.1 will be re-assigned to 00:01.1.
+
+config XEN_PCIDEV_BACKEND_PASS
+	bool "Passthrough"
+	---help---
+	  This PCI Backend provides a real view of the PCI topology to the
+	  frontend (for example, a device at 06:01.b will still appear at
+	  06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
+	  PCI devices to its driver domains. This may be required for drivers
+	  which depend on finding their hardware in certain bus/slot
+	  locations.
+
+config XEN_PCIDEV_BACKEND_SLOT
+	bool "Slot"
+	---help---
+	  This PCI Backend hides the true PCI topology and makes the frontend
+	  think there is a single PCI bus with only the exported devices on it.
+	  Contrary to the virtual PCI backend, a function becomes a new slot.
+	  For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
+	  second device at 02:1a.1 will be re-assigned to 00:01.0.
+
+config XEN_PCIDEV_BACKEND_CONTROLLER
+	bool "Controller"
+	depends on IA64
+	---help---
+	  This PCI backend virtualizes the PCI bus topology by providing a
+	  virtual bus per PCI root device.  Devices which are physically under
+	  the same root bus will appear on the same virtual bus.  For systems
+	  with complex I/O addressing, this is the only backend which supports
+	  extended I/O port spaces and MMIO translation offsets.  This backend
+	  also supports slot virtualization.  For example, a device at
+	  0000:01:02.1 will be re-assigned to 0000:00:00.0.  A second device
+	  at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
+	  re-assigned to 0000:00:01.0.  A third device at 0000:16:05.0 (under
+	  a different PCI root bus) will be re-assigned to 0000:01:00.0.
+
+endchoice
+
+config XEN_PCIDEV_BE_DEBUG
+	bool "PCI Backend Debugging"
+	depends on XEN_PCIDEV_BACKEND
+
 config XENFS
 	tristate "Xen filesystem"
 	depends on XEN
@@ -60,4 +164,37 @@
          Create entries under /sys/hypervisor describing the Xen
 	 hypervisor environment.  When running native or in another
 	 virtual environment, /sys/hypervisor will still be present,
-	 but will have no xen contents.
\ No newline at end of file
+	 but will have no xen contents.
+
+config XEN_MCE
+       def_bool y
+       depends on XEN_DOM0 && X86_64 && X86_MCE_INTEL
+
+config XEN_XENBUS_FRONTEND
+       tristate
+
+config XEN_GNTDEV
+	tristate "userspace grant access device driver"
+	depends on XEN
+	select MMU_NOTIFIER
+	help
+	  Allows userspace processes to use grants.
+
+config XEN_S3
+	def_bool y
+	depends on XEN_DOM0 && ACPI
+
+config ACPI_PROCESSOR_XEN
+	tristate
+	depends on XEN_DOM0 && ACPI_PROCESSOR && CPU_FREQ
+	default y
+
+config XEN_PLATFORM_PCI
+	tristate "xen platform pci device driver"
+	depends on XEN_PVHVM
+	default m
+	help
+	  Driver for the Xen PCI Platform device: it is responsible for
+	  initializing xenbus and grant_table when running in a Xen HVM
+	  domain. As a consequence, this driver is required to run any Xen PV
+	  frontend on Xen HVM.
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 7c28434..ef1ea63 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,12 +1,27 @@
-obj-y	+= grant-table.o features.o events.o manage.o
+obj-y	+= grant-table.o features.o events.o manage.o biomerge.o pcpu.o
 obj-y	+= xenbus/
 
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o			:= $(nostackp)
 
-obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
-obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
-obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
-obj-$(CONFIG_XEN_DEV_EVTCHN)	+= evtchn.o
-obj-$(CONFIG_XENFS)		+= xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
\ No newline at end of file
+obj-$(CONFIG_PCI)			+= pci.o
+obj-$(CONFIG_HOTPLUG_CPU)		+= cpu_hotplug.o
+obj-$(CONFIG_XEN_XENCOMM)		+= xencomm.o
+obj-$(CONFIG_XEN_BALLOON)		+= balloon.o
+obj-$(CONFIG_XEN_DEV_EVTCHN)		+= xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV)		+= xen-gntdev.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= pciback/
+obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= blkback/
+obj-$(CONFIG_XEN_BLKDEV_TAP)            += blktap/
+obj-$(CONFIG_XEN_NETDEV_BACKEND)	+= netback/
+obj-$(CONFIG_XENFS)			+= xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
+obj-$(CONFIG_XEN_MCE)		+= mce.o
+
+obj-$(CONFIG_XEN_S3)           += acpi.o
+obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o
+obj-$(CONFIG_ACPI_HOTPLUG_MEMORY)  += xen_acpi_memhotplug.o
+obj-$(CONFIG_XEN_PLATFORM_PCI)	+= platform-pci.o
+
+xen-evtchn-y				:= evtchn.o
+xen-gntdev-y				:= gntdev.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
new file mode 100644
index 0000000..e6d3d0e
--- /dev/null
+++ b/drivers/xen/acpi.c
@@ -0,0 +1,23 @@
+#include <xen/acpi.h>
+
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
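+/*
+ * Ask the hypervisor to enter the given ACPI sleep state: on a Xen dom0 the
+ * hypervisor, not the kernel, performs the final S-state transition, so the
+ * PM1a/PM1b control values are forwarded via XENPF_enter_acpi_sleep.
+ */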
+int acpi_notify_hypervisor_state(u8 sleep_state,
+				 u32 pm1a_cnt, u32 pm1b_cnt)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_enter_acpi_sleep,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u = {
+			.enter_acpi_sleep = {
+				.pm1a_cnt_val = (u16)pm1a_cnt,
+				.pm1b_cnt_val = (u16)pm1b_cnt,
+				.sleep_state = sleep_state,
+			},
+		},
+	};
+
+	return HYPERVISOR_dom0_op(&op);
+}
diff --git a/drivers/xen/acpi_processor.c b/drivers/xen/acpi_processor.c
new file mode 100644
index 0000000..e83b615
--- /dev/null
+++ b/drivers/xen/acpi_processor.c
@@ -0,0 +1,417 @@
+/*
+ *  acpi_processor.c - interface to notify Xen on acpi processor object
+ *                     info parsing
+ *
+ *  Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+
+#include <linux/cpufreq.h>
+#include <acpi/processor.h>
+#include <xen/acpi.h>
+#include <xen/pcpu.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event);
+
+static struct processor_cntl_xen_ops xen_ops = {
+	.hotplug		= xen_hotplug_notifier,
+};
+
+static struct acpi_power_register *power_registers[XEN_MAX_ACPI_ID + 1];
+
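+/*
+ * Cache the ACPI power register description for C-state 'cx' of processor
+ * 'cpu' so that xen_cx_notifier() can later hand the complete set of
+ * C-state registers to the hypervisor in one hypercall.
+ */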
+int processor_cntl_xen_power_cache(int cpu, int cx,
+		struct acpi_power_register *reg)
+{
+	struct acpi_power_register *buf;
+
+	if (cpu < 0 || cpu > XEN_MAX_ACPI_ID ||
+			cx < 1 || cx > ACPI_PROCESSOR_MAX_POWER) {
+		return -EINVAL;
+	}
+
+	if (power_registers[cpu] == NULL) {
+		buf = kzalloc(ACPI_PROCESSOR_MAX_POWER *
+				sizeof(struct xen_processor_cx), GFP_KERNEL);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		power_registers[cpu] = buf;
+	}
+
+	memcpy(power_registers[cpu]+cx-1, reg, sizeof(*reg));
+
+	return 0;
+}
+EXPORT_SYMBOL(processor_cntl_xen_power_cache);
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
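+/*
+ * Evaluate the processor's _MAT method and extract the local APIC id from
+ * the returned MADT LAPIC entry; returns a negative value if the entry is
+ * missing, malformed or disabled.
+ */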
+static int xen_get_apic_id(acpi_handle handle)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	struct acpi_madt_local_apic *lapic;
+	u8 physid;
+
+	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+		return -EINVAL;
+
+	if (!buffer.length || !buffer.pointer)
+		return -EINVAL;
+
+	obj = buffer.pointer;
+	if (obj->type != ACPI_TYPE_BUFFER ||
+	    obj->buffer.length < sizeof(*lapic)) {
+		kfree(buffer.pointer);
+		return -EINVAL;
+	}
+
+	lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
+
+	if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
+	    !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
+		kfree(buffer.pointer);
+		return -EINVAL;
+	}
+
+	physid = lapic->id;
+	kfree(buffer.pointer);
+	buffer.length = ACPI_ALLOCATE_BUFFER;
+	buffer.pointer = NULL;
+
+	return physid;
+}
+#else
+static int xen_get_apic_id(acpi_handle handle)
+{
+	return -1;
+}
+#endif
+
+int processor_cntl_xen_notify(struct acpi_processor *pr, int event, int type)
+{
+	int ret = -EINVAL;
+
+	switch (event) {
+	case PROCESSOR_PM_INIT:
+	case PROCESSOR_PM_CHANGE:
+		if ((type >= PM_TYPE_MAX) ||
+			!xen_ops.pm_ops[type])
+			break;
+
+		ret = xen_ops.pm_ops[type](pr, event);
+		break;
+	case PROCESSOR_HOTPLUG:
+	{
+		int apic_id;
+
+		apic_id = xen_get_apic_id(pr->handle);
+		if (apic_id < 0)
+			break;
+		if (xen_ops.hotplug)
+			ret = xen_ops.hotplug(pr, type);
+		xen_pcpu_hotplug(type, apic_id);
+		break;
+	}
+	default:
+		printk(KERN_ERR "Unsupported processor event %d.\n", event);
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(processor_cntl_xen_notify);
+
+static inline void xen_convert_pct_reg(struct xen_pct_register *xpct,
+	struct acpi_pct_register *apct)
+{
+	xpct->descriptor = apct->descriptor;
+	xpct->length     = apct->length;
+	xpct->space_id   = apct->space_id;
+	xpct->bit_width  = apct->bit_width;
+	xpct->bit_offset = apct->bit_offset;
+	xpct->reserved   = apct->reserved;
+	xpct->address    = apct->address;
+}
+
+static inline void xen_convert_pss_states(struct xen_processor_px *xpss,
+	struct acpi_processor_px *apss, int state_count)
+{
+	int i;
+	for (i = 0; i < state_count; i++) {
+		xpss->core_frequency     = apss->core_frequency;
+		xpss->power              = apss->power;
+		xpss->transition_latency = apss->transition_latency;
+		xpss->bus_master_latency = apss->bus_master_latency;
+		xpss->control            = apss->control;
+		xpss->status             = apss->status;
+		xpss++;
+		apss++;
+	}
+}
+
+static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd,
+	struct acpi_psd_package *apsd)
+{
+	xpsd->num_entries    = apsd->num_entries;
+	xpsd->revision       = apsd->revision;
+	xpsd->domain         = apsd->domain;
+	xpsd->coord_type     = apsd->coord_type;
+	xpsd->num_processors = apsd->num_processors;
+}
+
+static int xen_cx_notifier(struct acpi_processor *pr, int action)
+{
+	int ret, count = 0, i;
+	xen_platform_op_t op = {
+		.cmd			= XENPF_set_processor_pminfo,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id	= pr->acpi_id,
+		.u.set_pminfo.type	= XEN_PM_CX,
+	};
+	struct xen_processor_cx *data, *buf;
+	struct acpi_processor_cx *cx;
+	struct acpi_power_register *reg;
+
+	if (action == PROCESSOR_PM_CHANGE)
+		return -EINVAL;
+
+	if (power_registers[pr->acpi_id] == NULL) {
+		printk(KERN_WARNING "No C-state info for ACPI processor %d\n",
+				pr->acpi_id);
+		return -EINVAL;
+	}
+
+	/* Convert to Xen defined structure and hypercall */
+	buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
+			GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	data = buf;
+	for (i = 1; i <= pr->power.count; i++) {
+		cx = &pr->power.states[i];
+		reg = power_registers[pr->acpi_id]+i-1;
+		/* Skip invalid cstate entry */
+		if (!cx->valid)
+			continue;
+
+		data->type = cx->type;
+		data->latency = cx->latency;
+		data->power = cx->power;
+		data->reg.space_id = reg->space_id;
+		data->reg.bit_width = reg->bit_width;
+		data->reg.bit_offset = reg->bit_offset;
+		data->reg.access_size = reg->access_size;
+		data->reg.address = reg->address;
+
+		/* Get dependency relationships, _CSD is not supported yet */
+		data->dpcnt = 0;
+		set_xen_guest_handle(data->dp, NULL);
+
+		data++;
+		count++;
+	}
+
+	if (!count) {
+		printk(KERN_ERR "No available Cx info for cpu %d\n",
+				pr->acpi_id);
+		kfree(buf);
+		return -EINVAL;
+	}
+
+	op.u.set_pminfo.power.count = count;
+	op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
+	op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
+	op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
+	op.u.set_pminfo.power.flags.power_setup_done =
+		pr->flags.power_setup_done;
+
+	set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
+	ret = HYPERVISOR_dom0_op(&op);
+	kfree(buf);
+	return ret;
+}
+
+static int xen_px_notifier(struct acpi_processor *pr, int action)
+{
+	int ret = -EINVAL;
+	xen_platform_op_t op = {
+		.cmd			= XENPF_set_processor_pminfo,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id	= pr->acpi_id,
+		.u.set_pminfo.type	= XEN_PM_PX,
+	};
+	struct xen_processor_performance *perf;
+	struct xen_processor_px *states = NULL;
+	struct acpi_processor_performance *px;
+	struct acpi_psd_package *pdomain;
+
+	if (!pr)
+		return -EINVAL;
+
+	perf = &op.u.set_pminfo.perf;
+	px = pr->performance;
+
+	switch (action) {
+	case PROCESSOR_PM_CHANGE:
+		/* ppc dynamic handle */
+		perf->flags = XEN_PX_PPC;
+		perf->platform_limit = pr->performance_platform_limit;
+
+		ret = HYPERVISOR_dom0_op(&op);
+		break;
+
+	case PROCESSOR_PM_INIT:
+		/* px normal init */
+		perf->flags = XEN_PX_PPC |
+			      XEN_PX_PCT |
+			      XEN_PX_PSS |
+			      XEN_PX_PSD;
+
+		/* ppc */
+		perf->platform_limit = pr->performance_platform_limit;
+
+		/* pct */
+		xen_convert_pct_reg(&perf->control_register,
+				&px->control_register);
+		xen_convert_pct_reg(&perf->status_register,
+				&px->status_register);
+
+		/* pss */
+		perf->state_count = px->state_count;
+		states = kzalloc(px->state_count*sizeof(xen_processor_px_t),
+				GFP_KERNEL);
+		if (!states)
+			return -ENOMEM;
+		xen_convert_pss_states(states, px->states, px->state_count);
+		set_xen_guest_handle(perf->states, states);
+
+		/* psd */
+		pdomain = &px->domain_info;
+		xen_convert_psd_pack(&perf->domain_info, pdomain);
+		if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
+			perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+		else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
+			perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+		else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
+			perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
+		else {
+			ret = -ENODEV;
+			kfree(states);
+			break;
+		}
+
+		ret = HYPERVISOR_dom0_op(&op);
+		kfree(states);
+		break;
+
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static int xen_tx_notifier(struct acpi_processor *pr, int action)
+{
+	return -EINVAL;
+}
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
+{
+	int ret = -EINVAL;
+	int apic_id;
+	unsigned long long pxm;
+	acpi_status status = 0;
+
+	xen_platform_op_t op = {
+		.interface_version  = XENPF_INTERFACE_VERSION,
+	};
+
+	apic_id = xen_get_apic_id(pr->handle);
+	if (apic_id < 0) {
+		printk(KERN_WARNING "Can't get apic_id for acpi_id %x\n",
+		  pr->acpi_id);
+		return -1;
+	}
+
+	status = acpi_evaluate_integer(pr->handle, "_PXM",
+	  NULL, &pxm);
+	if (ACPI_FAILURE(status)) {
+		printk(KERN_WARNING "Can't get pxm for acpi_id %x\n",
+		  pr->acpi_id);
+		return -1;
+	}
+
+	switch (event) {
+	case HOTPLUG_TYPE_ADD:
+		op.cmd = XENPF_cpu_hotadd;
+		op.u.cpu_add.apic_id = apic_id;
+		op.u.cpu_add.acpi_id = pr->acpi_id;
+		op.u.cpu_add.pxm = pxm;
+		ret = HYPERVISOR_dom0_op(&op);
+		break;
+	case HOTPLUG_TYPE_REMOVE:
+		printk(KERN_WARNING "Xen does not support CPU hot-remove\n");
+		ret = -ENOSYS;
+		break;
+	}
+
+	return ret;
+}
+#else
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
+{
+	return -ENOSYS;
+}
+#endif
+
+static int __init xen_acpi_processor_extcntl_init(void)
+{
+	unsigned int pmbits;
+
+	/* Only xen dom0 is allowed to handle ACPI processor info */
+	if (!xen_initial_domain())
+		return 0;
+
+	pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
+
+	if (pmbits & XEN_PROCESSOR_PM_CX)
+		xen_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
+	if (pmbits & XEN_PROCESSOR_PM_PX)
+		xen_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
+	if (pmbits & XEN_PROCESSOR_PM_TX)
+		xen_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
+
+	return 0;
+}
+
+subsys_initcall(xen_acpi_processor_extcntl_init);
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 4204336..a065fda 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -43,22 +43,26 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/sysdev.h>
+#include <linux/swap.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
+#include <asm/e820.h>
 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 #include <xen/xenbus.h>
 #include <xen/features.h>
 #include <xen/page.h>
 
-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10))
 
 #define BALLOON_CLASS_NAME "xen_memory"
 
@@ -82,15 +86,16 @@
 
 static int register_balloon(struct sys_device *sysdev);
 
-/*
- * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
- */
-static DEFINE_SPINLOCK(balloon_lock);
-
 static struct balloon_stats balloon_stats;
 
+/*
+ * Work in pages of this order.  Can be either 0 for normal pages
+ * or 9 for hugepages.
+ */
+static int balloon_order;
+static unsigned long balloon_npages;
+static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
 /* We increase/decrease in batches which fit in a page */
 static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
 
@@ -118,12 +123,43 @@
 static void scrub_page(struct page *page)
 {
 #ifdef CONFIG_XEN_SCRUB_PAGES
-	clear_highpage(page);
+	int i;
+
+	for (i = 0; i < balloon_npages; i++)
+		clear_highpage(page++);
 #endif
 }
 
+static void free_discontig_frame(void)
+{
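+	/*
+	 * Return the frames recorded in discontig_frame_list to the
+	 * hypervisor one page at a time (extent_order 0); used when a
+	 * ballooned-out extent turns out not to be machine-contiguous.
+	 */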
+	int rc;
+	struct xen_memory_reservation reservation = {
+		.address_bits = 0,
+		.domid        = DOMID_SELF,
+		.nr_extents   = balloon_npages,
+		.extent_order = 0
+	};
+
+	set_xen_guest_handle(reservation.extent_start, discontig_frame_list);
+	rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+	BUG_ON(rc != balloon_npages);
+}
+
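+/*
+ * Compact frame_list by squeezing out the entries that were zeroed in
+ * decrease_reservation() after their discontiguous extents had already been
+ * freed, so only whole extents are passed to XENMEM_decrease_reservation.
+ */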
+static unsigned long shrink_frame(unsigned long nr_pages)
+{
+	unsigned long i, j;
+
+	for (i = 0, j = 0; i < nr_pages; i++, j++) {
+		if (frame_list[i] == 0)
+			j++;
+		if (i != j)
+			frame_list[i] = frame_list[j];
+	}
+	return i;
+}
+
 /* balloon_append: add the given page to the balloon. */
-static void balloon_append(struct page *page)
+static void __balloon_append(struct page *page)
 {
 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
 	if (PageHighMem(page)) {
@@ -134,7 +170,11 @@
 		list_add(&page->lru, &ballooned_pages);
 		balloon_stats.balloon_low++;
 	}
+}
 
+static void balloon_append(struct page *page)
+{
+	__balloon_append(page);
 	totalram_pages--;
 }
 
@@ -195,20 +235,17 @@
 
 static int increase_reservation(unsigned long nr_pages)
 {
-	unsigned long  pfn, i, flags;
+	unsigned long  pfn, mfn, i, j;
 	struct page   *page;
 	long           rc;
 	struct xen_memory_reservation reservation = {
 		.address_bits = 0,
-		.extent_order = 0,
 		.domid        = DOMID_SELF
 	};
 
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
 
-	spin_lock_irqsave(&balloon_lock, flags);
-
 	page = balloon_first_page();
 	for (i = 0; i < nr_pages; i++) {
 		BUG_ON(page == NULL);
@@ -218,6 +255,8 @@
 
 	set_xen_guest_handle(reservation.extent_start, frame_list);
 	reservation.nr_extents = nr_pages;
+	reservation.extent_order = balloon_order;
+
 	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
 	if (rc < 0)
 		goto out;
@@ -227,19 +266,22 @@
 		BUG_ON(page == NULL);
 
 		pfn = page_to_pfn(page);
+		mfn = frame_list[i];
 		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
 		       phys_to_machine_mapping_valid(pfn));
 
-		set_phys_to_machine(pfn, frame_list[i]);
+		for (j = 0; j < balloon_npages; j++, pfn++, mfn++) {
+			set_phys_to_machine(pfn, mfn);
 
-		/* Link back into the page tables if not highmem. */
-		if (pfn < max_low_pfn) {
-			int ret;
-			ret = HYPERVISOR_update_va_mapping(
-				(unsigned long)__va(pfn << PAGE_SHIFT),
-				mfn_pte(frame_list[i], PAGE_KERNEL),
-				0);
-			BUG_ON(ret);
+			/* Link back into the page tables if not highmem. */
+			if (!xen_hvm_domain() && pfn < max_low_pfn) {
+				int ret;
+				ret = HYPERVISOR_update_va_mapping(
+					(unsigned long)__va(pfn << PAGE_SHIFT),
+					mfn_pte(mfn, PAGE_KERNEL),
+					0);
+				BUG_ON(ret);
+			}
 		}
 
 		/* Relinquish the page back to the allocator. */
@@ -251,20 +293,18 @@
 	balloon_stats.current_pages += rc;
 
  out:
-	spin_unlock_irqrestore(&balloon_lock, flags);
-
 	return rc < 0 ? rc : rc != nr_pages;
 }
 
 static int decrease_reservation(unsigned long nr_pages)
 {
-	unsigned long  pfn, i, flags;
-	struct page   *page;
+	unsigned long  pfn, lpfn, mfn, i, j;
+	struct page   *page = NULL;
 	int            need_sleep = 0;
-	int ret;
+	int		discontig = 0, discontig_free;
+	int		ret;
 	struct xen_memory_reservation reservation = {
 		.address_bits = 0,
-		.extent_order = 0,
 		.domid        = DOMID_SELF
 	};
 
@@ -272,7 +312,7 @@
 		nr_pages = ARRAY_SIZE(frame_list);
 
 	for (i = 0; i < nr_pages; i++) {
-		if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+		if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) {
 			nr_pages = i;
 			need_sleep = 1;
 			break;
@@ -282,38 +322,49 @@
 		frame_list[i] = pfn_to_mfn(pfn);
 
 		scrub_page(page);
-
-		if (!PageHighMem(page)) {
-			ret = HYPERVISOR_update_va_mapping(
-				(unsigned long)__va(pfn << PAGE_SHIFT),
-				__pte_ma(0), 0);
-			BUG_ON(ret);
-                }
-
 	}
 
 	/* Ensure that ballooned highmem pages don't have kmaps. */
 	kmap_flush_unused();
 	flush_tlb_all();
 
-	spin_lock_irqsave(&balloon_lock, flags);
-
 	/* No more mappings: invalidate P2M and add to balloon. */
 	for (i = 0; i < nr_pages; i++) {
-		pfn = mfn_to_pfn(frame_list[i]);
-		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+		mfn = frame_list[i];
+		lpfn = pfn = mfn_to_pfn(mfn);
 		balloon_append(pfn_to_page(pfn));
+		discontig_free = 0;
+		for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) {
+			if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn)
+				discontig_free = 1;
+
+			set_phys_to_machine(lpfn, INVALID_P2M_ENTRY);
+			page = pfn_to_page(lpfn);
+
+			if (!xen_hvm_domain() && !PageHighMem(page)) {
+				ret = HYPERVISOR_update_va_mapping(
+					(unsigned long)__va(lpfn << PAGE_SHIFT),
+					__pte_ma(0), 0);
+				BUG_ON(ret);
+			}
+		}
+		if (discontig_free) {
+			free_discontig_frame();
+			frame_list[i] = 0;
+			discontig = 1;
+		}
 	}
+	balloon_stats.current_pages -= nr_pages;
+
+	if (discontig)
+		nr_pages = shrink_frame(nr_pages);
 
 	set_xen_guest_handle(reservation.extent_start, frame_list);
 	reservation.nr_extents   = nr_pages;
+	reservation.extent_order = balloon_order;
 	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
 	BUG_ON(ret != nr_pages);
 
-	balloon_stats.current_pages -= nr_pages;
-
-	spin_unlock_irqrestore(&balloon_lock, flags);
-
 	return need_sleep;
 }
 
@@ -379,7 +430,7 @@
 	/* The given memory/target value is in KiB, so it needs converting to
 	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
 	 */
-	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+	balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order));
 }
 
 static int balloon_init_watcher(struct notifier_block *notifier,
@@ -399,15 +450,22 @@
 
 static int __init balloon_init(void)
 {
-	unsigned long pfn;
+	unsigned long pfn, nr_pages, extra_pfn_end;
 	struct page *page;
 
-	if (!xen_pv_domain())
+	if (!xen_domain())
 		return -ENODEV;
 
-	pr_info("xen_balloon: Initialising balloon driver.\n");
+	pr_info("xen_balloon: Initialising balloon driver with page order %d.\n",
+		balloon_order);
 
-	balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+	balloon_npages = 1 << balloon_order;
+
+	if (xen_pv_domain())
+		nr_pages = xen_start_info->nr_pages;
+	else
+		nr_pages = max_pfn;
+	balloon_stats.current_pages = (min(nr_pages, max_pfn)) >> balloon_order;
 	balloon_stats.target_pages  = balloon_stats.current_pages;
 	balloon_stats.balloon_low   = 0;
 	balloon_stats.balloon_high  = 0;
@@ -419,11 +477,24 @@
 
 	register_balloon(&balloon_sysdev);
 
-	/* Initialise the balloon with excess memory space. */
-	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+	/*
+	 * Initialise the balloon with excess memory space.  We need
+	 * to make sure we don't add memory which doesn't physically or
+	 * logically exist.  The E820 map can be trimmed to be smaller
+	 * than the amount of physical memory due to the mem= command
+	 * line parameter.  And if this is a 32-bit non-HIGHMEM kernel
+	 * on a system with memory which requires highmem to access,
+	 * don't try to use it.
+	 */
+	extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
+			    (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
+	for (pfn = PFN_UP(xen_extra_mem_start);
+	     pfn < extra_pfn_end;
+	     pfn += balloon_npages) {
 		page = pfn_to_page(pfn);
-		if (!PageReserved(page))
-			balloon_append(page);
+		/* totalram_pages doesn't include the boot-time
+		   balloon extension, so don't subtract from it. */
+		__balloon_append(page);
 	}
 
 	target_watch.callback = watch_target;
@@ -444,6 +515,121 @@
 
 module_exit(balloon_exit);
 
+static int __init balloon_parse_huge(char *s)
+{
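+	/* Booting with "balloon_hugepages" switches the balloon to order-9
+	 * (hugepage-sized) extents instead of single pages. */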
+	balloon_order = 9;
+	return 1;
+}
+
+__setup("balloon_hugepages", balloon_parse_huge);
+
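+/*
+ * apply_to_page_range() callback: clear the PTE, invalidate the P2M entry
+ * and hand the backing machine frame back to the hypervisor.
+ */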
+static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page,
+			  unsigned long addr, void *data)
+{
+	unsigned long mfn = pte_mfn(*pte);
+	int ret;
+	struct xen_memory_reservation reservation = {
+		.nr_extents   = 1,
+		.extent_order = 0,
+		.domid        = DOMID_SELF
+	};
+
+	set_xen_guest_handle(reservation.extent_start, &mfn);
+	set_pte_at(&init_mm, addr, pte, __pte_ma(0));
+	set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+
+	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+	BUG_ON(ret != 1);
+
+	return 0;
+}
+
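+/*
+ * Allocate a vector of kernel pages whose backing machine frames are given
+ * back to the hypervisor (via dealloc_pte_fn), leaving vacant PTE slots
+ * that backend drivers can later map foreign grants into.
+ */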
+struct page **alloc_empty_pages_and_pagevec(int nr_pages)
+{
+	struct page *page, **pagevec;
+	int npages;
+	int i, j, ret;
+
+	/* Round up to next number of balloon_order pages */
+	npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
+
+	pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL);
+	if (pagevec == NULL)
+		return NULL;
+
+	for (i = 0; i < nr_pages; i++) {
+		void *v;
+
+		page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order);
+		if (page == NULL)
+			goto err;
+
+		scrub_page(page);
+
+		mutex_lock(&balloon_mutex);
+
+		v = page_address(page);
+
+		ret = apply_to_page_range(&init_mm, (unsigned long)v,
+					  PAGE_SIZE << balloon_order,
+					  dealloc_pte_fn, NULL);
+
+		if (ret != 0) {
+			mutex_unlock(&balloon_mutex);
+			/* balloon_free_page(page) tries to use free_cold_page */
+			__free_page(page);
+			goto err;
+		}
+		for (j = 0; j < balloon_npages; j++)
+			pagevec[(i<<balloon_order)+j] = page++;
+
+		totalram_pages = balloon_stats.current_pages -= balloon_npages;
+
+		mutex_unlock(&balloon_mutex);
+	}
+
+ out:
+	schedule_work(&balloon_worker);
+	flush_tlb_all();
+	return pagevec;
+
+ err:
+	mutex_lock(&balloon_mutex);
+	while (--i >= 0)
+		balloon_append(pagevec[i << balloon_order]);
+	mutex_unlock(&balloon_mutex);
+	kfree(pagevec);
+	pagevec = NULL;
+	goto out;
+}
+EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
+
+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
+{
+	struct page *page;
+	int i;
+	int npages;
+
+	if (pagevec == NULL)
+		return;
+
+	/* Round up to next number of balloon_order pages */
+	npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
+
+	mutex_lock(&balloon_mutex);
+	for (i = 0; i < nr_pages; i++) {
+		page = pagevec[i << balloon_order];
+		BUG_ON(page_count(page) != 1);
+		balloon_append(page);
+	}
+	mutex_unlock(&balloon_mutex);
+
+	kfree(pagevec);
+
+	schedule_work(&balloon_worker);
+}
+EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
+
 #define BALLOON_SHOW(name, format, args...)				\
 	static ssize_t show_##name(struct sys_device *dev,		\
 				   struct sysdev_attribute *attr,	\
@@ -477,7 +663,7 @@
 
 	target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
 
-	balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+	balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
 
 	return count;
 }
@@ -491,7 +677,7 @@
 {
 	return sprintf(buf, "%llu\n",
 		       (unsigned long long)balloon_stats.target_pages
-		       << PAGE_SHIFT);
+		       << (PAGE_SHIFT + balloon_order));
 }
 
 static ssize_t store_target(struct sys_device *dev,
@@ -507,7 +693,7 @@
 
 	target_bytes = memparse(buf, &endchar);
 
-	balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+	balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
 
 	return count;
 }
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
new file mode 100644
index 0000000..d40f534
--- /dev/null
+++ b/drivers/xen/biomerge.c
@@ -0,0 +1,14 @@
+#include <linux/bio.h>
+#include <asm/io.h>
+#include <xen/page.h>
+
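+/*
+ * Under Xen a pseudo-physically contiguous range is not necessarily
+ * machine-contiguous, so two biovecs may only be merged if their machine
+ * frames are adjacent as well.
+ */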
+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+			       const struct bio_vec *vec2)
+{
+	unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
+	unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+
+	return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
+		((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+}
+
diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
new file mode 100644
index 0000000..dee55ba
--- /dev/null
+++ b/drivers/xen/blkback/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
+
+xen-blkback-y	:= blkback.o xenbus.o interface.o vbd.o
diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c
new file mode 100644
index 0000000..45f6eb2
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.c
@@ -0,0 +1,109 @@
+#include <linux/module.h>
+#include "blkback-pagemap.h"
+
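+/*
+ * Table, indexed through page_private(), recording which remote domain,
+ * bus id and grant reference each mapped page came from, so that other
+ * code handling the page can look up its origin.
+ */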
+static int blkback_pagemap_size;
+static struct blkback_pagemap *blkback_pagemap;
+
+static inline int
+blkback_pagemap_entry_clear(struct blkback_pagemap *map)
+{
+	static struct blkback_pagemap zero;
+	return !memcmp(map, &zero, sizeof(zero));
+}
+
+int
+blkback_pagemap_init(int pages)
+{
+	blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
+				  GFP_KERNEL);
+	if (!blkback_pagemap)
+		return -ENOMEM;
+
+	blkback_pagemap_size = pages;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_init);
+
+void
+blkback_pagemap_set(int idx, struct page *page,
+		    domid_t domid, busid_t busid, grant_ref_t gref)
+{
+	struct blkback_pagemap *entry;
+
+	BUG_ON(!blkback_pagemap);
+	BUG_ON(idx >= blkback_pagemap_size);
+
+	set_page_private(page, idx);
+
+	entry = blkback_pagemap + idx;
+	if (!blkback_pagemap_entry_clear(entry)) {
+		printk(KERN_ERR "overwriting pagemap %d: d %u b %u g %u\n",
+		       idx, entry->domid, entry->busid, entry->gref);
+		BUG();
+	}
+
+	entry->page  = page;
+	entry->domid = domid;
+	entry->busid = busid;
+	entry->gref  = gref;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_set);
+
+void
+blkback_pagemap_clear(struct page *page)
+{
+	int idx;
+	struct blkback_pagemap *entry;
+
+	idx = (int)page_private(page);
+
+	BUG_ON(!blkback_pagemap);
+	BUG_ON(idx >= blkback_pagemap_size);
+
+	entry = blkback_pagemap + idx;
+	if (blkback_pagemap_entry_clear(entry)) {
+		printk(KERN_ERR "clearing empty pagemap %d\n", idx);
+		BUG();
+	}
+
+	memset(entry, 0, sizeof(*entry));
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
+
+struct blkback_pagemap
+blkback_pagemap_read(struct page *page)
+{
+	int idx;
+	struct blkback_pagemap *entry;
+
+	idx = (int)page_private(page);
+
+	BUG_ON(!blkback_pagemap);
+	BUG_ON(idx >= blkback_pagemap_size);
+
+	entry = blkback_pagemap + idx;
+	if (blkback_pagemap_entry_clear(entry)) {
+		printk(KERN_ERR "reading empty pagemap %d\n", idx);
+		BUG();
+	}
+
+	return *entry;
+}
+EXPORT_SYMBOL(blkback_pagemap_read);
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+int
+blkback_pagemap_contains_page(struct page *page)
+{
+	struct blkback_pagemap *entry;
+	int idx = (int)page_private(page);
+
+	if (idx < 0 || idx >= blkback_pagemap_size)
+		return 0;
+
+	entry = blkback_pagemap + idx;
+
+	return (entry->page == page);
+}
+EXPORT_SYMBOL(blkback_pagemap_contains_page);
diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h
new file mode 100644
index 0000000..7f97d15
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.h
@@ -0,0 +1,36 @@
+#ifndef _BLKBACK_PAGEMAP_H_
+#define _BLKBACK_PAGEMAP_H_
+
+#include <linux/mm.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/grant_table.h>
+
+typedef unsigned int busid_t;
+
+struct blkback_pagemap {
+	struct page     *page;
+	domid_t          domid;
+	busid_t          busid;
+	grant_ref_t      gref;
+};
+
+#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
+
+int blkback_pagemap_init(int);
+void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
+void blkback_pagemap_clear(struct page *);
+struct blkback_pagemap blkback_pagemap_read(struct page *);
+int blkback_pagemap_contains_page(struct page *page);
+
+#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
+
+static inline int blkback_pagemap_init(int pages) { return 0; }
+static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
+				       busid_t bus, grant_ref_t gnt) {}
+static inline void blkback_pagemap_clear(struct page *page) {}
+#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; })
+static inline int blkback_pagemap_contains_page(struct page *page) { return 0; }
+
+#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
+
+#endif
diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
new file mode 100644
index 0000000..0bef445
--- /dev/null
+++ b/drivers/xen/blkback/blkback.c
@@ -0,0 +1,675 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/blkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ *
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static int log_stats;
+static int debug_lvl;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+	blkif_t       *blkif;
+	u64            id;
+	int            nr_pages;
+	atomic_t       pendcnt;
+	unsigned short operation;
+	int            status;
+	struct list_head free_list;
+} pending_req_t;
+
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static struct page **pending_pages;
+static grant_handle_t *pending_grant_handles;
+
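+/* Index into pending_pages[] of the page backing segment 'seg' of request 'req'. */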
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	unsigned long pfn = page_to_pfn(pending_page(req, seg));
+	return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+#define pending_handle(_req, _seg) \
+	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 struct blkif_request *req,
+				 pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+			  unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	if (!list_empty(&pending_free)) {
+		req = list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+	if (blkif->plug == NULL)
+		return;
+	if (blkif->plug->unplug_fn)
+		blkif->plug->unplug_fn(blkif->plug);
+	blk_put_queue(blkif->plug);
+	blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q == blkif->plug)
+		return;
+	unplug_queue(blkif);
+	blk_get_queue(q);
+	blkif->plug = q;
+}
+
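+/*
+ * Unmap every grant still mapped for this request in a single batched
+ * GNTTABOP_unmap_grant_ref hypercall and clear the pagemap entries.
+ */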
+static void fast_flush_area(pending_req_t *req)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int i, invcount = 0;
+	grant_handle_t handle;
+	int ret;
+
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
+		if (handle == BLKBACK_INVALID_HANDLE)
+			continue;
+		blkback_pagemap_clear(pending_page(req, i));
+		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+				    GNTMAP_host_map, handle);
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		invcount++;
+	}
+
+	ret = HYPERVISOR_grant_table_op(
+		GNTTABOP_unmap_grant_ref, unmap, invcount);
+	BUG_ON(ret);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d\n",
+	       current->comm, blkif->st_oo_req,
+	       blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
+	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	blkif->st_rd_req = 0;
+	blkif->st_wr_req = 0;
+	blkif->st_oo_req = 0;
+}
+
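+/*
+ * Per-device kernel thread: wait until the frontend has queued requests and
+ * a free pending_req slot is available, then consume and dispatch ring
+ * requests until asked to stop.
+ */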
+int blkif_schedule(void *arg)
+{
+	blkif_t *blkif = arg;
+	struct vbd *vbd = &blkif->vbd;
+
+	blkif_get(blkif);
+
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: started\n", current->comm);
+
+	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+		if (unlikely(vbd->size != vbd_size(vbd)))
+			vbd_resize(blkif);
+
+		wait_event_interruptible(
+			blkif->wq,
+			blkif->waiting_reqs || kthread_should_stop());
+		wait_event_interruptible(
+			pending_free_wq,
+			!list_empty(&pending_free) || kthread_should_stop());
+
+		blkif->waiting_reqs = 0;
+		smp_mb(); /* clear flag *before* checking for work */
+
+		if (do_block_io_op(blkif))
+			blkif->waiting_reqs = 1;
+		unplug_queue(blkif);
+
+		if (log_stats && time_after(jiffies, blkif->st_print))
+			print_stats(blkif);
+	}
+
+	if (log_stats)
+		print_stats(blkif);
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+	blkif->xenblkd = NULL;
+	blkif_put(blkif);
+
+	return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int error)
+{
+	/* An error fails the entire request. */
+	if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+	    (error == -EOPNOTSUPP)) {
+		DPRINTK("blkback: write barrier op failed, not supported\n");
+		blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (error) {
+		DPRINTK("Buffer not up-to-date at end of operation, "
+			"error=%d\n", error);
+		pending_req->status = BLKIF_RSP_ERROR;
+	}
+
+	if (atomic_dec_and_test(&pending_req->pendcnt)) {
+		fast_flush_area(pending_req);
+		make_response(pending_req->blkif, pending_req->id,
+			      pending_req->operation, pending_req->status);
+		blkif_put(pending_req->blkif);
+		free_req(pending_req);
+	}
+}
+
+static void end_block_io_op(struct bio *bio, int error)
+{
+	__end_block_io_op(bio->bi_private, error);
+	bio_put(bio);
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+	blkif->waiting_reqs = 1;
+	wake_up(&blkif->wq);
+}
+
+irqreturn_t blkif_be_int(int irq, void *dev_id)
+{
+	blkif_notify_work(dev_id);
+	return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif)
+{
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	struct blkif_request req;
+	pending_req_t *pending_req;
+	RING_IDX rc, rp;
+	int more_to_do = 0;
+
+	rc = blk_rings->common.req_cons;
+	rp = blk_rings->common.sring->req_prod;
+	rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+	while (rc != rp) {
+
+		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+			break;
+
+		if (kthread_should_stop()) {
+			more_to_do = 1;
+			break;
+		}
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
+			more_to_do = 1;
+			break;
+		}
+
+		switch (blkif->blk_protocol) {
+		case BLKIF_PROTOCOL_NATIVE:
+			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+			break;
+		case BLKIF_PROTOCOL_X86_64:
+			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+			break;
+		default:
+			BUG();
+		}
+		blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+		/* Apply all sanity checks to /private copy/ of request. */
+		barrier();
+
+		switch (req.operation) {
+		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			dispatch_rw_block_io(blkif, &req, pending_req);
+			break;
+		case BLKIF_OP_WRITE_BARRIER:
+			blkif->st_br_req++;
+			/* fall through */
+		case BLKIF_OP_WRITE:
+			blkif->st_wr_req++;
+			dispatch_rw_block_io(blkif, &req, pending_req);
+			break;
+		default:
+			/* A good sign something is wrong: sleep for a while to
+			 * avoid excessive CPU consumption by a bad guest. */
+			msleep(1);
+			DPRINTK("error: unknown block io operation [%d]\n",
+				req.operation);
+			make_response(blkif, req.id, req.operation,
+				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
+			break;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+
+	return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 struct blkif_request *req,
+				 pending_req_t *pending_req)
+{
+	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct phys_req preq;
+	struct {
+		unsigned long buf; unsigned int nsec;
+	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int nseg;
+	struct bio *bio = NULL;
+	int ret, i;
+	int operation;
+
+	switch (req->operation) {
+	case BLKIF_OP_READ:
+		operation = READ;
+		break;
+	case BLKIF_OP_WRITE:
+		operation = WRITE;
+		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		operation = WRITE_BARRIER;
+		break;
+	default:
+		operation = 0; /* make gcc happy */
+		BUG();
+	}
+
+	/* Check that number of segments is sane. */
+	nseg = req->nr_segments;
+	if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
+	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		DPRINTK("Bad number of segments in request (%d)\n", nseg);
+		goto fail_response;
+	}
+
+	preq.dev           = req->handle;
+	preq.sector_number = req->sector_number;
+	preq.nr_sects      = 0;
+
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->id;
+	pending_req->operation = req->operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
+	for (i = 0; i < nseg; i++) {
+		uint32_t flags;
+
+		seg[i].nsec = req->seg[i].last_sect -
+			req->seg[i].first_sect + 1;
+
+		if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+		    (req->seg[i].last_sect < req->seg[i].first_sect))
+			goto fail_response;
+		preq.nr_sects += seg[i].nsec;
+
+		flags = GNTMAP_host_map;
+		if (operation != READ)
+			flags |= GNTMAP_readonly;
+		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+				  req->seg[i].gref, blkif->domid);
+	}
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+	BUG_ON(ret);
+
+	for (i = 0; i < nseg; i++) {
+		if (unlikely(map[i].status != 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			map[i].handle = BLKBACK_INVALID_HANDLE;
+			ret |= 1;
+			continue;
+		}
+
+		set_phys_to_machine(
+			page_to_pfn(pending_page(pending_req, i)),
+			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+		seg[i].buf  = map[i].dev_bus_addr |
+			(req->seg[i].first_sect << 9);
+		blkback_pagemap_set(vaddr_pagenr(pending_req, i),
+				    pending_page(pending_req, i),
+				    blkif->domid, req->handle,
+				    req->seg[i].gref);
+		pending_handle(pending_req, i) = map[i].handle;
+	}
+
+	if (ret)
+		goto fail_flush;
+
+	if (vbd_translate(&preq, blkif, operation) != 0) {
+		DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+			operation == READ ? "read" : "write",
+			preq.sector_number,
+			preq.sector_number + preq.nr_sects, preq.dev);
+		goto fail_flush;
+	}
+
+	plug_queue(blkif, preq.bdev);
+	atomic_set(&pending_req->pendcnt, 1);
+	blkif_get(blkif);
+
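+	/*
+	 * Build as few bios as possible: keep adding segments to the current
+	 * bio and only submit it (and allocate a fresh one) when
+	 * bio_add_page() refuses to accept another page.
+	 */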
+	for (i = 0; i < nseg; i++) {
+		if (((int)preq.sector_number|(int)seg[i].nsec) &
+		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+			DPRINTK("Misaligned I/O request from domain %d",
+				blkif->domid);
+			goto fail_put_bio;
+		}
+
+		while ((bio == NULL) ||
+		       (bio_add_page(bio,
+				     pending_page(pending_req, i),
+				     seg[i].nsec << 9,
+				     seg[i].buf & ~PAGE_MASK) == 0)) {
+			if (bio) {
+				atomic_inc(&pending_req->pendcnt);
+				submit_bio(operation, bio);
+			}
+
+			bio = bio_alloc(GFP_KERNEL, nseg-i);
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
+
+			bio->bi_bdev    = preq.bdev;
+			bio->bi_private = pending_req;
+			bio->bi_end_io  = end_block_io_op;
+			bio->bi_sector  = preq.sector_number;
+		}
+
+		preq.sector_number += seg[i].nsec;
+	}
+
+	if (!bio) {
+		BUG_ON(operation != WRITE_BARRIER);
+		bio = bio_alloc(GFP_KERNEL, 0);
+		if (unlikely(bio == NULL))
+			goto fail_put_bio;
+
+		bio->bi_bdev    = preq.bdev;
+		bio->bi_private = pending_req;
+		bio->bi_end_io  = end_block_io_op;
+		bio->bi_sector  = -1;
+	}
+
+	submit_bio(operation, bio);
+
+	if (operation == READ)
+		blkif->st_rd_sect += preq.nr_sects;
+	else if (operation == WRITE || operation == WRITE_BARRIER)
+		blkif->st_wr_sect += preq.nr_sects;
+
+	return;
+
+ fail_flush:
+	fast_flush_area(pending_req);
+ fail_response:
+	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	free_req(pending_req);
+	msleep(1); /* back off a bit */
+	return;
+
+ fail_put_bio:
+	__end_block_io_op(pending_req, -EINVAL);
+	if (bio)
+		bio_put(bio);
+	unplug_queue(blkif);
+	msleep(1); /* back off a bit */
+	return;
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, u64 id,
+			  unsigned short op, int st)
+{
+	struct blkif_response  resp;
+	unsigned long     flags;
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	int more_to_do = 0;
+	int notify;
+
+	resp.id        = id;
+	resp.operation = op;
+	resp.status    = st;
+
+	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+	/* Place on the response ring for the relevant domain. */
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	case BLKIF_PROTOCOL_X86_32:
+		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	case BLKIF_PROTOCOL_X86_64:
+		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	default:
+		BUG();
+	}
+	blk_rings->common.rsp_prod_pvt++;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+	if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+		/*
+		 * Tail check for pending requests. Allows frontend to avoid
+		 * notifications if requests are already in flight (lower
+		 * overheads and promotes batching).
+		 */
+		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+		more_to_do = 1;
+	}
+
+	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+	if (more_to_do)
+		blkif_notify_work(blkif);
+	if (notify)
+		notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+	int i, mmap_pages;
+	int rc = 0;
+
+	if (!xen_pv_domain())
+		return -ENODEV;
+
+	mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+					blkif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
+
+	rc = blkback_pagemap_init(mmap_pages);
+	if (rc)
+		goto out_of_memory;
+
+	if (!pending_reqs || !pending_grant_handles || !pending_pages) {
+		rc = -ENOMEM;
+		goto out_of_memory;
+	}
+
+	for (i = 0; i < mmap_pages; i++)
+		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+
+	rc = blkif_interface_init();
+	if (rc)
+		goto failed_init;
+
+	memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+	rc = blkif_xenbus_init();
+	if (rc)
+		goto failed_init;
+
+	return 0;
+
+ out_of_memory:
+	printk(KERN_ERR "%s: out of memory\n", __func__);
+ failed_init:
+	kfree(pending_reqs);
+	kfree(pending_grant_handles);
+	free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+	return rc;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
new file mode 100644
index 0000000..531ba81
--- /dev/null
+++ b/drivers/xen/blkback/common.h
@@ -0,0 +1,143 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include "blkback-pagemap.h"
+
+
+#define DPRINTK(_f, _a...)			\
+	pr_debug("(file=%s, line=%d) " _f,	\
+		 __FILE__ , __LINE__ , ## _a )
+
+struct vbd {
+	blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
+	unsigned char  readonly;    /* Non-zero -> read-only */
+	unsigned char  type;        /* VDISK_xxx */
+	u32            pdevice;     /* phys device that this vbd maps to */
+	struct block_device *bdev;
+	sector_t       size;        /* Cached size parameter */
+};
+
+struct backend_info;
+
+typedef struct blkif_st {
+	/* Unique identifier for this interface. */
+	domid_t           domid;
+	unsigned int      handle;
+	/* Physical parameters of the comms window. */
+	unsigned int      irq;
+	/* Comms information. */
+	enum blkif_protocol blk_protocol;
+	union blkif_back_rings blk_rings;
+	struct vm_struct *blk_ring_area;
+	/* The VBD attached to this interface. */
+	struct vbd        vbd;
+	/* Back pointer to the backend_info. */
+	struct backend_info *be;
+	/* Private fields. */
+	spinlock_t       blk_ring_lock;
+	atomic_t         refcnt;
+
+	wait_queue_head_t   wq;
+	struct task_struct  *xenblkd;
+	unsigned int        waiting_reqs;
+	struct request_queue     *plug;
+
+	/* statistics */
+	unsigned long       st_print;
+	int                 st_rd_req;
+	int                 st_wr_req;
+	int                 st_oo_req;
+	int                 st_br_req;
+	int                 st_rd_sect;
+	int                 st_wr_sect;
+
+	wait_queue_head_t waiting_to_free;
+
+	grant_handle_t shmem_handle;
+	grant_ref_t    shmem_ref;
+} blkif_t;
+
+blkif_t *blkif_alloc(domid_t domid);
+void blkif_disconnect(blkif_t *blkif);
+void blkif_free(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+void vbd_resize(blkif_t *blkif);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)					\
+	do {						\
+		if (atomic_dec_and_test(&(_b)->refcnt))	\
+			wake_up(&(_b)->waiting_to_free);\
+	} while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
+	       unsigned minor, int readonly, int cdrom);
+void vbd_free(struct vbd *vbd);
+
+unsigned long long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+	unsigned short       dev;
+	unsigned short       nr_sects;
+	struct block_device *bdev;
+	blkif_sector_t       sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+int blkif_interface_init(void);
+
+int blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id);
+int blkif_schedule(void *arg);
+
+int blkback_barrier(struct xenbus_transaction xbt,
+		    struct backend_info *be, int state);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
new file mode 100644
index 0000000..e397a41
--- /dev/null
+++ b/drivers/xen/blkback/interface.c
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <linux/kthread.h>
+
+static struct kmem_cache *blkif_cachep;
+
+blkif_t *blkif_alloc(domid_t domid)
+{
+	blkif_t *blkif;
+
+	blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+	if (!blkif)
+		return ERR_PTR(-ENOMEM);
+
+	memset(blkif, 0, sizeof(*blkif));
+	blkif->domid = domid;
+	spin_lock_init(&blkif->blk_ring_lock);
+	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
+	init_waitqueue_head(&blkif->waiting_to_free);
+
+	return blkif;
+}
+
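+/*
+ * Map the single grant reference carrying the frontend's shared ring
+ * page into the backend's blk_ring_area.
+ */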
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			  GNTMAP_host_map, shared_page, blkif->domid);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+		BUG();
+
+	if (op.status) {
+		DPRINTK(" Grant table operation failure !\n");
+		return op.status;
+	}
+
+	blkif->shmem_ref = shared_page;
+	blkif->shmem_handle = op.handle;
+
+	return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			    GNTMAP_host_map, blkif->shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+	int err;
+
+	/* Already connected through? */
+	if (blkif->irq)
+		return 0;
+
+	if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
+		return -ENOMEM;
+
+	err = map_frontend_page(blkif, shared_page);
+	if (err) {
+		free_vm_area(blkif->blk_ring_area);
+		return err;
+	}
+
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+	{
+		struct blkif_sring *sring;
+		sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_32:
+	{
+		struct blkif_x86_32_sring *sring_x86_32;
+		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_64:
+	{
+		struct blkif_x86_64_sring *sring_x86_64;
+		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+		break;
+	}
+	default:
+		BUG();
+	}
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+		blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+	if (err < 0)
+	{
+		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+		return err;
+	}
+	blkif->irq = err;
+
+	return 0;
+}
+
+void blkif_disconnect(blkif_t *blkif)
+{
+	if (blkif->xenblkd) {
+		kthread_stop(blkif->xenblkd);
+		blkif->xenblkd = NULL;
+	}
+
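+	/*
+	 * Drop the reference taken at allocation time, wait for all
+	 * outstanding requests to release theirs, then retake it so the
+	 * structure stays valid until blkif_free().
+	 */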
+	atomic_dec(&blkif->refcnt);
+	wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+	atomic_inc(&blkif->refcnt);
+
+	if (blkif->irq) {
+		unbind_from_irqhandler(blkif->irq, blkif);
+		blkif->irq = 0;
+	}
+
+	if (blkif->blk_rings.common.sring) {
+		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+	}
+}
+
+void blkif_free(blkif_t *blkif)
+{
+	if (!atomic_dec_and_test(&blkif->refcnt))
+		BUG();
+	kmem_cache_free(blkif_cachep, blkif);
+}
+
+int __init blkif_interface_init(void)
+{
+	blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+					 0, 0, NULL);
+	if (!blkif_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
new file mode 100644
index 0000000..6e0abcc
--- /dev/null
+++ b/drivers/xen/blkback/vbd.c
@@ -0,0 +1,167 @@
+/******************************************************************************
+ * blkback/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v)   ((_v)->bdev->bd_part ?				\
+		      (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+	return vbd_sz(vbd);
+}
+
+unsigned int vbd_info(struct vbd *vbd)
+{
+	return vbd->type | (vbd->readonly ? VDISK_READONLY : 0);
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+	return bdev_logical_block_size(vbd->bdev);
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+	       unsigned minor, int readonly, int cdrom)
+{
+	struct vbd *vbd;
+	struct block_device *bdev;
+
+	vbd = &blkif->vbd;
+	vbd->handle   = handle;
+	vbd->readonly = readonly;
+	vbd->type     = 0;
+
+	vbd->pdevice  = MKDEV(major, minor);
+
+	bdev = open_by_devnum(vbd->pdevice,
+			      vbd->readonly ? FMODE_READ : FMODE_WRITE);
+
+	if (IS_ERR(bdev)) {
+		DPRINTK("vbd_creat: device %08x could not be opened.\n",
+			vbd->pdevice);
+		return -ENOENT;
+	}
+
+	vbd->bdev = bdev;
+	vbd->size = vbd_size(vbd);
+
+	if (vbd->bdev->bd_disk == NULL) {
+		DPRINTK("vbd_creat: device %08x doesn't exist.\n",
+			vbd->pdevice);
+		vbd_free(vbd);
+		return -ENOENT;
+	}
+
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+		vbd->type |= VDISK_CDROM;
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+		vbd->type |= VDISK_REMOVABLE;
+
+	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+		handle, blkif->domid);
+	return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+	if (vbd->bdev)
+		blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
+	vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+	struct vbd *vbd = &blkif->vbd;
+	int rc = -EACCES;
+
+	if ((operation != READ) && vbd->readonly)
+		goto out;
+
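+	/* Reject requests whose sector range wraps around or runs past
+	 * the end of the device. */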
+	if (likely(req->nr_sects)) {
+		blkif_sector_t end = req->sector_number + req->nr_sects;
+
+		if (unlikely(end < req->sector_number))
+			goto out;
+		if (unlikely(end > vbd_sz(vbd)))
+			goto out;
+	}
+
+	req->dev  = vbd->pdevice;
+	req->bdev = vbd->bdev;
+	rc = 0;
+
+ out:
+	return rc;
+}
+
+void vbd_resize(blkif_t *blkif)
+{
+	struct vbd *vbd = &blkif->vbd;
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = blkback_xenbus(blkif->be);
+	unsigned long long new_size = vbd_size(vbd);
+
+	printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
+	vbd->size = new_size;
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		printk(KERN_WARNING "Error starting transaction");
+		return;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
+			    vbd_size(vbd));
+	if (err) {
+		printk(KERN_WARNING "Error writing new size");
+		goto abort;
+	}
+	/*
+	 * Write the current state; we will use this to synchronize
+	 * the front-end. If the current state is "connected" the
+	 * front-end will get the new size information online.
+	 */
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+	if (err) {
+		printk(KERN_WARNING "Error writing the state");
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		printk(KERN_WARNING "Error ending transaction");
+abort:
+	xenbus_transaction_end(xbt, 1);
+}
diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
new file mode 100644
index 0000000..a0534fc
--- /dev/null
+++ b/drivers/xen/blkback/xenbus.c
@@ -0,0 +1,553 @@
+/*  Xenbus code for blkif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include "common.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...)				\
+	pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",	\
+		 __FUNCTION__, __LINE__, ##args)
+
+struct backend_info
+{
+	struct xenbus_device *dev;
+	blkif_t *blkif;
+	struct xenbus_watch backend_watch;
+	unsigned major;
+	unsigned minor;
+	char *mode;
+};
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+			    unsigned int);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be)
+{
+	return be->dev;
+}
+
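+/*
+ * Build the name used for the per-device xenblkd kernel thread,
+ * "blkback.<domid>.<device>", truncated to TASK_COMM_LEN.
+ */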
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+	char *devpath, *devname;
+	struct xenbus_device *dev = blkif->be->dev;
+
+	devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+	if (IS_ERR(devpath))
+		return PTR_ERR(devpath);
+
+	if ((devname = strstr(devpath, "/dev/")) != NULL)
+		devname += strlen("/dev/");
+	else
+		devname  = devpath;
+
+	snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+	kfree(devpath);
+
+	return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{
+	int err;
+	char name[TASK_COMM_LEN];
+
+	/* Not ready to connect? */
+	if (!blkif->irq || !blkif->vbd.bdev)
+		return;
+
+	/* Already connected? */
+	if (blkif->be->dev->state == XenbusStateConnected)
+		return;
+
+	/* Attempt to connect: exit if we fail to. */
+	connect(blkif->be);
+	if (blkif->be->dev->state != XenbusStateConnected)
+		return;
+
+	err = blkback_name(blkif, name);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+		return;
+	}
+
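+	/* Flush and invalidate dom0's page cache for the device so the
+	 * backend serves requests against current on-disk data. */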
+	err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "block flush");
+		return;
+	}
+	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+	blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+	if (IS_ERR(blkif->xenblkd)) {
+		err = PTR_ERR(blkif->xenblkd);
+		blkif->xenblkd = NULL;
+		xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+	}
+}
+
+
+/****************************************************************
+ *  sysfs interface for VBD I/O requests
+ */
+
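+/* Generate a read-only sysfs attribute that formats one field of the
+ * backend's statistics or configuration. */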
+#define VBD_SHOW(name, format, args...)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+									\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(br_req,  "%d\n", be->blkif->st_br_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *vbdstat_attrs[] = {
+	&dev_attr_oo_req.attr,
+	&dev_attr_rd_req.attr,
+	&dev_attr_wr_req.attr,
+	&dev_attr_br_req.attr,
+	&dev_attr_rd_sect.attr,
+	&dev_attr_wr_sect.attr,
+	NULL
+};
+
+static struct attribute_group vbdstat_group = {
+	.name = "statistics",
+	.attrs = vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+	int error;
+
+	error = device_create_file(&dev->dev, &dev_attr_physical_device);
+	if (error)
+		goto fail1;
+
+	error = device_create_file(&dev->dev, &dev_attr_mode);
+	if (error)
+		goto fail2;
+
+	error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
+	if (error)
+		goto fail3;
+
+	return 0;
+
+fail3:	sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+fail2:	device_remove_file(&dev->dev, &dev_attr_mode);
+fail1:	device_remove_file(&dev->dev, &dev_attr_physical_device);
+	return error;
+}
+
+void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+	sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+	device_remove_file(&dev->dev, &dev_attr_mode);
+	device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	DPRINTK("");
+
+	if (be->major || be->minor)
+		xenvbd_sysfs_delif(dev);
+
+	if (be->backend_watch.node) {
+		unregister_xenbus_watch(&be->backend_watch);
+		kfree(be->backend_watch.node);
+		be->backend_watch.node = NULL;
+	}
+
+	if (be->blkif) {
+		blkif_disconnect(be->blkif);
+		vbd_free(&be->blkif->vbd);
+		blkif_free(be->blkif);
+		be->blkif = NULL;
+	}
+
+	kfree(be);
+	dev_set_drvdata(&dev->dev, NULL);
+	return 0;
+}
+
+int blkback_barrier(struct xenbus_transaction xbt,
+		    struct backend_info *be, int state)
+{
+	struct xenbus_device *dev = be->dev;
+	int err;
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+			    "%d", state);
+	if (err)
+		xenbus_dev_fatal(dev, err, "writing feature-barrier");
+
+	return err;
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers.  Switch to InitWait.
+ */
+static int blkback_probe(struct xenbus_device *dev,
+			 const struct xenbus_device_id *id)
+{
+	int err;
+	struct backend_info *be = kzalloc(sizeof(struct backend_info),
+					  GFP_KERNEL);
+	if (!be) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "allocating backend structure");
+		return -ENOMEM;
+	}
+	be->dev = dev;
+	dev_set_drvdata(&dev->dev, be);
+
+	be->blkif = blkif_alloc(dev->otherend_id);
+	if (IS_ERR(be->blkif)) {
+		err = PTR_ERR(be->blkif);
+		be->blkif = NULL;
+		xenbus_dev_fatal(dev, err, "creating block interface");
+		goto fail;
+	}
+
+	/* setup back pointer */
+	be->blkif->be = be;
+
+	err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
+				   "%s/%s", dev->nodename, "physical-device");
+	if (err)
+		goto fail;
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	DPRINTK("failed");
+	blkback_remove(dev);
+	return err;
+}
+
+
+/**
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node.  Read it and the mode node, and create a vbd.  If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	int err;
+	unsigned major;
+	unsigned minor;
+	struct backend_info *be
+		= container_of(watch, struct backend_info, backend_watch);
+	struct xenbus_device *dev = be->dev;
+	int cdrom = 0;
+	char *device_type;
+
+	DPRINTK("");
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+			   &major, &minor);
+	if (XENBUS_EXIST_ERR(err)) {
+		/* Since this watch will fire once immediately after it is
+		   registered, we expect this.  Ignore it, and wait for the
+		   hotplug scripts. */
+		return;
+	}
+	if (err != 2) {
+		xenbus_dev_fatal(dev, err, "reading physical-device");
+		return;
+	}
+
+	if ((be->major || be->minor) &&
+	    ((be->major != major) || (be->minor != minor))) {
+		printk(KERN_WARNING
+		       "blkback: changing physical device (from %x:%x to "
+		       "%x:%x) not supported.\n", be->major, be->minor,
+		       major, minor);
+		return;
+	}
+
+	be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+	if (IS_ERR(be->mode)) {
+		err = PTR_ERR(be->mode);
+		be->mode = NULL;
+		xenbus_dev_fatal(dev, err, "reading mode");
+		return;
+	}
+
+	device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+	if (!IS_ERR(device_type)) {
+		cdrom = strcmp(device_type, "cdrom") == 0;
+		kfree(device_type);
+	}
+
+	if (be->major == 0 && be->minor == 0) {
+		/* Front end dir is a number, which is used as the handle. */
+
+		char *p = strrchr(dev->otherend, '/') + 1;
+		long handle = simple_strtoul(p, NULL, 0);
+
+		be->major = major;
+		be->minor = minor;
+
+		err = vbd_create(be->blkif, handle, major, minor,
+				 (NULL == strchr(be->mode, 'w')), cdrom);
+		if (err) {
+			be->major = be->minor = 0;
+			xenbus_dev_fatal(dev, err, "creating vbd structure");
+			return;
+		}
+
+		err = xenvbd_sysfs_addif(dev);
+		if (err) {
+			vbd_free(&be->blkif->vbd);
+			be->major = be->minor = 0;
+			xenbus_dev_fatal(dev, err, "creating sysfs entries");
+			return;
+		}
+
+		/* We're potentially connected now */
+		update_blkif_status(be->blkif);
+	}
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+	int err;
+
+	DPRINTK("%s", xenbus_strstate(frontend_state));
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __FUNCTION__, dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		/* Ensure we connect even when two watches fire in
+		   close succession and we miss the intermediate value
+		   of frontend_state. */
+		if (dev->state == XenbusStateConnected)
+			break;
+
+		err = connect_ring(be);
+		if (err)
+			break;
+		update_blkif_status(be->blkif);
+		break;
+
+	case XenbusStateClosing:
+		blkif_disconnect(be->blkif);
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = be->dev;
+
+	DPRINTK("%s", dev->otherend);
+
+	/* Supply the information about the device the frontend needs */
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		return;
+	}
+
+	err = blkback_barrier(xbt, be, 1);
+	if (err)
+		goto abort;
+
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    vbd_size(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sectors",
+				 dev->nodename);
+		goto abort;
+	}
+
+	/* FIXME: use a typename instead */
+	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+			    vbd_info(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/info",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+			    vbd_secsize(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+				 dev->nodename);
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		xenbus_dev_fatal(dev, err, "ending transaction");
+
+	err = xenbus_switch_state(dev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(dev, err, "switching to Connected state",
+				 dev->nodename);
+
+	return;
+ abort:
+	xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	unsigned long ring_ref;
+	unsigned int evtchn;
+	char protocol[64] = "";
+	int err;
+
+	DPRINTK("%s", dev->otherend);
+
+	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
+			    "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+			    "%63s", protocol, NULL);
+	if (err)
+		strcpy(protocol, "unspecified, assuming native");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+	else {
+		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+		return -1;
+	}
+	printk(KERN_INFO
+	       "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
+	       ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+	/* Map the shared frame, irq etc. */
+	err = blkif_map(be->blkif, ring_ref, evtchn);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+				 ring_ref, evtchn);
+		return err;
+	}
+
+	return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkback_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+
+static struct xenbus_driver blkback = {
+	.name = "vbd",
+	.owner = THIS_MODULE,
+	.ids = blkback_ids,
+	.probe = blkback_probe,
+	.remove = blkback_remove,
+	.otherend_changed = frontend_changed
+};
+
+
+int blkif_xenbus_init(void)
+{
+	return xenbus_register_backend(&blkback);
+}
diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
new file mode 100644
index 0000000..822b4e4
--- /dev/null
+++ b/drivers/xen/blktap/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+
+blktap-objs := control.o ring.o device.o request.o sysfs.o
diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
new file mode 100644
index 0000000..fe63fc9
--- /dev/null
+++ b/drivers/xen/blktap/blktap.h
@@ -0,0 +1,209 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+
+extern int blktap_debug_level;
+extern int blktap_ring_major;
+extern int blktap_device_major;
+
+#define BTPRINTK(level, tag, force, _f, _a...)				\
+	do {								\
+		if (blktap_debug_level > level &&			\
+		    (force || printk_ratelimit()))			\
+			printk(tag "%s: " _f, __func__, ##_a);		\
+	} while (0)
+
+#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define MAX_BLKTAP_DEVICE            1024
+
+#define BLKTAP_DEVICE                4
+#define BLKTAP_DEVICE_CLOSED         5
+#define BLKTAP_SHUTDOWN_REQUESTED    8
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE        1
+#define BLKTAP2_IOCTL_ALLOC_TAP      200
+#define BLKTAP2_IOCTL_FREE_TAP       201
+#define BLKTAP2_IOCTL_CREATE_DEVICE  202
+#define BLKTAP2_IOCTL_REMOVE_DEVICE  207
+
+#define BLKTAP2_MAX_MESSAGE_LEN      256
+
+#define BLKTAP2_RING_MESSAGE_CLOSE   3
+
+#define BLKTAP_REQUEST_FREE          0
+#define BLKTAP_REQUEST_PENDING       1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE		__RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM		BLK_RING_SIZE
+#define MAX_PENDING_REQS	BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
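+/* User-space virtual address of segment _seg of request slot _req,
+ * relative to the mapping that starts at _start. */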
+#define MMAP_VADDR(_start, _req, _seg)					\
+        (_start +                                                       \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+
+struct grant_handle_pair {
+	grant_handle_t                 kernel;
+	grant_handle_t                 user;
+};
+#define INVALID_GRANT_HANDLE           0xFFFF
+
+struct blktap_handle {
+	unsigned int                   ring;
+	unsigned int                   device;
+	unsigned int                   minor;
+};
+
+struct blktap_params {
+	char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+	unsigned long long             capacity;
+	unsigned long                  sector_size;
+};
+
+struct blktap_device {
+	spinlock_t                     lock;
+	struct gendisk                *gd;
+};
+
+struct blktap_ring {
+	struct task_struct            *task;
+
+	struct vm_area_struct         *vma;
+	struct blkif_front_ring        ring;
+	unsigned long                  ring_vstart;
+	unsigned long                  user_vstart;
+
+	int                            n_pending;
+	struct blktap_request         *pending[MAX_PENDING_REQS];
+
+	wait_queue_head_t              poll_wait;
+
+	dev_t                          devno;
+	struct device                 *dev;
+};
+
+struct blktap_statistics {
+	unsigned long                  st_print;
+	int                            st_rd_req;
+	int                            st_wr_req;
+	int                            st_oo_req;
+	int                            st_rd_sect;
+	int                            st_wr_sect;
+	s64                            st_rd_cnt;
+	s64                            st_rd_sum_usecs;
+	s64                            st_rd_max_usecs;
+	s64                            st_wr_cnt;
+	s64                            st_wr_sum_usecs;
+	s64                            st_wr_max_usecs;	
+};
+
+struct blktap_request {
+	struct blktap                 *tap;
+	struct request                *rq;
+	int                            usr_idx;
+
+	int                            operation;
+	struct timeval                 time;
+
+	struct scatterlist             sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page                   *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	int                            nr_pages;
+};
+
+#define blktap_for_each_sg(_sg, _req, _i)	\
+	for (_sg = (_req)->sg_table, _i = 0;	\
+	     _i < (_req)->nr_pages;		\
+	     (_sg)++, (_i)++)
+
+struct blktap {
+	int                            minor;
+	unsigned long                  dev_inuse;
+
+	struct blktap_ring             ring;
+	struct blktap_device           device;
+	struct blktap_page_pool       *pool;
+
+	wait_queue_head_t              remove_wait;
+	struct work_struct             remove_work;
+	char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+
+	struct blktap_statistics       stats;
+};
+
+struct blktap_page_pool {
+	struct mempool_s              *bufs;
+	spinlock_t                     lock;
+	struct kobject                 kobj;
+	wait_queue_head_t              wait;
+};
+
+extern struct mutex blktap_lock;
+extern struct blktap **blktaps;
+extern int blktap_max_minor;
+
+int blktap_control_destroy_tap(struct blktap *);
+size_t blktap_control_debug(struct blktap *, char *, size_t);
+
+int blktap_ring_init(void);
+void blktap_ring_exit(void);
+size_t blktap_ring_debug(struct blktap *, char *, size_t);
+int blktap_ring_create(struct blktap *);
+int blktap_ring_destroy(struct blktap *);
+struct blktap_request *blktap_ring_make_request(struct blktap *);
+void blktap_ring_free_request(struct blktap *,struct blktap_request *);
+void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
+int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int);
+int blktap_ring_map_request(struct blktap *, struct blktap_request *);
+void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
+void blktap_ring_set_message(struct blktap *, int);
+void blktap_ring_kick_user(struct blktap *);
+
+int blktap_sysfs_init(void);
+void blktap_sysfs_exit(void);
+int blktap_sysfs_create(struct blktap *);
+void blktap_sysfs_destroy(struct blktap *);
+
+int blktap_device_init(void);
+void blktap_device_exit(void);
+size_t blktap_device_debug(struct blktap *, char *, size_t);
+int blktap_device_create(struct blktap *, struct blktap_params *);
+int blktap_device_destroy(struct blktap *);
+void blktap_device_destroy_sync(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
+void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
+
+int blktap_page_pool_init(struct kobject *);
+void blktap_page_pool_exit(void);
+struct blktap_page_pool *blktap_page_pool_get(const char *);
+
+size_t blktap_request_debug(struct blktap *, char *, size_t);
+struct blktap_request *blktap_request_alloc(struct blktap *);
+int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
+
+
+#endif
diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
new file mode 100644
index 0000000..f339bba
--- /dev/null
+++ b/drivers/xen/blktap/control.c
@@ -0,0 +1,315 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+DEFINE_MUTEX(blktap_lock);
+
+struct blktap **blktaps;
+int blktap_max_minor;
+static struct blktap_page_pool *default_pool;
+
+static struct blktap *
+blktap_control_get_minor(void)
+{
+	int minor;
+	struct blktap *tap;
+
+	tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+	if (unlikely(!tap))
+		return NULL;
+
+	mutex_lock(&blktap_lock);
+
+	for (minor = 0; minor < blktap_max_minor; minor++)
+		if (!blktaps[minor])
+			break;
+
+	if (minor == MAX_BLKTAP_DEVICE)
+		goto fail;
+
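+	/* All current slots are in use: double the minor table, capped
+	 * at MAX_BLKTAP_DEVICE. */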
+	if (minor == blktap_max_minor) {
+		void *p;
+		int n;
+
+		n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE);
+		p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
+		if (!p)
+			goto fail;
+
+		blktaps          = p;
+		minor            = blktap_max_minor;
+		blktap_max_minor = n;
+
+		memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
+	}
+
+	tap->minor = minor;
+	blktaps[minor] = tap;
+
+	__module_get(THIS_MODULE);
+out:
+	mutex_unlock(&blktap_lock);
+	return tap;
+
+fail:
+	mutex_unlock(&blktap_lock);
+	kfree(tap);
+	tap = NULL;
+	goto out;
+}
+
+static void
+blktap_control_put_minor(struct blktap* tap)
+{
+	blktaps[tap->minor] = NULL;
+	kfree(tap);
+
+	module_put(THIS_MODULE);
+}
+
+static struct blktap*
+blktap_control_create_tap(void)
+{
+	struct blktap *tap;
+	int err;
+
+	tap = blktap_control_get_minor();
+	if (!tap)
+		return NULL;
+
+	kobject_get(&default_pool->kobj);
+	tap->pool = default_pool;
+
+	err = blktap_ring_create(tap);
+	if (err)
+		goto fail_tap;
+
+	err = blktap_sysfs_create(tap);
+	if (err)
+		goto fail_ring;
+
+	return tap;
+
+fail_ring:
+	blktap_ring_destroy(tap);
+fail_tap:
+	blktap_control_put_minor(tap);
+
+	return NULL;
+}
+
+int
+blktap_control_destroy_tap(struct blktap *tap)
+{
+	int err;
+
+	err = blktap_ring_destroy(tap);
+	if (err)
+		return err;
+
+	kobject_put(&tap->pool->kobj);
+
+	blktap_sysfs_destroy(tap);
+
+	blktap_control_put_minor(tap);
+
+	return 0;
+}
+
+static int
+blktap_control_ioctl(struct inode *inode, struct file *filp,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct blktap *tap;
+
+	switch (cmd) {
+	case BLKTAP2_IOCTL_ALLOC_TAP: {
+		struct blktap_handle h;
+		void __user *ptr = (void __user*)arg;
+
+		tap = blktap_control_create_tap();
+		if (!tap)
+			return -ENOMEM;
+
+		h.ring   = blktap_ring_major;
+		h.device = blktap_device_major;
+		h.minor  = tap->minor;
+
+		if (copy_to_user(ptr, &h, sizeof(h))) {
+			blktap_control_destroy_tap(tap);
+			return -EFAULT;
+		}
+
+		return 0;
+	}
+
+	case BLKTAP2_IOCTL_FREE_TAP: {
+		int minor = arg;
+
+		if (minor > MAX_BLKTAP_DEVICE)
+			return -EINVAL;
+
+		tap = blktaps[minor];
+		if (!tap)
+			return -ENODEV;
+
+		return blktap_control_destroy_tap(tap);
+	}
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static struct file_operations blktap_control_file_operations = {
+	.owner    = THIS_MODULE,
+	.ioctl    = blktap_control_ioctl,
+};
+
+static struct miscdevice blktap_control = {
+	.minor    = MISC_DYNAMIC_MINOR,
+	.name     = "blktap-control",
+	.fops     = &blktap_control_file_operations,
+};
+
+static struct device *control_device;
+
+static ssize_t
+blktap_control_show_default_pool(struct device *device,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	return sprintf(buf, "%s", kobject_name(&default_pool->kobj));
+}
+
+static ssize_t
+blktap_control_store_default_pool(struct device *device,
+				  struct device_attribute *attr,
+				  const char *buf, size_t size)
+{
+	struct blktap_page_pool *pool, *tmp = default_pool;
+
+	pool = blktap_page_pool_get(buf);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+
+	default_pool = pool;
+	kobject_put(&tmp->kobj);
+
+	return size;
+}
+
+static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+		   blktap_control_show_default_pool,
+		   blktap_control_store_default_pool);
+
+size_t
+blktap_control_debug(struct blktap *tap, char *buf, size_t size)
+{
+	char *s = buf, *end = buf + size;
+
+	s += snprintf(s, end - s,
+		      "tap %u:%u name:'%s' flags:%#08lx\n",
+		      MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
+		      tap->name, tap->dev_inuse);
+
+	return s - buf;
+}
+
+static int __init
+blktap_control_init(void)
+{
+	int err;
+
+	err = misc_register(&blktap_control);
+	if (err)
+		return err;
+
+	control_device = blktap_control.this_device;
+
+	blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
+	blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
+	if (!blktaps) {
+		BTERR("failed to allocate blktap minor map");
+		return -ENOMEM;
+	}
+
+	err = blktap_page_pool_init(&control_device->kobj);
+	if (err)
+		return err;
+
+	default_pool = blktap_page_pool_get("default");
+	if (!default_pool)
+		return -ENOMEM;
+
+	err = device_create_file(control_device, &dev_attr_default_pool);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void
+blktap_control_exit(void)
+{
+	if (default_pool) {
+		kobject_put(&default_pool->kobj);
+		default_pool = NULL;
+	}
+
+	blktap_page_pool_exit();
+
+	if (blktaps) {
+		kfree(blktaps);
+		blktaps = NULL;
+	}
+
+	if (control_device) {
+		misc_deregister(&blktap_control);
+		control_device = NULL;
+	}
+}
+
+static void
+blktap_exit(void)
+{
+	blktap_control_exit();
+	blktap_ring_exit();
+	blktap_sysfs_exit();
+	blktap_device_exit();
+}
+
+static int __init
+blktap_init(void)
+{
+	int err;
+
+	err = blktap_device_init();
+	if (err)
+		goto fail;
+
+	err = blktap_ring_init();
+	if (err)
+		goto fail;
+
+	err = blktap_sysfs_init();
+	if (err)
+		goto fail;
+
+	err = blktap_control_init();
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	blktap_exit();
+	return err;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
new file mode 100644
index 0000000..8ff276a
--- /dev/null
+++ b/drivers/xen/blktap/device.c
@@ -0,0 +1,566 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include "blktap.h"
+
+int blktap_device_major;
+
+#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
+
+static int
+blktap_device_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct blktap_device *tapdev = disk->private_data;
+
+	if (!tapdev)
+		return -ENXIO;
+
+	/* NB. tapdisk may have bounced off a bd trylock here. When
+	 * failing for reasons other than !tapdev, make sure to kick
+	 * tapdisk out of its destroy-wait state again. */
+
+	return 0;
+}
+
+static int
+blktap_device_release(struct gendisk *disk, fmode_t mode)
+{
+	struct blktap_device *tapdev = disk->private_data;
+	struct block_device *bdev = bdget_disk(disk, 0);
+	struct blktap *tap = dev_to_blktap(tapdev);
+
+	bdput(bdev);
+
+	if (!bdev->bd_openers) {
+		set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
+		blktap_ring_kick_user(tap);
+	}
+
+	return 0;
+}
+
+static int
+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+	/* We don't have real geometry info, but let's at least return
+	   values consistent with the size of the device */
+	sector_t nsect = get_capacity(bd->bd_disk);
+	sector_t cylinders = nsect;
+
+	hg->heads = 0xff;
+	hg->sectors = 0x3f;
+	sector_div(cylinders, hg->heads * hg->sectors);
+	hg->cylinders = cylinders;
+	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+		hg->cylinders = 0xffff;
+	return 0;
+}
+
+static int
+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
+		    unsigned command, unsigned long argument)
+{
+	int i;
+
+	switch (command) {
+	case CDROMMULTISESSION:
+		BTDBG("FIXME: support multisession CDs later\n");
+		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+			if (put_user(0, (char __user *)(argument + i)))
+				return -EFAULT;
+		return 0;
+
+	case SCSI_IOCTL_GET_IDLUN:
+		if (!access_ok(VERIFY_WRITE, argument, 
+			sizeof(struct scsi_idlun)))
+			return -EFAULT;
+
+		/* return 0 for now. */
+		__put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
+		__put_user(0, 
+			&((struct scsi_idlun __user *)argument)->host_unique_id);
+		return 0;
+
+	default:
+		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+		  command);*/
+		return -EINVAL; /* same return as native Linux */
+	}
+
+	return 0;
+}
+
+static struct block_device_operations blktap_device_file_operations = {
+	.owner     = THIS_MODULE,
+	.open      = blktap_device_open,
+	.release   = blktap_device_release,
+	.ioctl     = blktap_device_ioctl,
+	.getgeo    = blktap_device_getgeo
+};
+
+/* NB. __blktap_* helpers run with the queue lock held; the plain blktap_* variants run unlocked. */
+
+static inline struct request*
+__blktap_next_queued_rq(struct request_queue *q)
+{
+	return blk_peek_request(q);
+}
+
+static inline void
+__blktap_dequeue_rq(struct request *rq)
+{
+	blk_start_request(rq);
+}
+
+/* NB. err == 0 indicates success, failures < 0 */
+
+static inline void
+__blktap_end_queued_rq(struct request *rq, int err)
+{
+	blk_start_request(rq);
+	__blk_end_request(rq, err, blk_rq_bytes(rq));
+}
+
+static inline void
+__blktap_end_rq(struct request *rq, int err)
+{
+	__blk_end_request(rq, err, blk_rq_bytes(rq));
+}
+
+static inline void
+blktap_end_rq(struct request *rq, int err)
+{
+	struct request_queue *q = rq->q;
+
+	spin_lock_irq(q->queue_lock);
+	__blktap_end_rq(rq, err);
+	spin_unlock_irq(q->queue_lock);
+}
+
+void
+blktap_device_end_request(struct blktap *tap,
+			  struct blktap_request *request,
+			  int error)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct request *rq = request->rq;
+
+	blktap_ring_unmap_