Merge commit 'v2.6.32.51' into xen/next-2.6.32
* commit 'v2.6.32.51':
Linux 2.6.32.51
USB: cdc-acm: add IDs for Motorola H24 HSPA USB module.
ext4: avoid hangs in ext4_da_should_update_i_disksize()
oprofile, x86: Fix crash when unloading module (timer mode)
oprofile, x86: Fix nmi-unsafe callgraph support
export __get_user_pages_fast() function
hfs: fix hfs_find_init() sb->ext_tree NULL ptr oops
Make TASKSTATS require root access
jbd/jbd2: validate sb->s_first in journal_get_superblock()
linux/log2.h: Fix rounddown_pow_of_two(1)
xfrm: Fix key lengths for rfc3686(ctr(aes))
percpu: fix chunk range calculation
oprofile: Fix locking dependency in sync_start()
oprofile: Free potentially owned tasks in case of errors
ARM: davinci: dm646x evm: wrong register used in setup_vpif_input_channel_mode
ALSA: hda/realtek - Fix Oops in alc_mux_select()
ALSA: sis7019 - give slow codecs more time to reset
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c840e7d..9238f05 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -113,6 +113,7 @@
More X86-64 boot options can be found in
Documentation/x86/x86_64/boot-options.txt .
X86 Either 32bit or 64bit x86 (same as X86-32+X86-64)
+ XEN Xen support is enabled
In addition, the following text indicates that the option:
@@ -2765,6 +2766,18 @@
xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
xd_geo= See header of drivers/block/xd.c.
+ xen_emul_unplug= [HW,X86,XEN]
+ Unplug Xen emulated devices
+ Format: [unplug0,][unplug1]
+ ide-disks -- unplug primary master IDE devices
+ aux-ide-disks -- unplug non-primary-master IDE devices
+ nics -- unplug network devices
+ all -- unplug all emulated devices (NICs and IDE disks)
+ unnecessary -- unplugging emulated devices is
+ unnecessary even if the host did not respond to
+ the unplug protocol
+ never -- do not unplug even if version check succeeds
+
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
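Illustration (not part of the patch): a PV guest that should keep its
primary master disk but drop the emulated NIC and the spare emulated
IDE disks would boot with, e.g.

	xen_emul_unplug=aux-ide-disks,nics

with values comma-separated as in the Format line above.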
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 29a6ff8..81f9b94 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -267,10 +267,14 @@
iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
implementation:
- swiotlb=<pages>[,force]
+ swiotlb=[npages=<pages>]
+ swiotlb=[force]
+ swiotlb=[overflow=<size>]
+
<pages> Prereserve that many 128K pages for the software IO
bounce buffering.
force Force all IO through the software TLB.
+ <size> Size in bytes of the overflow buffer.
Settings for the IBM Calgary hardware IOMMU currently found in IBM
pSeries and xSeries machines:
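Worked example (not part of the patch): swiotlb=npages=512 preserves
512 x 128K = 64 MB for the bounce buffer, the traditional default size;
a separate swiotlb=force routes all IO through it regardless of need.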
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 8d3c79c..7d09a09 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -73,7 +73,7 @@
if (!dev->dma_mask)
return 0;
- return addr + size <= *dev->dma_mask;
+ return addr + size - 1 <= *dev->dma_mask;
}
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
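Note (illustrative, not part of the patch): the "- 1" fixes an off-by-one
in the old check, which rejected a buffer whose last byte sits exactly at
the mask limit -- and whose "addr + size" sum can wrap to zero. A
standalone sketch of the two comparisons, with made-up values modelling a
32-bit DMA mask:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t mask = 0xFFFFFFFFull;	/* 32-bit DMA mask */
		uint64_t addr = 0xFFFFF000ull;	/* buffer start */
		uint64_t size = 0x1000;		/* last byte = 0xFFFFFFFF */

		printf("old: %d\n", addr + size <= mask);	/* 0: rejected */
		printf("new: %d\n", addr + size - 1 <= mask);	/* 1: accepted */
		return 0;
	}

The same one-byte change recurs below for powerpc and x86.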
diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
index dcbaea7..f0acde6 100644
--- a/arch/ia64/include/asm/swiotlb.h
+++ b/arch/ia64/include/asm/swiotlb.h
@@ -4,8 +4,6 @@
#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
-extern int swiotlb_force;
-
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
extern void pci_swiotlb_init(void);
diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h
index b8370c8..baa74c8 100644
--- a/arch/ia64/include/asm/xen/events.h
+++ b/arch/ia64/include/asm/xen/events.h
@@ -36,10 +36,6 @@
return !(ia64_psr(regs)->i);
}
-static inline void handle_irq(int irq, struct pt_regs *regs)
-{
- __do_IRQ(irq);
-}
#define irq_ctx_init(cpu) do { } while (0)
#endif /* _ASM_IA64_XEN_EVENTS_H */
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 285aae8..53292ab 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -41,7 +41,7 @@
void __init swiotlb_dma_init(void)
{
dma_ops = &swiotlb_dma_ops;
- swiotlb_init();
+ swiotlb_init(1);
}
void __init pci_swiotlb_init(void)
@@ -51,7 +51,7 @@
swiotlb = 1;
printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
machvec_init("dig");
- swiotlb_init();
+ swiotlb_init(1);
dma_ops = &swiotlb_dma_ops;
#else
panic("Unable to find Intel IOMMU");
diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c
index fd66b04..419c862 100644
--- a/arch/ia64/xen/suspend.c
+++ b/arch/ia64/xen/suspend.c
@@ -37,19 +37,14 @@
/* nothing */
}
-void xen_pre_device_suspend(void)
+void
+xen_arch_pre_suspend()
{
/* nothing */
}
void
-xen_pre_suspend()
-{
- /* nothing */
-}
-
-void
-xen_post_suspend(int suspend_cancelled)
+xen_arch_post_suspend(int suspend_cancelled)
{
if (suspend_cancelled)
return;
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index e281dae..80a973b 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -197,7 +197,7 @@
if (!dev->dma_mask)
return 0;
- return addr + size <= *dev->dma_mask;
+ return addr + size - 1 <= *dev->dma_mask;
}
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 53bcf3d..b152de3 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -345,7 +345,7 @@
#ifdef CONFIG_SWIOTLB
if (ppc_swiotlb_enable)
- swiotlb_init();
+ swiotlb_init(1);
#endif
paging_init();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 00d3b65..dc5eede 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -559,7 +559,7 @@
#ifdef CONFIG_SWIOTLB
if (ppc_swiotlb_enable)
- swiotlb_init();
+ swiotlb_init(1);
#endif
paging_init();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73ae02a..0266f87 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1896,6 +1896,10 @@
def_bool y
depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
+config PCI_XEN
+ bool
+ select SWIOTLB
+
config PCI_DOMAINS
def_bool y
depends on PCI
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 18aa3f8..4413ba4 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -23,20 +23,16 @@
#include <linux/irqreturn.h>
#ifdef CONFIG_AMD_IOMMU
-extern int amd_iommu_init(void);
extern int amd_iommu_init_dma_ops(void);
extern int amd_iommu_init_passthrough(void);
extern void amd_iommu_detect(void);
extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
extern void amd_iommu_flush_all_domains(void);
extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_shutdown(void);
extern void amd_iommu_apply_erratum_63(u16 devid);
extern void amd_iommu_init_api(void);
#else
-static inline int amd_iommu_init(void) { return -ENODEV; }
static inline void amd_iommu_detect(void) { }
-static inline void amd_iommu_shutdown(void) { }
#endif
#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb..0918654 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@
extern int use_calgary;
#ifdef CONFIG_CALGARY_IOMMU
-extern int calgary_iommu_init(void);
extern void detect_calgary(void);
#else
-static inline int calgary_iommu_init(void) { return 1; }
static inline void detect_calgary(void) { return; }
#endif
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 6a25d5d..ac91eed 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -20,7 +20,8 @@
# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
#endif
-extern dma_addr_t bad_dma_address;
+#define DMA_ERROR_CODE 0
+
extern int iommu_merge;
extern struct device x86_dma_fallback_dev;
extern int panic_on_overflow;
@@ -48,7 +49,7 @@
if (ops->mapping_error)
return ops->mapping_error(dev, dma_addr);
- return (dma_addr == bad_dma_address);
+ return (dma_addr == DMA_ERROR_CODE);
}
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -66,7 +67,7 @@
if (!dev->dma_mask)
return 0;
- return addr + size <= *dev->dma_mask;
+ return addr + size - 1 <= *dev->dma_mask;
}
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e61..fa3fd43 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -109,6 +109,8 @@
extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
extern void free_early(u64 start, u64 end);
extern void early_res_to_bootmem(u64 start, u64 end);
+extern u64 early_res_next_free(u64 start);
+extern u64 early_res_next_reserved(u64 addr, u64 max);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
extern unsigned long e820_end_of_ram_pfn(void);
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa..4ac5b0f 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@
extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
+extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
extern void gart_iommu_hole_init(void);
@@ -48,12 +47,6 @@
static inline void early_gart_iommu_check(void)
{
}
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
static inline void gart_parse_options(char *options)
{
}
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 3251e23..fa152cb 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -68,6 +68,7 @@
extern int hpet_force_user;
extern u8 hpet_msi_disable;
extern int is_hpet_enabled(void);
+extern int disable_hpet(char *);
extern int hpet_enable(void);
extern void hpet_disable(void);
extern unsigned long hpet_readl(unsigned long a);
@@ -108,6 +109,7 @@
#else /* CONFIG_HPET_TIMER */
static inline int hpet_enable(void) { return 0; }
+static inline int disable_hpet(char *s) { return 0; }
static inline int is_hpet_enabled(void) { return 0; }
#define hpet_readl(a) 0
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 439a9ac..bf88684 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -36,16 +36,28 @@
free_pgd_range(tlb, addr, end, floor, ceiling);
}
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+ return *ptep;
+}
+
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- set_pte_at(mm, addr, ptep, pte);
+#if PAGETABLE_LEVELS >= 3
+ set_pmd((pmd_t *)ptep, native_make_pmd(native_pte_val(pte)));
+#else
+ set_pgd((pgd_t *)ptep, native_make_pgd(native_pte_val(pte)));
+#endif
}
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- return ptep_get_and_clear(mm, addr, ptep);
+ pte_t pte = huge_ptep_get(ptep);
+
+ set_huge_pte_at(mm, addr, ptep, __pte(0));
+ return pte;
}
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -66,19 +78,25 @@
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- ptep_set_wrprotect(mm, addr, ptep);
+ pte_t pte = huge_ptep_get(ptep);
+
+ pte = pte_wrprotect(pte);
+ set_huge_pte_at(mm, addr, ptep, pte);
}
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty)
{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
+ pte_t oldpte = huge_ptep_get(ptep);
+ int changed = !pte_same(oldpte, pte);
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
+ if (changed && dirty) {
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+ flush_tlb_page(vma, addr);
+ }
+
+ return changed;
}
static inline int arch_prepare_hugepage(struct page *page)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6a63b86..9ad387e 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -7,6 +7,10 @@
#include <asm-generic/int-ll64.h>
#include <asm/page.h>
+#include <xen/xen.h>
+
+extern int isapnp_disable;
+
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -199,6 +203,18 @@
extern void __iomem *early_memremap(resource_size_t phys_addr,
unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#ifdef CONFIG_XEN
+struct bio_vec;
+
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2);
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
+ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#endif /* CONFIG_XEN */
#define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 5f61f6e..b852da9 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -172,6 +172,7 @@
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void probe_nr_irqs_gsi(void);
+extern int get_nr_irqs_gsi(void);
extern int setup_ioapic_entry(int apic, int irq,
struct IO_APIC_route_entry *entry,
@@ -201,4 +202,6 @@
#endif
+void xen_io_apic_init(void);
+
#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21b..345c99c 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
#ifndef _ASM_X86_IOMMU_H
#define _ASM_X86_IOMMU_H
-extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e90a04..ba4dc7b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -120,6 +120,12 @@
*/
#define MCE_SELF_VECTOR 0xeb
+#ifdef CONFIG_XEN
+/* Xen vector callback to receive events in a HVM domain */
+#define XEN_HVM_EVTCHN_CALLBACK 0xe9
+#endif
+
+
/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
@@ -157,6 +163,14 @@
#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
+#ifndef __ASSEMBLY__
+# if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ)
+extern int nr_dynamic_irqs;
+# else
+# define NR_DYNAMIC_IRQS 256
+# endif
+#endif
+
#ifdef CONFIG_X86_IO_APIC
# ifdef CONFIG_SPARSE_IRQ
# define NR_IRQS \
@@ -165,13 +179,13 @@
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# else
# if NR_CPUS < MAX_IO_APICS
-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
+#  define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT + NR_DYNAMIC_IRQS)
# else
-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#  define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT + NR_DYNAMIC_IRQS)
# endif
# endif
#else /* !CONFIG_X86_IO_APIC: */
-# define NR_IRQS NR_IRQS_LEGACY
+# define NR_IRQS (NR_IRQS_LEGACY + NR_DYNAMIC_IRQS)
#endif
#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b50..e15fca1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -55,4 +55,13 @@
}
#endif
+#ifdef CONFIG_MICROCODE_XEN
+extern struct microcode_ops * __init init_xen_microcode(void);
+#else
+static inline struct microcode_ops * __init init_xen_microcode(void)
+{
+ return NULL;
+}
+#endif
+
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee..67eaa91 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -13,6 +13,9 @@
int size;
struct mutex lock;
void *vdso;
+#ifdef CONFIG_XEN
+ int has_foreign_mappings;
+#endif
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index efb3899..e571db4 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -330,11 +330,18 @@
{
PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
}
+
static inline void set_iopl_mask(unsigned mask)
{
PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}
+static inline void set_io_bitmap(struct thread_struct *thread,
+ unsigned long bytes_updated)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated);
+}
+
/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
@@ -770,15 +777,28 @@
#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
/* save and restore all caller-save registers, except return value */
-#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
-#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
+#define __PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
+#define __PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
+
+#ifdef CONFIG_FRAME_POINTER
+#define PV_SAVE_ALL_CALLER_REGS \
+ "push %ebp;" \
+ "mov %esp, %ebp;" \
+ __PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS \
+ __PV_RESTORE_ALL_CALLER_REGS \
+ "leave;"
+#else
+#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS
+#endif
#define PV_FLAGS_ARG "0"
#define PV_EXTRA_CLOBBERS
#define PV_VEXTRA_CLOBBERS
#else
/* save and restore all caller-save registers, except return value */
-#define PV_SAVE_ALL_CALLER_REGS \
+#define __PV_SAVE_ALL_CALLER_REGS \
"push %rcx;" \
"push %rdx;" \
"push %rsi;" \
@@ -787,7 +807,7 @@
"push %r9;" \
"push %r10;" \
"push %r11;"
-#define PV_RESTORE_ALL_CALLER_REGS \
+#define __PV_RESTORE_ALL_CALLER_REGS \
"pop %r11;" \
"pop %r10;" \
"pop %r9;" \
@@ -797,6 +817,19 @@
"pop %rdx;" \
"pop %rcx;"
+#ifdef CONFIG_FRAME_POINTER
+#define PV_SAVE_ALL_CALLER_REGS \
+ "push %rbp;" \
+ "mov %rsp, %rbp;" \
+ __PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS \
+ __PV_RESTORE_ALL_CALLER_REGS \
+ "leaveq;"
+#else
+#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS
+#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS
+#endif
+
/* We save some registers, but all of them, that's too much. We clobber all
* caller saved registers but the argument parameter */
#define PV_SAVE_REGS "pushq %%rdi;"
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9357473..3202dcc 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -135,6 +135,8 @@
void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
void (*set_iopl_mask)(unsigned mask);
+ void (*set_io_bitmap)(struct thread_struct *thread,
+ unsigned long bytes_updated);
void (*wbinvd)(void);
void (*io_delay)(void);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c20..faa0af17 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -21,6 +21,7 @@
extern int pci_routeirq;
extern int noioapicquirk;
extern int noioapicreroute;
+extern int pci_scan_all_fns;
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@ -49,6 +50,11 @@
#define pcibios_assign_all_busses() 0
#endif
+static inline int pcibios_scan_all_fns(struct pci_bus *bus, int devfn)
+{
+ return pci_scan_all_fns;
+}
+
extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_IO 0x1000
#define PCIBIOS_MIN_MEM (pci_mem_start)
@@ -87,6 +93,7 @@
/* MSI arch hook */
#define arch_setup_msi_irqs arch_setup_msi_irqs
+#define arch_teardown_msi_irqs arch_teardown_msi_irqs
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
@@ -128,6 +135,7 @@
#include <asm-generic/pci-dma-compat.h>
/* generic pci stuff */
+#define HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
#include <asm-generic/pci.h>
#define PCIBIOS_MAX_MEM_32 0xffffffff
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988..30cbf49 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -45,6 +45,7 @@
extern unsigned int pcibios_max_latency;
void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
/* pci-pc.c */
@@ -106,6 +107,7 @@
extern void pci_direct_init(int type);
extern void pci_pcbios_init(void);
extern int pci_olpc_init(void);
+extern int pci_xen_init(void);
extern void __init dmi_check_pciprobe(void);
extern void __init dmi_check_skip_isa_align(void);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd36..430e3cc 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,7 +15,6 @@
: (prot))
#ifndef __ASSEMBLY__
-
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -26,6 +25,8 @@
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -76,6 +77,11 @@
#endif /* CONFIG_PARAVIRT */
+static inline pteval_t pte_flags(pte_t pte)
+{
+ return pte_val(pte) & PTE_FLAGS_MASK;
+}
+
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
@@ -397,6 +403,9 @@
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
+#define arch_vm_get_page_prot arch_vm_get_page_prot
+extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags);
+
#if PAGETABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
@@ -616,6 +625,9 @@
memcpy(dst, src, count * sizeof(pgd_t));
}
+int create_lookup_pte_addr(struct mm_struct *mm,
+ unsigned long address,
+ uint64_t *ptep);
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a301..4e46931 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -160,7 +160,7 @@
#define pgtable_cache_init() do { } while (0)
#define check_pgt_cache() do { } while (0)
-#define PAGE_AGP PAGE_KERNEL_NOCACHE
+#define PAGE_AGP PAGE_KERNEL_IO_NOCACHE
#define HAVE_PAGE_AGP 1
/* fs/proc/kcore.c */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a76..a81b0ed 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -265,11 +265,6 @@
return pte.pte;
}
-static inline pteval_t pte_flags(pte_t pte)
-{
- return native_pte_val(pte) & PTE_FLAGS_MASK;
-}
-
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fa04dea..3cf805d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -551,6 +551,9 @@
#endif
}
+extern void native_set_io_bitmap(struct thread_struct *thread,
+ unsigned long updated_bytes);
+
static inline void
native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
{
@@ -592,6 +595,7 @@
}
#define set_iopl_mask native_set_iopl_mask
+#define set_io_bitmap native_set_io_bitmap
#endif /* CONFIG_PARAVIRT */
/*
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 18e496c..154a5f1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -95,6 +95,11 @@
: : "i" (sz)); \
}
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries) \
+ type *name; \
+ RESERVE_BRK(name, sizeof(type) * entries)
+
#ifdef __i386__
void __init i386_start_kernel(void);
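Illustration (not part of the patch) of the new helper; the array name and
length are made up:

	/* declares "pte_t *early_ptes" and reserves brk space for 16 entries */
	RESERVE_BRK_ARRAY(pte_t, early_ptes, 16);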
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20..8085277 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,15 +3,16 @@
#include <linux/swiotlb.h>
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
-extern void pci_swiotlb_init(void);
+extern int __init pci_swiotlb_detect(void);
+extern void __init pci_swiotlb_init(void);
#else
#define swiotlb 0
+static inline int pci_swiotlb_detect(void)
+{
+ return 0;
+}
static inline void pci_swiotlb_init(void)
{
}
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 1bb6e39..ef0fa4d 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -33,11 +33,11 @@
asmlinkage int sys_set_thread_area(struct user_desc __user *);
asmlinkage int sys_get_thread_area(struct user_desc __user *);
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int);
+
/* X86_32 only */
#ifdef CONFIG_X86_32
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
-
/* kernel/process_32.c */
int sys_clone(struct pt_regs *);
int sys_execve(struct pt_regs *);
@@ -68,8 +68,6 @@
#else /* CONFIG_X86_32 */
/* X86_64 only */
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
/* kernel/process_64.c */
asmlinkage long sys_clone(unsigned long, unsigned long,
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba0..e4fc8ea 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -89,6 +89,10 @@
#ifndef CONFIG_SMP
+static inline void __init init_smp_flush(void)
+{
+}
+
#define flush_tlb() __flush_tlb()
#define flush_tlb_all() __flush_tlb_all()
#define local_flush_tlb() __flush_tlb()
@@ -129,6 +133,8 @@
#define local_flush_tlb() __flush_tlb()
+extern void init_smp_flush(void);
+
extern void flush_tlb_all(void);
extern void flush_tlb_current_task(void);
extern void flush_tlb_mm(struct mm_struct *);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd..d8e7145 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -91,6 +91,14 @@
};
/**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init: platform specific iommu setup
+ */
+struct x86_init_iommu {
+ int (*iommu_init)(void);
+};
+
+/**
* struct x86_init_ops - functions for platform specific setup
*
*/
@@ -101,6 +109,7 @@
struct x86_init_oem oem;
struct x86_init_paging paging;
struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
};
/**
@@ -121,6 +130,7 @@
unsigned long (*calibrate_tsc)(void);
unsigned long (*get_wallclock)(void);
int (*set_wallclock)(unsigned long nowtime);
+ void (*iommu_shutdown)(void);
};
extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4..8bd4e63 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -45,6 +45,8 @@
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>
/*
* The hypercall asms have to meet several constraints:
@@ -200,6 +202,23 @@
(type)__res; \
})
+static inline long
+privcmd_call(unsigned call,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4,
+ unsigned long a5)
+{
+ __HYPERCALL_DECLS;
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+
+ asm volatile("call *%[call]"
+ : __HYPERCALL_5PARAM
+ : [call] "a" (&hypercall_page[call])
+ : __HYPERCALL_CLOBBER5);
+
+ return (long)__res;
+}
+
static inline int
HYPERVISOR_set_trap_table(struct trap_info *table)
{
@@ -270,7 +289,7 @@
static inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
- return _hypercall2(int, sched_op_new, cmd, arg);
+ return _hypercall2(int, sched_op, cmd, arg);
}
static inline long
@@ -282,6 +301,20 @@
}
static inline int
+HYPERVISOR_mca(struct xen_mc *mc_op)
+{
+ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+ return _hypercall1(int, mca, mc_op);
+}
+
+static inline int
+HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
+{
+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, dom0_op, platform_op);
+}
+
+static inline int
HYPERVISOR_set_debugreg(int reg, unsigned long value)
{
return _hypercall2(int, set_debugreg, reg, value);
@@ -405,10 +438,17 @@
#endif
static inline int
-HYPERVISOR_suspend(unsigned long srec)
+HYPERVISOR_suspend(unsigned long start_info_mfn)
{
- return _hypercall3(int, sched_op, SCHEDOP_shutdown,
- SHUTDOWN_suspend, srec);
+ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+
+ /*
+ * For a PV guest the tools require that the start_info mfn be
+ * present in rdx/edx when the hypercall is made. Per the
+ * hypercall calling convention this is the third hypercall
+ * argument, which is start_info_mfn here.
+ */
+ return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
}
static inline int
@@ -417,6 +457,12 @@
return _hypercall2(int, nmi_op, op, arg);
}
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
@@ -424,6 +470,14 @@
mcl->args[0] = set;
}
+#if defined(CONFIG_X86_64)
+#define MULTI_UVMFLAGS_INDEX 2
+#define MULTI_UVMDOMID_INDEX 3
+#else
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+#endif
+
static inline void
MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
pte_t new_val, unsigned long flags)
@@ -432,12 +486,11 @@
mcl->args[0] = va;
if (sizeof(new_val) == sizeof(long)) {
mcl->args[1] = new_val.pte;
- mcl->args[2] = flags;
} else {
mcl->args[1] = new_val.pte;
mcl->args[2] = new_val.pte >> 32;
- mcl->args[3] = flags;
}
+ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
}
static inline void
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90..396ff4c 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
-enum xen_domain_type {
- XEN_NATIVE, /* running on bare hardware */
- XEN_PV_DOMAIN, /* running in a PV domain */
- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
-};
-
-#ifdef CONFIG_XEN
-extern enum xen_domain_type xen_domain_type;
-#else
-#define xen_domain_type XEN_NATIVE
-#endif
-
-#define xen_domain() (xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain() (xen_domain() && \
- xen_domain_type == XEN_PV_DOMAIN)
-#define xen_hvm_domain() (xen_domain() && \
- xen_domain_type == XEN_HVM_DOMAIN)
-
-#ifdef CONFIG_XEN_DOM0
-#include <xen/interface/xen.h>
-
-#define xen_initial_domain() (xen_pv_domain() && \
- xen_start_info->flags & SIF_INITDOMAIN)
-#else /* !CONFIG_XEN_DOM0 */
-#define xen_initial_domain() (0)
-#endif /* CONFIG_XEN_DOM0 */
-
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1..9539998 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@
#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
#endif
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
/* Maximum number of virtual CPUs in multi-processor guests. */
#define MAX_VIRT_CPUS 32
@@ -97,6 +97,8 @@
#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
struct trap_info {
uint8_t vector; /* exception vector */
uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e00..8413688 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
/* And the trap vector is... */
#define TRAP_INSTR "int $0x82"
+#define __MACH2PHYS_VIRT_START 0xF5800000
+#define __MACH2PHYS_VIRT_END 0xF6800000
+
+#define __MACH2PHYS_SHIFT 2
+
/*
* Virtual addresses beyond this are not modifiable by guest OSes. The
* machine->physical mapping table starts at this address, read-only.
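Worked numbers (not part of the patch) for the 32-bit constants above: the
window 0xF5800000-0xF6800000 spans 0x01000000 bytes (16 MB); with
__MACH2PHYS_SHIFT = 2 (4-byte entries) that gives MACH2PHYS_NR_ENTRIES =
0x01000000 >> 2 = 0x400000 entries, one per 4 KB machine frame, i.e.
enough to describe 16 GB of machine memory.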
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d266..839a481 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT 3
/*
* int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h
new file mode 100644
index 0000000..75df312
--- /dev/null
+++ b/arch/x86/include/asm/xen/iommu.h
@@ -0,0 +1,13 @@
+#ifndef ASM_X86__XEN_IOMMU_H
+#define ASM_X86__XEN_IOMMU_H
+
+#ifdef CONFIG_PCI_XEN
+extern void xen_iommu_init(void);
+#else
+static inline void xen_iommu_init(void)
+{
+}
+#endif
+
+#endif
+
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a4..05c5cf5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -28,23 +29,32 @@
/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
#define INVALID_P2M_ENTRY (~0UL)
-#define FOREIGN_FRAME_BIT (1UL<<31)
+#define FOREIGN_FRAME_BIT (1UL << (sizeof(unsigned long) * 8 - 1))
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
/* Maximum amount of memory we can handle in a domain in pages */
#define MAX_DOMAIN_PAGES \
((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
+ unsigned long mfn;
+
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+ mfn = get_phys_to_machine(pfn);
+
+ if (mfn != INVALID_P2M_ENTRY)
+ mfn &= ~FOREIGN_FRAME_BIT;
+
+ return mfn;
}
static inline int phys_to_machine_mapping_valid(unsigned long pfn)
@@ -62,10 +72,8 @@
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
-#if 0
if (unlikely((mfn >> machine_to_phys_order) != 0))
- return max_mapnr;
-#endif
+ return ~0;
pfn = 0;
/*
@@ -112,13 +120,9 @@
*/
static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
{
- extern unsigned long max_mapnr;
unsigned long pfn = mfn_to_pfn(mfn);
- if ((pfn < max_mapnr)
- && !xen_feature(XENFEAT_auto_translated_physmap)
- && (get_phys_to_machine(pfn) != mfn))
- return max_mapnr; /* force !pfn_valid() */
- /* XXX fixme; not true with sparsemem */
+ if (get_phys_to_machine(pfn) != mfn)
+ return -1; /* force !pfn_valid() */
return pfn;
}
@@ -163,6 +167,7 @@
#define pgd_val_ma(x) ((x).pgd)
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
xmaddr_t arbitrary_virt_to_machine(void *address);
unsigned long arbitrary_virt_to_mfn(void *vaddr);
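Note (illustrative, not part of the patch): FOREIGN_FRAME_BIT is now the
top bit of unsigned long on either word size, instead of being hardwired
to bit 31. A standalone check:

	#include <stdio.h>

	#define FOREIGN_FRAME_BIT (1UL << (sizeof(unsigned long) * 8 - 1))

	int main(void)
	{
		/* prints 0x80000000 on 32-bit, 0x8000000000000000 on 64-bit */
		printf("%#lx\n", FOREIGN_FRAME_BIT);
		return 0;
	}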
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 0000000..6683196
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,104 @@
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+int xen_register_pirq(u32 gsi, int triggering);
+int xen_register_gsi(u32 gsi, int triggering, int polarity);
+int xen_create_msi_irq(struct pci_dev *dev,
+ struct msi_desc *msidesc,
+ int type);
+void xen_pci_teardown_msi_dev(struct pci_dev *dev);
+void xen_pci_teardown_msi_irq(int irq);
+int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+
+/* drivers/pci/xen-pcifront.c points this structure at
+ * its own implementations of these operations.
+ */
+struct xen_pci_frontend_ops {
+ int (*enable_msi)(struct pci_dev *dev, int **vectors);
+ void (*disable_msi)(struct pci_dev *dev);
+ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+ void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+ int **vectors)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+ return xen_pci_frontend->enable_msi(dev, vectors);
+ return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+ xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+ int **vectors, int nvec)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+ return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+ xen_pci_frontend->disable_msix(dev);
+}
+#else
+static inline int xen_create_msi_irq(struct pci_dev *dev,
+ struct msi_desc *msidesc,
+ int type)
+{
+ return -1;
+}
+static inline void xen_pci_teardown_msi_dev(struct pci_dev *dev) { }
+static inline void xen_pci_teardown_msi_irq(int irq) { }
+static inline int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_PCI_XEN */
+
+#endif /* CONFIG_PCI_MSI */
+
+#ifdef CONFIG_XEN_DOM0_PCI
+int xen_register_gsi(u32 gsi, int triggering, int polarity);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
+
+#else
+static inline int xen_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ return -1;
+}
+
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+ uint16_t domain)
+{
+ return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI)
+int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+#else
+static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return -1;
+}
+#endif
+
+#endif /* _ASM_X86_XEN_PCI_H */
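Sketch (not part of the patch) of how a frontend driver would hook these
operations; my_enable_msi, my_ops and my_pcifront_init are made-up names,
and the stub simply fails:

	#include <linux/pci.h>
	#include <asm/xen/pci.h>

	static int my_enable_msi(struct pci_dev *dev, int **vectors)
	{
		return -ENODEV;	/* a real frontend would ask the backend */
	}

	static struct xen_pci_frontend_ops my_ops = {
		.enable_msi = my_enable_msi,
		/* unset hooks are fine: the inline wrappers NULL-check them */
	};

	static int __init my_pcifront_init(void)
	{
		xen_pci_frontend = &my_ops;
		return 0;
	}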
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 0000000..e4fe299
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_SWIOTLB_XEN_H
+#define _ASM_X86_SWIOTLB_XEN_H
+
+#ifdef CONFIG_PCI_XEN
+extern int xen_swiotlb;
+extern int __init pci_xen_swiotlb_detect(void);
+extern void __init pci_xen_swiotlb_init(void);
+#else
+#define xen_swiotlb 0
+static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
+static inline void __init pci_xen_swiotlb_init(void) { }
+#endif
+
+#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d1911ab..cfe00bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -113,6 +113,7 @@
microcode-y := microcode_core.o
microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
+microcode-$(CONFIG_MICROCODE_XEN) += microcode_xen.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8ba08c7..90be997 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -42,6 +42,10 @@
#include <asm/mpspec.h>
#include <asm/smp.h>
+#include <asm/xen/pci.h>
+
+#include <asm/xen/hypervisor.h>
+
static int __initdata acpi_force = 0;
u32 acpi_rsdt_forced;
int acpi_disabled;
@@ -150,6 +154,10 @@
{
unsigned int ver = 0;
+ /* We don't want to register lapics when in Xen dom0 */
+ if (xen_initial_domain())
+ return;
+
if (!enabled) {
++disabled_cpus;
return;
@@ -467,9 +475,13 @@
*/
int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
- unsigned int irq;
+ int irq;
unsigned int plat_gsi = gsi;
+ irq = xen_register_gsi(gsi, trigger, polarity);
+ if (irq >= 0)
+ return irq;
+
#ifdef CONFIG_PCI
/*
* Make sure all (legacy) PCI IRQs are set as level-triggered.
@@ -746,6 +758,10 @@
static void __init acpi_register_lapic_address(unsigned long address)
{
+ /* Xen dom0 doesn't have usable lapics */
+ if (xen_initial_domain())
+ return;
+
mp_lapic_addr = address;
set_fixmap_nocache(FIX_APIC_BASE, address);
@@ -866,6 +882,9 @@
max_gsi = gsi;
}
+ if (xen_initial_domain())
+		max_gsi += 255; /* Leave room for a full IO-APIC's worth of entries. */
+
return max_gsi + 1;
}
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index d85d1b2..8aabedd 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -12,6 +12,8 @@
#include <acpi/processor.h>
#include <asm/acpi.h>
+#include <xen/xen.h>
+
static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
{
struct acpi_object_list *obj_list;
@@ -59,7 +61,7 @@
/*
* If mwait/monitor is unsupported, C2/C3_FFH will be disabled
*/
- if (!cpu_has(c, X86_FEATURE_MWAIT))
+ if (!cpu_has(c, X86_FEATURE_MWAIT) && !xen_initial_domain())
buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
obj->type = ACPI_TYPE_BUFFER;
@@ -88,6 +90,19 @@
EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
+/* Initialize _PDC data based on the CPU vendor */
+void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ pr->pdc = NULL;
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ init_intel_pdc(pr, c);
+
+ return;
+}
+EXPORT_SYMBOL(xen_arch_acpi_processor_init_pdc);
+
void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
{
if (pr->pdc) {
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638..9eff23c 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -12,6 +12,8 @@
#include <asm/segment.h>
#include <asm/desc.h>
+#include <xen/acpi.h>
+
#include "realmode/wakeup.h"
#include "sleep.h"
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 3a44b75..e305008 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -928,7 +928,7 @@
}
if (unlikely(address == -1))
- address = bad_dma_address;
+ address = DMA_ERROR_CODE;
WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -1545,7 +1545,7 @@
pte = dma_ops_get_pte(dom, address);
if (!pte)
- return bad_dma_address;
+ return DMA_ERROR_CODE;
__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1626,7 +1626,7 @@
retry:
address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
dma_mask);
- if (unlikely(address == bad_dma_address)) {
+ if (unlikely(address == DMA_ERROR_CODE)) {
/*
* setting next_address here will let the address
* allocator only scan the new allocated range in the
@@ -1647,7 +1647,7 @@
start = address;
for (i = 0; i < pages; ++i) {
ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
- if (ret == bad_dma_address)
+ if (ret == DMA_ERROR_CODE)
goto out_unmap;
paddr += PAGE_SIZE;
@@ -1675,7 +1675,7 @@
dma_ops_free_addresses(dma_dom, address, pages);
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
/*
@@ -1692,7 +1692,7 @@
dma_addr_t i, start;
unsigned int pages;
- if ((dma_addr == bad_dma_address) ||
+ if ((dma_addr == DMA_ERROR_CODE) ||
(dma_addr + size > dma_dom->aperture_size))
return;
@@ -1735,7 +1735,7 @@
INC_STATS_COUNTER(cnt_map_single);
if (!check_device(dev))
- return bad_dma_address;
+ return DMA_ERROR_CODE;
dma_mask = *dev->dma_mask;
@@ -1746,12 +1746,12 @@
return (dma_addr_t)paddr;
if (!dma_ops_domain(domain))
- return bad_dma_address;
+ return DMA_ERROR_CODE;
spin_lock_irqsave(&domain->lock, flags);
addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
dma_mask);
- if (addr == bad_dma_address)
+ if (addr == DMA_ERROR_CODE)
goto out;
iommu_completion_wait(iommu);
@@ -1960,7 +1960,7 @@
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address) {
+ if (*dma_addr == DMA_ERROR_CODE) {
spin_unlock_irqrestore(&domain->lock, flags);
goto out_free;
}
@@ -2122,8 +2122,7 @@
prealloc_protection_domains();
iommu_detected = 1;
- force_iommu = 1;
- bad_dma_address = 0;
+ swiotlb = 0;
#ifdef CONFIG_GART_IOMMU
gart_iommu_aperture_disabled = 1;
gart_iommu_aperture = 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index fc5d470..838e1d7 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -29,6 +29,7 @@
#include <asm/amd_iommu.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/x86_init.h>
/*
* definitions for the ACPI scanning code
@@ -1206,19 +1207,10 @@
* functions. Finally it prints some information about AMD IOMMUs and
* the driver state and enables the hardware.
*/
-int __init amd_iommu_init(void)
+static int __init amd_iommu_init(void)
{
int i, ret = 0;
-
- if (no_iommu) {
- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
- return 0;
- }
-
- if (!amd_iommu_detected)
- return -ENODEV;
-
/*
* First parse ACPI tables to find the largest Bus/Dev/Func
* we need to handle. Upon this information the shared data
@@ -1333,6 +1325,7 @@
else
printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
+ x86_platform.iommu_shutdown = disable_iommus;
out:
return ret;
@@ -1361,11 +1354,6 @@
goto out;
}
-void amd_iommu_shutdown(void)
-{
- disable_iommus();
-}
-
/****************************************************************************
*
* Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1380,16 +1368,13 @@
void __init amd_iommu_detect(void)
{
- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
+ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
return;
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
amd_iommu_detected = 1;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
+ x86_init.iommu.iommu_init = amd_iommu_init;
}
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 082089e..8d34362 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
@@ -401,6 +402,7 @@
iommu_detected = 1;
gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
ctl = read_pci_config(bus, slot, 3,
AMD64_GARTAPERTURECTL);
@@ -469,7 +471,7 @@
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
- } else if (swiotlb && !valid_agp) {
+ } else if (!valid_agp) {
/* Do nothing */
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8928d97..4848d5d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -63,7 +63,9 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
+#include <asm/xen/hypervisor.h>
#include <asm/apic.h>
+#include <asm/xen/pci.h>
#define __apicdebuginit(type) static type __init
#define for_each_irq_pin(entry, head) \
@@ -395,14 +400,18 @@
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
+ struct io_apic __iomem *io_apic;
+
+ io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
+ struct io_apic __iomem *io_apic;
+
+ io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
@@ -415,7 +424,9 @@
*/
static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
+ struct io_apic __iomem *io_apic;
+
+ io_apic = io_apic_base(apic);
if (sis_apic_bug)
writel(reg, &io_apic->index);
@@ -3494,6 +3505,9 @@
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
+ if (xen_pv_domain())
+ return xen_pci_setup_msi_irqs(dev, nvec, type);
+
node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
sub_handle = 0;
@@ -3543,7 +3557,29 @@
void arch_teardown_msi_irq(unsigned int irq)
{
- destroy_irq(irq);
+ if (xen_domain())
+ xen_pci_teardown_msi_irq(irq);
+ else
+ destroy_irq(irq);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+ struct msi_desc *entry;
+
+	/* If we are a non-privileged PV domain, we have to
+	 * call xen_pci_teardown_msi_dev() first. */
+ if (xen_domain())
+ xen_pci_teardown_msi_dev(dev);
+
+ list_for_each_entry(entry, &dev->msi_list, list) {
+ int i, nvec;
+ if (entry->irq == 0)
+ continue;
+ nvec = 1 << entry->msi_attrib.multiple;
+ for (i = 0; i < nvec; i++)
+ arch_teardown_msi_irq(entry->irq + i);
+ }
}
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
@@ -3860,7 +3896,14 @@
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+int get_nr_irqs_gsi(void)
+{
+ return nr_irqs_gsi;
+}
+
#ifdef CONFIG_SPARSE_IRQ
+int nr_dynamic_irqs;
+
int __init arch_probe_nr_irqs(void)
{
int nr;
@@ -3878,6 +3921,8 @@
if (nr < nr_irqs)
nr_irqs = nr;
+ nr_irqs += nr_dynamic_irqs;
+
return 0;
}
#endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6..d1e6e60 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -558,6 +558,9 @@
{
int i;
+ if (!cpu_has_apic)
+ return;
+
cpumask_copy(&backtrace_mask, cpu_online_mask);
printk(KERN_INFO "sending NMI to all CPUs:\n");
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b5..404e458 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,4 @@
obj-y := main.o if.o generic.o state.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+obj-$(CONFIG_XEN_DOM0) += xen.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af141..378f8dc 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,6 +108,11 @@
return 0;
}
+static int amd_num_var_ranges(void)
+{
+ return 2;
+}
+
static struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
@@ -115,6 +120,7 @@
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
+ .num_var_ranges = amd_num_var_ranges,
};
int __init amd_init_mtrr(void)
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14..7c686a0 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,6 +110,11 @@
return 0;
}
+static int centaur_num_var_ranges(void)
+{
+ return 8;
+}
+
static struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
@@ -117,6 +122,7 @@
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
+ .num_var_ranges = centaur_num_var_ranges,
};
int __init centaur_init_mtrr(void)
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982..fd6edcc 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,6 +265,11 @@
post_set();
}
+static int cyrix_num_var_ranges(void)
+{
+ return 8;
+}
+
static struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
.set_all = cyrix_set_all,
@@ -273,6 +278,7 @@
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
+ .num_var_ranges = cyrix_num_var_ranges,
};
int __init cyrix_init_mtrr(void)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5..42f30cd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -749,8 +749,16 @@
return 1;
}
-/*
- * Generic structure...
+static int generic_num_var_ranges(void)
+{
+ unsigned long config = 0, dummy;
+
+ rdmsr(MSR_MTRRcap, config, dummy);
+
+ return config & 0xff;
+}
+
+/* generic structure...
*/
struct mtrr_ops generic_mtrr_ops = {
.use_intel_if = 1,
@@ -760,4 +768,5 @@
.set = generic_set_mtrr,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = generic_have_wrcomb,
+ .num_var_ranges = generic_num_var_ranges,
};
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index fd60f09..aa21994 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -110,21 +110,6 @@
return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
}
-/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
-{
- unsigned long config = 0, dummy;
-
- if (use_intel())
- rdmsr(MSR_MTRRcap, config, dummy);
- else if (is_cpu(AMD))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
-
- num_var_ranges = config & 0xff;
-}
-
static void __init init_table(void)
{
int i, max;
@@ -721,8 +706,11 @@
}
}
+ /* Let Xen code override the above if it wants */
+ xen_init_mtrr();
+
if (mtrr_if) {
- set_num_var_ranges();
+ num_var_ranges = mtrr_if->num_var_ranges();
init_table();
if (use_intel()) {
get_mtrr_state();
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee..98569c3 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,6 +5,8 @@
#include <linux/types.h>
#include <linux/stddef.h>
+#include <asm/mtrr.h>
+
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
@@ -25,6 +27,8 @@
int (*validate_add_page)(unsigned long base, unsigned long size,
unsigned int type);
int (*have_wrcomb)(void);
+
+ int (*num_var_ranges)(void);
};
extern int generic_get_free_region(unsigned long base, unsigned long size,
@@ -73,6 +77,13 @@
int amd_init_mtrr(void);
int cyrix_init_mtrr(void);
int centaur_init_mtrr(void);
+#ifdef CONFIG_XEN_DOM0
+void xen_init_mtrr(void);
+#else
+static inline void xen_init_mtrr(void)
+{
+}
+#endif
extern int changed_by_mtrr_cleanup;
extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c
new file mode 100644
index 0000000..852018b
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/xen.c
@@ -0,0 +1,109 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/pat.h>
+
+#include "mtrr.h"
+
+#include <xen/xen.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+static void xen_set_mtrr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct xen_platform_op op;
+ int error;
+
+ /* mtrr_ops->set() is called once per CPU,
+ * but Xen's ops apply to all CPUs.
+ */
+ if (smp_processor_id())
+ return;
+
+ if (size == 0) {
+ op.cmd = XENPF_del_memtype;
+ op.u.del_memtype.handle = 0;
+ op.u.del_memtype.reg = reg;
+ } else {
+ op.cmd = XENPF_add_memtype;
+ op.u.add_memtype.mfn = base;
+ op.u.add_memtype.nr_mfns = size;
+ op.u.add_memtype.type = type;
+ }
+
+ error = HYPERVISOR_dom0_op(&op);
+ BUG_ON(error != 0);
+}
+
+static void xen_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type)
+{
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = reg;
+ if (HYPERVISOR_dom0_op(&op) != 0) {
+ *base = 0;
+ *size = 0;
+ *type = 0;
+ return;
+ }
+
+ *size = op.u.read_memtype.nr_mfns;
+ *base = op.u.read_memtype.mfn;
+ *type = op.u.read_memtype.type;
+}
+
+static int __init xen_num_var_ranges(void)
+{
+ int ranges;
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_read_memtype;
+
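+ /* Xen has no explicit count query; probe registers until the
+ hypercall fails. */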
+ for (ranges = 0; ; ranges++) {
+ op.u.read_memtype.reg = ranges;
+ if (HYPERVISOR_dom0_op(&op) != 0)
+ break;
+ }
+ return ranges;
+}
+
+/*
+ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full
+ * working userland mtrr support.
+ */
+static struct mtrr_ops xen_mtrr_ops = {
+ .vendor = X86_VENDOR_UNKNOWN,
+ .get_free_region = generic_get_free_region,
+ .set = xen_set_mtrr,
+ .get = xen_get_mtrr,
+ .have_wrcomb = positive_have_wrcomb,
+ .validate_add_page = generic_validate_add_page,
+ .use_intel_if = 0,
+ .num_var_ranges = xen_num_var_ranges,
+};
+
+void __init xen_init_mtrr(void)
+{
+ /*
+ * Check that we're running under Xen, and privileged enough
+ * to play with MTRRs.
+ */
+ if (!xen_initial_domain())
+ return;
+
+ /*
+ * Check that the CPU has an MTRR implementation we can
+ * support.
+ */
+ if (cpu_has_mtrr ||
+ cpu_has_k6_mtrr ||
+ cpu_has_cyrix_arr ||
+ cpu_has_centaur_mcr) {
+ mtrr_if = &xen_mtrr_ops;
+ pat_init();
+ }
+}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ff95824..ebd4c51 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,7 +28,6 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct die_args *args)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a89739a..5476113 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -750,6 +750,36 @@
return i;
}
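+/*
+ * If addr lies inside an early reservation, return that range's end;
+ * otherwise return addr unchanged.
+ */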
+u64 __init early_res_next_free(u64 addr)
+{
+ int i;
+ u64 end = addr;
+ struct early_res *r;
+
+ for (i = 0; i < MAX_EARLY_RES; i++) {
+ r = &early_res[i];
+ if (addr >= r->start && addr < r->end) {
+ end = r->end;
+ break;
+ }
+ }
+ return end;
+}
+
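+/*
+ * Return the start of the first early reservation at or above addr,
+ * or max if there is none.
+ */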
+u64 __init early_res_next_reserved(u64 addr, u64 max)
+{
+ int i;
+ struct early_res *r;
+ u64 next_res = max;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if ((r->start >= addr) && (r->start < next_res))
+ next_res = r->start;
+ }
+ return next_res;
+}
+
/*
* Drop the i-th range from the early reservation map,
* by copying any higher ranges down one over it, and
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d..7764118 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1088,6 +1088,9 @@
.previous
ENDPROC(xen_failsafe_callback)
+BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+ xen_evtchn_do_upcall)
+
#endif /* CONFIG_XEN */
#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 34a56a9..cba6d5a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1364,6 +1364,9 @@
CFI_ENDPROC
END(xen_failsafe_callback)
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
#endif /* CONFIG_XEN */
/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd7..6c5d59b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -76,6 +76,7 @@
/* Make NULL pointers segfault */
zap_identity_mappings();
+ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
/* Cleanup the over mapped high alias */
cleanup_highmap();
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 987ef29..5d1026c 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -101,7 +101,7 @@
}
__setup("hpet=", hpet_setup);
-static int __init disable_hpet(char *str)
+int __init disable_hpet(char *str)
{
boot_hpet_disable = 1;
return 1;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d30..919c1a8 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -30,13 +30,29 @@
}
}
+void native_set_io_bitmap(struct thread_struct *t,
+ unsigned long bytes_updated)
+{
+ struct tss_struct *tss;
+
+ if (!bytes_updated)
+ return;
+
+ tss = &__get_cpu_var(init_tss);
+
+ /* Update the TSS: */
+ if (t->io_bitmap_ptr)
+ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+ else
+ memset(tss->io_bitmap, 0xff, bytes_updated);
+}
+
/*
* this changes the io permissions bitmap in the current task.
*/
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
struct thread_struct *t = &current->thread;
- struct tss_struct *tss;
unsigned int i, max_long, bytes, bytes_updated;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
@@ -61,13 +77,13 @@
}
/*
- * do it in the per-thread copy and in the TSS ...
+ * do it in the per-thread copy
*
- * Disable preemption via get_cpu() - we must not switch away
+ * Disable preemption - we must not switch away
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(init_tss, get_cpu());
+ preempt_disable();
set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
@@ -85,10 +101,9 @@
t->io_bitmap_max = bytes;
- /* Update the TSS: */
- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+ set_io_bitmap(t, bytes_updated);
- put_cpu();
+ preempt_enable();
return 0;
}
@@ -119,11 +134,10 @@
return 0;
}
-#ifdef CONFIG_X86_32
-long sys_iopl(struct pt_regs *regs)
+asmlinkage long sys_iopl(unsigned int level)
{
- unsigned int level = regs->bx;
struct thread_struct *t = &current->thread;
+ struct pt_regs *regs = task_pt_regs(current);
int rc;
rc = do_iopl(level, regs);
@@ -135,9 +149,3 @@
out:
return rc;
}
-#else
-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
-{
- return do_iopl(level, regs);
-}
-#endif
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60..fa5b061 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -109,6 +109,9 @@
mutex_init(&mm->context.lock);
mm->context.size = 0;
+#ifdef CONFIG_XEN
+ mm->context.has_foreign_mappings = 0;
+#endif
old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
mutex_lock(&old_mm->context.lock);
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8..86ca771 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -81,6 +81,8 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -503,7 +505,9 @@
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
- if (c->x86_vendor == X86_VENDOR_INTEL)
+ if (xen_pv_domain())
+ microcode_ops = init_xen_microcode();
+ else if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
microcode_ops = init_amd_microcode();
diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c
new file mode 100644
index 0000000..16c742e
--- /dev/null
+++ b/arch/x86/kernel/microcode_xen.c
@@ -0,0 +1,201 @@
+/*
+ * Xen microcode update driver
+ *
+ * Xen does most of the work here. We just pass the whole blob into
+ * Xen, and it will apply it to all CPUs as appropriate. Xen will
+ * worry about how different CPU models are actually updated.
+ */
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/firmware.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+
+#include <asm/microcode.h>
+
+#include <xen/xen.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/xen.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+MODULE_DESCRIPTION("Xen microcode update driver");
+MODULE_LICENSE("GPL");
+
+struct xen_microcode {
+ size_t len;
+ char data[0];
+};
+
+static int xen_microcode_update(int cpu)
+{
+ int err;
+ struct xen_platform_op op;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct xen_microcode *uc = uci->mc;
+
+ if (uc == NULL || uc->len == 0) {
+ /*
+ * We do all cpus at once, so we don't need to do
+ * other cpus explicitly (besides, these vcpu numbers
+ * have no relationship to underlying physical cpus).
+ */
+ return 0;
+ }
+
+ op.cmd = XENPF_microcode_update;
+ set_xen_guest_handle(op.u.microcode.data, uc->data);
+ op.u.microcode.length = uc->len;
+
+ err = HYPERVISOR_dom0_op(&op);
+
+ if (err != 0)
+ printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err);
+
+ return err;
+}
+
+static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ret;
+ struct xen_microcode *uc;
+ size_t size;
+ int err;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_mask);
+ break;
+
+ case X86_VENDOR_AMD:
+ snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin");
+ break;
+
+ default:
+ return UCODE_NFOUND;
+ }
+
+ err = request_firmware(&firmware, name, device);
+ if (err) {
+ pr_debug("microcode: data file %s load failed\n", name);
+ return UCODE_NFOUND;
+ }
+
+ /*
+ * Only bother getting real firmware for cpu 0; the others get
+ * dummy placeholders.
+ */
+ if (cpu == 0)
+ size = firmware->size;
+ else
+ size = 0;
+
+ if (uci->mc != NULL) {
+ vfree(uci->mc);
+ uci->mc = NULL;
+ }
+
+ ret = UCODE_ERROR;
+ uc = vmalloc(sizeof(*uc) + size);
+ if (uc == NULL)
+ goto out;
+
+ ret = UCODE_OK;
+ uc->len = size;
+ memcpy(uc->data, firmware->data, uc->len);
+
+ uci->mc = uc;
+
+out:
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static enum ucode_state xen_request_microcode_user(int cpu,
+ const void __user *buf, size_t size)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct xen_microcode *uc;
+ enum ucode_state ret;
+ size_t unread;
+
+ if (cpu != 0) {
+ /* No real firmware for non-zero cpus; just store a
+ placeholder */
+ size = 0;
+ }
+
+ if (uci->mc != NULL) {
+ vfree(uci->mc);
+ uci->mc = NULL;
+ }
+
+ ret = UCODE_ERROR;
+ uc = vmalloc(sizeof(*uc) + size);
+ if (uc == NULL)
+ goto out;
+
+ uc->len = size;
+
+ ret = UCODE_NFOUND;
+
+ /* XXX This sporadically returns uncopied bytes, so we return
+ EFAULT. As far as I can see, the usermode code
+ (microcode_ctl) isn't doing anything wrong... */
+ unread = copy_from_user(uc->data, buf, size);
+
+ if (unread != 0) {
+ printk(KERN_WARNING "failed to read %zd of %zd bytes at %p -> %p\n",
+ unread, size, buf, uc->data);
+ goto out;
+ }
+
+ ret = UCODE_OK;
+
+out:
+ if (ret == UCODE_OK)
+ uci->mc = uc;
+ else
+ vfree(uc);
+
+ return ret;
+}
+
+static void xen_microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ vfree(uci->mc);
+ uci->mc = NULL;
+}
+
+static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig)
+{
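+ /* Xen applies the microcode itself, so there is no meaningful
+ per-cpu signature to report; return zeros. */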
+ sig->sig = 0;
+ sig->pf = 0;
+ sig->rev = 0;
+
+ return 0;
+}
+
+static struct microcode_ops microcode_xen_ops = {
+ .request_microcode_user = xen_request_microcode_user,
+ .request_microcode_fw = xen_request_microcode_fw,
+ .collect_cpu_info = xen_collect_cpu_info,
+ .apply_microcode = xen_microcode_update,
+ .microcode_fini_cpu = xen_microcode_fini_cpu,
+};
+
+struct microcode_ops * __init init_xen_microcode(void)
+{
+ if (!xen_initial_domain())
+ return NULL;
+ return &microcode_xen_ops;
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d..f7e115c 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -376,6 +376,7 @@
.swapgs = native_swapgs,
.set_iopl_mask = native_set_iopl_mask,
+ .set_io_bitmap = native_set_io_bitmap,
.io_delay = native_io_delay,
.start_context_switch = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 1a2d4b1..2f158a5 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -46,6 +46,7 @@
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
+#include <asm/x86_init.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -249,7 +250,7 @@
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
else
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
}
@@ -265,11 +266,11 @@
void *vaddr, unsigned int npages, int direction)
{
unsigned long entry;
- dma_addr_t ret = bad_dma_address;
+ dma_addr_t ret = DMA_ERROR_CODE;
entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == bad_dma_address))
+ if (unlikely(entry == DMA_ERROR_CODE))
goto error;
/* set the return dma address */
@@ -284,7 +285,7 @@
error:
printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
"iommu %p\n", npages, tbl);
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -295,8 +296,8 @@
unsigned long flags;
/* were we called with bad_dma_address? */
- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
+ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
"address 0x%Lx\n", dma_addr);
return;
@@ -380,7 +381,7 @@
npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == bad_dma_address) {
+ if (entry == DMA_ERROR_CODE) {
/* makes sure unmap knows to stop */
s->dma_length = 0;
goto error;
@@ -398,7 +399,7 @@
error:
calgary_unmap_sg(dev, sg, nelems, dir, NULL);
for_each_sg(sg, s, nelems, i) {
- sg->dma_address = bad_dma_address;
+ sg->dma_address = DMA_ERROR_CODE;
sg->dma_length = 0;
}
return 0;
@@ -453,7 +454,7 @@
/* set up tces to cover the allocated range */
mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == bad_dma_address)
+ if (mapping == DMA_ERROR_CODE)
goto free;
*dma_handle = mapping;
return ret;
@@ -734,7 +735,7 @@
struct iommu_table *tbl = pci_iommu(dev->bus);
/* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
+ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
/* for CalIOC2 - avoid the entire first MB */
@@ -1349,6 +1350,23 @@
return;
}
+static int __init calgary_iommu_init(void)
+{
+ int ret;
+
+ /* ok, we're trying to use Calgary - let's roll */
+ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+ ret = calgary_init();
+ if (ret) {
+ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+ "falling back to no_iommu\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
void __init detect_calgary(void)
{
int bus;
@@ -1362,7 +1380,7 @@
* if the user specified iommu=off or iommu=soft or we found
* another HW IOMMU already, bail out.
*/
- if (swiotlb || no_iommu || iommu_detected)
+ if (no_iommu || iommu_detected)
return;
if (!use_calgary)
@@ -1447,9 +1465,7 @@
printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
specified_table_size);
- /* swiotlb for devices that aren't behind the Calgary. */
- if (max_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
+ x86_init.iommu.iommu_init = calgary_iommu_init;
}
return;
@@ -1462,35 +1478,6 @@
}
}
-int __init calgary_iommu_init(void)
-{
- int ret;
-
- if (no_iommu || (swiotlb && !calgary_detected))
- return -ENODEV;
-
- if (!calgary_detected)
- return -ENODEV;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- force_iommu = 1;
- bad_dma_address = 0x0;
- /* dma_ops is set to swiotlb or nommu */
- if (!dma_ops)
- dma_ops = &nommu_dma_ops;
-
- return 0;
-}
-
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6ac3931..3e57c58 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,12 @@
#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
+#include <asm/x86_init.h>
+#include <asm/xen/swiotlb-xen.h>
static int forbid_dac __read_mostly;
-struct dma_map_ops *dma_ops;
+struct dma_map_ops *dma_ops = &nommu_dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -42,9 +44,6 @@
*/
int iommu_pass_through __read_mostly;
-dma_addr_t bad_dma_address __read_mostly = 0;
-EXPORT_SYMBOL(bad_dma_address);
-
/* Dummy device used for NULL arguments (normally ISA). */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
@@ -126,18 +125,19 @@
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
#endif
+ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
+ goto out;
- /*
- * The order of these functions is important for
- * fall-back/fail-over reasons
- */
gart_iommu_hole_init();
detect_calgary();
detect_intel_iommu();
+ /* needs to be called after gart_iommu_hole_init */
amd_iommu_detect();
+out:
+ pci_xen_swiotlb_init();
pci_swiotlb_init();
}
@@ -289,25 +289,17 @@
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
#endif
+ x86_init.iommu.iommu_init();
- calgary_iommu_init();
+ if (swiotlb || xen_swiotlb) {
+ printk(KERN_INFO "PCI-DMA: "
+ "Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ } else
+ swiotlb_free();
- intel_iommu_init();
-
- amd_iommu_init();
-
- gart_iommu_init();
-
- no_iommu_init();
return 0;
}
-
-void pci_iommu_shutdown(void)
-{
- gart_iommu_shutdown();
-
- amd_iommu_shutdown();
-}
/* Must execute after PCI subsystem */
rootfs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 1c76691..8c9dd05 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,6 +39,7 @@
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +47,8 @@
static u32 *iommu_gatt_base; /* Remapping table */
+static dma_addr_t bad_dma_addr;
+
/*
* If this is disabled the IOMMU will use an optimized flushing strategy
* of only flushing when a mapping is reused. With it true the GART is
@@ -216,7 +219,7 @@
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
- return bad_dma_address;
+ return bad_dma_addr;
}
for (i = 0; i < npages; i++) {
@@ -302,7 +305,7 @@
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir, 0);
- if (addr == bad_dma_address) {
+ if (addr == bad_dma_addr) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir, NULL);
nents = 0;
@@ -455,7 +458,7 @@
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
- s->dma_address = bad_dma_address;
+ s->dma_address = bad_dma_addr;
return 0;
}
@@ -479,7 +482,7 @@
DMA_BIDIRECTIONAL, align_mask);
flush_gart();
- if (paddr != bad_dma_address) {
+ if (paddr != bad_dma_addr) {
*dma_addr = paddr;
return page_address(page);
}
@@ -499,6 +502,11 @@
free_pages((unsigned long)vaddr, get_order(size));
}
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_addr);
+}
+
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -689,14 +697,15 @@
.unmap_page = gart_unmap_page,
.alloc_coherent = gart_alloc_coherent,
.free_coherent = gart_free_coherent,
+ .mapping_error = gart_mapping_error,
};
-void gart_iommu_shutdown(void)
+static void gart_iommu_shutdown(void)
{
struct pci_dev *dev;
int i;
- if (no_agp && (dma_ops != &gart_dma_ops))
+ if (no_agp)
return;
for (i = 0; i < num_k8_northbridges; i++) {
@@ -711,7 +720,7 @@
}
}
-void __init gart_iommu_init(void)
+int __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
@@ -721,7 +730,7 @@
long i;
if (num_k8_northbridges == 0)
- return;
+ return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
@@ -733,13 +742,6 @@
(agp_copy_info(agp_bridge, &info) < 0);
#endif
- if (swiotlb)
- return;
-
- /* Did we detect a different HW IOMMU? */
- if (iommu_detected && !gart_iommu_aperture)
- return;
-
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
@@ -749,7 +751,7 @@
"but GART IOMMU not available.\n");
printk(KERN_WARNING "falling back to iommu=soft.\n");
}
- return;
+ return 0;
}
/* need to map that range */
@@ -794,7 +796,7 @@
iommu_start = aper_size - iommu_size;
iommu_bus_base = info.aper_base + iommu_start;
- bad_dma_address = iommu_bus_base;
+ bad_dma_addr = iommu_bus_base;
iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
@@ -841,6 +843,10 @@
flush_gart();
dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ swiotlb = 0;
+
+ return 0;
}
void __init gart_parse_options(char *p)
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4..22be12b 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@
dma_addr_t bus = page_to_phys(page) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
- return bad_dma_address;
+ return DMA_ERROR_CODE;
flush_write_buffers();
return bus;
}
@@ -103,12 +103,3 @@
.sync_sg_for_device = nommu_sync_sg_for_device,
.is_phys = 1,
};
-
-void __init no_iommu_init(void)
-{
- if (dma_ops)
- return;
-
- force_iommu = 0; /* no HW IOMMU */
- dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b78..7d2829d 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@
.dma_supported = NULL,
};
-void __init pci_swiotlb_init(void)
+/*
+ * pci_swiotlb_detect - set swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_swiotlb_detect(void)
{
+ int use_swiotlb = swiotlb | swiotlb_force;
+
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
+ if (!no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
if (swiotlb_force)
swiotlb = 1;
+
+ return use_swiotlb;
+}
+
+void __init pci_swiotlb_init(void)
+{
if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
+ swiotlb_init(0);
dma_ops = &swiotlb_dma_ops;
}
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index fc6c84d..d079f92 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -73,16 +73,12 @@
unsigned long *bp = t->io_bitmap_ptr;
if (bp) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-
+ preempt_disable();
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+ set_io_bitmap(t, t->io_bitmap_max);
t->io_bitmap_max = 0;
- put_cpu();
+ preempt_enable();
kfree(bp);
}
}
@@ -199,19 +195,10 @@
hard_enable_TSC();
}
- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
- /*
- * Clear any possible leftover bits:
- */
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
- }
+ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) ||
+ test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+ set_io_bitmap(next,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
}
int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index cf98100..7a5cb07 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
#else
-# include <asm/iommu.h>
+# include <asm/x86_init.h>
#endif
/*
@@ -655,7 +655,7 @@
#endif
#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ x86_platform.iommu_shutdown();
#endif
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5449a26..56b4707 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -70,6 +70,7 @@
#include <linux/tboot.h>
#include <video/edid.h>
+#include <xen/xen.h>
#include <asm/mtrr.h>
#include <asm/apic.h>
@@ -89,6 +90,7 @@
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/bugs.h>
+#include <asm/tlbflush.h>
#include <asm/system.h>
#include <asm/vsyscall.h>
@@ -909,7 +911,6 @@
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
@@ -967,6 +968,9 @@
initmem_init(0, max_pfn);
+ /* Initialize cross-cpu tlb flushes */
+ init_smp_flush();
+
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
@@ -1037,6 +1041,7 @@
probe_nr_irqs_gsi();
kvm_guest_init();
+ xen_hvm_guest_init();
e820_reserve_resources();
e820_mark_nosave_regions(max_low_pfn);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a..d11c5ff 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -14,10 +14,13 @@
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/tsc.h>
+#include <asm/iommu.h>
void __cpuinit x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
void __init x86_init_pgd_noop(pgd_t *unused) { }
+int __init iommu_init_noop(void) { return 0; }
+void iommu_shutdown_noop(void) { }
/*
* The platform setup functions are preset with the default functions
@@ -62,6 +65,10 @@
.tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
},
+
+ .iommu = {
+ .iommu_init = iommu_init_noop,
+ },
};
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -72,4 +79,5 @@
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
+ .iommu_shutdown = iommu_shutdown_noop,
};
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 06630d2..ad895ae 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,11 @@
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_ioremap.o := $(nostackp)
+CFLAGS_init.o := $(nostackp)
+
obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ac0d76..7e7dbd1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -228,7 +228,16 @@
spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
- if (!vmalloc_sync_one(page_address(page), address))
+ spinlock_t *pgt_lock;
+ int ret;
+
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+ spin_lock(pgt_lock);
+ ret = vmalloc_sync_one(page_address(page), address);
+ spin_unlock(pgt_lock);
+
+ if (!ret)
break;
}
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -340,11 +349,19 @@
spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
pgd = (pgd_t *)page_address(page) + pgd_index(address);
+
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
else
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ spin_unlock(pgt_lock);
}
spin_unlock_irqrestore(&pgd_lock, flags);
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bc..892b8eb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -313,6 +313,11 @@
goto slow_irqon;
#endif
+#ifdef CONFIG_XEN
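+ /*
+ * Fast GUP assumes ordinary struct-page-backed memory; frames
+ * mapped in from other domains may not be, so take the slow path.
+ */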
+ if (unlikely(mm->context.has_foreign_mappings))
+ goto slow_irqon;
+#endif
+
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd55..3e3d5f9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -8,7 +8,6 @@
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
-#include <asm/setup.h>
#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
@@ -288,23 +287,8 @@
#endif
#ifdef CONFIG_X86_64
- if (!after_bootmem && !start) {
- pud_t *pud;
- pmd_t *pmd;
-
+ if (!after_bootmem)
mmu_cr4_features = read_cr4();
-
- /*
- * _brk_end cannot change anymore, but it and _end may be
- * located on different 2M pages. cleanup_highmap(), however,
- * can only consider _end when it runs, so destroy any
- * mappings beyond _brk_end here.
- */
- pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
- pmd = pmd_offset(pud, _brk_end - 1);
- while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
- pmd_clear(pmd);
- }
#endif
__flush_tlb_all();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1..10c3719 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -430,22 +430,45 @@
{
int node_pfn;
struct page *page;
+ phys_addr_t chunk_end, chunk_max;
unsigned long final_start_pfn, final_end_pfn;
- struct add_highpages_data *data;
-
- data = (struct add_highpages_data *)datax;
+ struct add_highpages_data *data = (struct add_highpages_data *)datax;
final_start_pfn = max(start_pfn, data->start_pfn);
final_end_pfn = min(end_pfn, data->end_pfn);
if (final_start_pfn >= final_end_pfn)
return 0;
- for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
- node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page, node_pfn);
+ chunk_end = PFN_PHYS(final_start_pfn);
+ chunk_max = PFN_PHYS(final_end_pfn);
+
+ /*
+ * Check for reserved areas.
+ */
+ for (;;) {
+ phys_addr_t chunk_start;
+ chunk_start = early_res_next_free(chunk_end);
+
+ /*
+ * Reserved area. Just count high mem pages.
+ */
+ for (node_pfn = PFN_DOWN(chunk_end);
+ node_pfn < PFN_DOWN(chunk_start); node_pfn++) {
+ if (pfn_valid(node_pfn))
+ totalhigh_pages++;
+ }
+
+ if (chunk_start >= chunk_max)
+ break;
+
+ chunk_end = early_res_next_reserved(chunk_start, chunk_max);
+ for (node_pfn = PFN_DOWN(chunk_start);
+ node_pfn < PFN_DOWN(chunk_end); node_pfn++) {
+ if (!pfn_valid(node_pfn))
+ continue;
+ page = pfn_to_page(node_pfn);
+ add_one_highpage_init(page, node_pfn);
+ }
}
return 0;
@@ -459,7 +482,6 @@
data.start_pfn = start_pfn;
data.end_pfn = end_pfn;
-
work_with_active_regions(nid, add_highpages_work_fn, &data);
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2feb9bd..2601df2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -425,6 +425,11 @@
return &bm_pte[pte_index(addr)];
}
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+ return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
void __init early_ioremap_init(void)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0e..fb91994 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -666,7 +666,7 @@
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
- return vma_prot;
+ return __pgprot(pgprot_val(vma_prot) | _PAGE_IOMAP);
}
#ifdef CONFIG_STRICT_DEVMEM
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e0e6fad..a0db43b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,9 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#ifdef CONFIG_HIGHPTE
@@ -14,6 +17,16 @@
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+pgprot_t arch_vm_get_page_prot(unsigned vm_flags)
+{
+ pgprot_t ret = __pgprot(0);
+
+ if (vm_flags & VM_IO)
+ ret = __pgprot(_PAGE_IOMAP);
+
+ return ret;
+}
+
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -86,7 +99,19 @@
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(pgd_t *pgd)
+
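+/*
+ * Stash the owning mm in the pgd page's unused ->index field so the
+ * fault path can look up the mm and take its page_table_lock while
+ * syncing kernel mappings.
+ */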
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+ BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+ virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+ return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -104,8 +129,10 @@
}
/* list required to sync kernel mapping updates */
- if (!SHARED_KERNEL_PMD)
+ if (!SHARED_KERNEL_PMD) {
+ pgd_set_mm(pgd, mm);
pgd_list_add(pgd);
+ }
}
static void pgd_dtor(pgd_t *pgd)
@@ -270,7 +297,7 @@
*/
spin_lock_irqsave(&pgd_lock, flags);
- pgd_ctor(pgd);
+ pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -287,6 +314,12 @@
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
+#ifdef CONFIG_XEN
+ /* EEW */
+ extern void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd);
+ if (xen_pv_domain())
+ xen_late_unpin_pgd(mm, pgd);
+#endif
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08e..7317947 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -148,13 +148,25 @@
* BUG();
*/
- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (f->flush_mm == NULL ||
+ f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+ int tlbstate = percpu_read(cpu_tlbstate.state);
+
+ /*
+ * flush_mm == NULL means flush everything, including
+ * global tlbs, which will only happen when flushing
+ * kernel mappings.
+ */
+ if (f->flush_mm == NULL)
+ __flush_tlb_all();
+ else if (tlbstate == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
__flush_tlb_one(f->flush_va);
- } else
+ }
+
+ if (tlbstate == TLBSTATE_LAZY)
leave_mm(cpu);
}
out:
@@ -217,16 +229,13 @@
flush_tlb_others_ipi(cpumask, mm, va);
}
-static int __cpuinit init_smp_flush(void)
+void __init init_smp_flush(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(flush_state); i++)
spin_lock_init(&flush_state[i].tlbstate_lock);
-
- return 0;
}
-core_initcall(init_smp_flush);
void flush_tlb_current_task(void)
{
@@ -274,17 +283,19 @@
preempt_enable();
}
+EXPORT_SYMBOL_GPL(flush_tlb_page);
-static void do_flush_tlb_all(void *info)
+void flush_tlb_all(void)
{
- unsigned long cpu = smp_processor_id();
+ /* flush_tlb_others expects preempt to be disabled */
+ int cpu = get_cpu();
+
+ flush_tlb_others(cpu_online_mask, NULL, TLB_FLUSH_ALL);
__flush_tlb_all();
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
-}
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+ put_cpu();
}
+EXPORT_SYMBOL_GPL(flush_tlb_all);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e..64182c5 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -4,6 +4,7 @@
obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
obj-$(CONFIG_PCI_DIRECT) += direct.o
obj-$(CONFIG_PCI_OLPC) += olpc.o
+obj-$(CONFIG_PCI_XEN) += xen.o
obj-y += fixup.o
obj-$(CONFIG_ACPI) += acpi.o
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf..30a9808 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@
unsigned int pci_early_dump_regs;
static int pci_bf_sort;
int pci_routeirq;
+int pci_scan_all_fns;
int noioapicquirk;
#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
int noioapicreroute = 0;
@@ -412,26 +413,31 @@
extern u8 pci_cache_line_size;
-int __init pcibios_init(void)
+void __init pcibios_set_cache_line_size(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (!raw_pci_ops) {
- printk(KERN_WARNING "PCI: System does not support PCI\n");
- return 0;
- }
-
/*
* Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
* and P4. It's also good for 386/486s (which actually have 16)
* as quite a few PCI devices do not support smaller values.
*/
+
pci_cache_line_size = 32 >> 2;
if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
pci_cache_line_size = 64 >> 2; /* K7 & K8 */
else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
pci_cache_line_size = 128 >> 2; /* P4 */
+}
+int __init pcibios_init(void)
+{
+ if (!raw_pci_ops) {
+ printk(KERN_WARNING "PCI: System does not support PCI\n");
+ return 0;
+ }
+
+ pcibios_set_cache_line_size();
pcibios_resource_survey();
if (pci_bf_sort >= pci_force_bf)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a672f12..91d040e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -283,6 +283,8 @@
prot = pgprot_val(vma->vm_page_prot);
+ prot |= _PAGE_IOMAP; /* creating a mapping for IO */
+
/*
* Return error if pat is not enabled and write_combine is requested.
* Caller can followup with UC MINUS request and add a WC mtrr if there
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 25a1f8e..4e2f90a 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -15,10 +15,16 @@
if (!(pci_probe & PCI_PROBE_NOEARLY))
pci_mmcfg_early_init();
+#ifdef CONFIG_PCI_XEN
+ if (!pci_xen_init())
+ return 0;
+#endif
+
#ifdef CONFIG_PCI_OLPC
if (!pci_olpc_init())
return 0; /* skip additional checks if it's an XO */
#endif
+
#ifdef CONFIG_PCI_BIOS
pci_pcbios_init();
#endif
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
new file mode 100644
index 0000000..67fa926
--- /dev/null
+++ b/arch/x86/pci/xen.c
@@ -0,0 +1,154 @@
+/*
+ * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
+ * x86 PCI core to support the Xen PCI Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+
+#include <asm/io.h>
+#include <asm/pci_x86.h>
+
+#include <asm/xen/hypervisor.h>
+
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+
+#if defined(CONFIG_PCI_MSI)
+#include <linux/msi.h>
+
+struct xen_pci_frontend_ops *xen_pci_frontend;
+EXPORT_SYMBOL_GPL(xen_pci_frontend);
+
+/*
+ * For MSI interrupts we have to use the drivers/xen/events.c functions
+ * to allocate an irq_desc and set up the right irq.
+ */
+
+
+int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret, i;
+ struct msi_desc *msidesc;
+ int *v;
+
+ /* Dom0 has another mechanism for this. The exit path
+ * (xen_pci_teardown_msi_irq) is shared with Dom0.
+ */
+ if (xen_initial_domain())
+ return xen_setup_msi_irqs(dev, nvec, type);
+
+ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ if (!xen_initial_domain()) {
+ if (type == PCI_CAP_ID_MSIX)
+ ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+ else
+ ret = xen_pci_frontend_enable_msi(dev, &v);
+ if (ret)
+ goto error;
+ }
+ i = 0;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_allocate_pirq(v[i], 0, /* not sharable */
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x":"pcifront-msi");
+ if (irq < 0) {
+ ret = -1;
+ goto error;
+ }
+
+ ret = set_irq_msi(irq, msidesc);
+ if (ret)
+ goto error_while;
+ i++;
+ }
+ kfree(v);
+ return 0;
+
+error_while:
+ unbind_from_irqhandler(irq, NULL);
+error:
+ if (ret == -ENODEV)
+ dev_err(&dev->dev,"Xen PCI frontend has not registered" \
+ " MSI/MSI-X support!\n");
+
+ kfree(v);
+ return ret;
+}
+
+void xen_pci_teardown_msi_dev(struct pci_dev *dev)
+{
+ /* Only do this when we are in non-privileged mode. */
+ if (!xen_initial_domain()) {
+ struct msi_desc *msidesc;
+
+ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ if (msidesc->msi_attrib.is_msix)
+ xen_pci_frontend_disable_msix(dev);
+ else
+ xen_pci_frontend_disable_msi(dev);
+ }
+
+}
+
+void xen_pci_teardown_msi_irq(int irq)
+{
+ xen_destroy_irq(irq);
+}
+#endif
+
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+ int rc;
+ int share = 1;
+
+ dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
+
+ if (dev->irq < 0)
+ return -EINVAL;
+
+ if (dev->irq < NR_IRQS_LEGACY)
+ share = 0;
+
+ rc = xen_allocate_pirq(dev->irq, share, "pcifront");
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
+ dev->irq, rc);
+ return rc;
+ }
+ return 0;
+}
+
+int __init pci_xen_init(void)
+{
+ if (!xen_pv_domain() || xen_initial_domain())
+ return -ENODEV;
+
+ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
+
+ pcibios_set_cache_line_size();
+
+ pcibios_enable_irq = xen_pcifront_enable_irq;
+ pcibios_disable_irq = NULL;
+
+#ifdef CONFIG_ACPI
+ /* Keep ACPI out of the picture */
+ acpi_noirq = 1;
+#endif
+
+#ifdef CONFIG_ISAPNP
+ /* Stop isapnp from probing */
+ isapnp_disable = 1;
+#endif
+
+ /* Ensure a device still gets scanned even if its function
+ * number is non-zero.
+ */
+ pci_scan_all_fns = 1;
+
+ return 0;
+}
+
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119..3f9f4a0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,16 +13,18 @@
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
+config XEN_PVHVM
+ def_bool y
+ depends on XEN
+ depends on X86_LOCAL_APIC
+
config XEN_MAX_DOMAIN_MEMORY
- int "Maximum allowed size of a domain in gigabytes"
- default 8 if X86_32
- default 32 if X86_64
+ int
+ default 128
depends on XEN
help
- The pseudo-physical to machine address array is sized
- according to the maximum possible memory size of a Xen
- domain. This array uses 1 page per gigabyte, so there's no
- need to be too stingy here.
+ This only affects the sizing of some bss arrays, the unused
+ portions of which are freed.
config XEN_SAVE_RESTORE
bool
@@ -36,3 +38,40 @@
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+
+config SWIOTLB_XEN
+ def_bool y
+ depends on XEN && SWIOTLB
+
+config MICROCODE_XEN
+ def_bool y
+ depends on XEN_DOM0 && MICROCODE
+
+config XEN_DOM0
+ bool "Enable Xen privileged domain support"
+ depends on XEN && X86_IO_APIC && ACPI
+ help
+ The Xen hypervisor requires a privileged domain ("dom0") to
+ actually manage the machine, provide devices drivers, etc.
+ This option enables dom0 support. A dom0 kernel can also
+ run as an unprivileged domU kernel, or a kernel running
+ native on bare hardware.
+
+# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
+# name in tools.
+config XEN_PRIVILEGED_GUEST
+ def_bool XEN_DOM0
+
+config XEN_DOM0_PCI
+ def_bool y
+ depends on XEN_DOM0 && PCI
+ select PCI_XEN
+
+config XEN_PCI_PASSTHROUGH
+ bool "Enable support for Xen PCI passthrough devices"
+ depends on XEN && PCI
+ select PCI_XEN
+ select SWIOTLB_XEN
+ help
+ Enable support for passing PCI devices through to
+ unprivileged domains. (COMPLETELY UNTESTED)
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc2..13ca65c 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,9 +12,12 @@
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o
+ grant-table.o suspend.o platform-pci-unplug.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-
+obj-$(CONFIG_XEN_DOM0) += vga.o
+obj-$(CONFIG_XEN_DOM0) += apic.o
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_DOM0_PCI) += pci.o
\ No newline at end of file
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 0000000..21a3089
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,33 @@
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/bitmap.h>
+
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
+#include <asm/hw_irq.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+
+void __init xen_io_apic_init(void)
+{
+ enable_IO_APIC();
+}
+
+void xen_init_apic(void)
+{
+ if (!xen_initial_domain())
+ return;
+
+#ifdef CONFIG_ACPI
+ /*
+ * Pretend ACPI found our lapic even though we've disabled it,
+ * to prevent MP tables from setting up lapics.
+ */
+ acpi_lapic = 1;
+#endif
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0087b00..7a48c0b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -28,12 +29,15 @@
#include <linux/highmem.h>
#include <linux/console.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvm.h>
#include <xen/hvc-console.h>
#include <asm/paravirt.h>
@@ -53,6 +57,7 @@
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -66,6 +71,11 @@
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
@@ -73,6 +83,9 @@
void *xen_initial_gdt;
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
@@ -94,6 +107,14 @@
*/
static int have_vcpu_info_placement = 1;
+static void clamp_max_cpus(void)
+{
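+ /* The shared_info page only carries vcpu_info slots for
+ MAX_VIRT_CPUS vcpus; without vcpu_info placement any further
+ vcpus are unreachable. */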
+#ifdef CONFIG_SMP
+ if (setup_max_cpus > MAX_VIRT_CPUS)
+ setup_max_cpus = MAX_VIRT_CPUS;
+#endif
+}
+
static void xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
@@ -101,19 +122,20 @@
struct vcpu_info *vcpup;
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- if (!have_vcpu_info_placement)
- return; /* already tested, not available */
+ if (cpu < MAX_VIRT_CPUS)
+ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+ if (!have_vcpu_info_placement) {
+ if (cpu >= MAX_VIRT_CPUS)
+ clamp_max_cpus();
+ return;
+ }
vcpup = &per_cpu(xen_vcpu_info, cpu);
-
info.mfn = arbitrary_virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
- cpu, vcpup, info.mfn, info.offset);
-
/* Check to see if the hypervisor will put the vcpu_info
structure where we want it, which allows direct access via
a percpu-variable. */
@@ -122,13 +144,11 @@
if (err) {
printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
have_vcpu_info_placement = 0;
+ clamp_max_cpus();
} else {
/* This cpu is using the registered vcpu info, even if
later ones fail to. */
per_cpu(xen_vcpu, cpu) = vcpup;
-
- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
- cpu, vcpup);
}
}
@@ -167,13 +187,16 @@
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
- printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+ printk(KERN_INFO "Xen version: %d.%d%s%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
+ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ?
+ " (preserve-AD)" : "",
+ xen_initial_domain() ? " (dom0)" : "");
}
static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
@@ -187,7 +210,7 @@
* unsupported kernel subsystems as possible.
*/
switch (*ax) {
- case 1:
+ case 0x1:
maskecx = cpuid_leaf1_ecx_mask;
maskedx = cpuid_leaf1_edx_mask;
break;
@@ -196,6 +219,10 @@
/* Suppress extended topology stuff */
maskebx = 0;
break;
+
+ case 0x80000001:
+ maskedx = cpuid_leaf81_edx_mask;
+ break;
}
asm(XEN_EMULATE_PREFIX "cpuid"
@@ -213,34 +240,29 @@
static __init void xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
+ unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
- ~((1 << X86_FEATURE_MCE) | /* disable MCE */
- (1 << X86_FEATURE_MCA) | /* disable MCA */
- (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ ~(1 << X86_FEATURE_ACC); /* thermal monitoring */
+
+ cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32));
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
- ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
+ ~((1 << X86_FEATURE_MCE) | /* disable MCE */
+ (1 << X86_FEATURE_MCA) | /* disable MCA */
+ (1 << X86_FEATURE_APIC) | /* disable local APIC */
(1 << X86_FEATURE_ACPI)); /* disable ACPI */
-
ax = 1;
- cx = 0;
xen_cpuid(&ax, &bx, &cx, &dx);
- /* cpuid claims we support xsave; try enabling it to see what happens */
- if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
- unsigned long cr4;
+ xsave_mask =
+ (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
- set_in_cr4(X86_CR4_OSXSAVE);
-
- cr4 = read_cr4();
-
- if ((cr4 & X86_CR4_OSXSAVE) == 0)
- cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
- clear_in_cr4(X86_CR4_OSXSAVE);
- }
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ if ((cx & xsave_mask) != xsave_mask)
+ cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
}
static void xen_set_debugreg(int reg, unsigned long val)
@@ -406,7 +428,7 @@
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
- if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ if (HYPERVISOR_update_va_mapping(va, pte, 0))
BUG();
frames[f] = mfn;
@@ -517,13 +539,13 @@
return 0;
#ifdef CONFIG_X86_MCE
} else if (addr == (unsigned long)machine_check) {
- return 0;
+ /* We can use the original machine_check handler,
+ despite IST. */
#endif
- } else {
- /* Some other trap using IST? */
- if (WARN_ON(val->ist != 0))
- return 0;
- }
+ } else if (WARN(val->ist != 0,
+ "Unknown IST-using trap: vector %d, %pF, val->ist=%d\n",
+ vector, (void *)addr, val->ist))
+ return 0;
#endif /* CONFIG_X86_64 */
info->address = addr;
@@ -679,6 +701,18 @@
HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
+static void xen_set_io_bitmap(struct thread_struct *thread,
+ unsigned long bytes_updated)
+{
+ struct physdev_set_iobitmap set_iobitmap;
+
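+ /* Hand the whole per-thread bitmap to Xen; nr_ports == 0
+ clears the thread's IO permissions. */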
+ set_xen_guest_handle(set_iobitmap.bitmap,
+ (char *)thread->io_bitmap_ptr);
+ set_iobitmap.nr_ports = thread->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+ &set_iobitmap));
+}
+
static void xen_io_delay(void)
{
}
@@ -716,7 +750,7 @@
return 0;
}
-static void set_xen_basic_apic_ops(void)
+static __init void set_xen_basic_apic_ops(void)
{
apic->read = xen_apic_read;
apic->write = xen_apic_write;
@@ -728,7 +762,6 @@
#endif
-
static void xen_clts(void)
{
struct multicall_space mcs;
@@ -811,6 +844,11 @@
Xen console noise. */
break;
+ case MSR_IA32_CR_PAT:
+ if (smp_processor_id() == 0)
+ xen_set_pat(((u64)high << 32) | low);
+ break;
+
default:
ret = native_write_msr_safe(msr, low, high);
}
@@ -849,8 +887,6 @@
/* xen_vcpu_setup managed to place the vcpu_info within the
percpu area for all cpus, so make use of it */
if (have_vcpu_info_placement) {
- printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -923,10 +959,6 @@
.patch = xen_patch,
};
-static const struct pv_time_ops xen_time_ops __initdata = {
- .sched_clock = xen_clocksource_read,
-};
-
static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.cpuid = xen_cpuid,
@@ -978,6 +1010,7 @@
.load_sp0 = xen_load_sp0,
.set_iopl_mask = xen_set_iopl_mask,
+ .set_io_bitmap = xen_set_io_bitmap,
.io_delay = xen_io_delay,
/* Xen takes care of %gs when switching to usermode for us */
@@ -1016,15 +1049,40 @@
xen_reboot(SHUTDOWN_poweroff);
}
+static void xen_machine_power_off(void)
+{
+ if (pm_power_off)
+ pm_power_off();
+ else
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
}
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ xen_reboot(SHUTDOWN_crash);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xen_panic_block = {
+ .notifier_call = xen_panic_event,
+};
+
+int xen_panic_handler_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ return 0;
+}
+
static const struct machine_ops __initdata xen_machine_ops = {
.restart = xen_restart,
.halt = xen_machine_halt,
- .power_off = xen_machine_halt,
+ .power_off = xen_machine_power_off,
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
@@ -1057,10 +1115,11 @@
xen_domain_type = XEN_PV_DOMAIN;
+ xen_setup_machphys_mapping();
+
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
@@ -1068,13 +1127,7 @@
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
- x86_init.timers.timer_init = xen_time_init;
- x86_init.timers.setup_percpu_clockev = x86_init_noop;
- x86_cpuinit.setup_percpu_clockev = x86_init_noop;
-
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
- x86_platform.set_wallclock = xen_set_wallclock;
+ xen_init_time_ops();
/*
* Set up some pagetable state before starting to set any ptes.
@@ -1112,6 +1165,10 @@
*/
xen_setup_stackprotector();
+#ifdef CONFIG_SPARSE_IRQ
+ nr_dynamic_irqs += 256;
+#endif
+
xen_init_irq_ops();
xen_init_cpuid_mask();
@@ -1138,8 +1195,19 @@
xen_smp_init();
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * The pages we get from Xen are not related to machine pages, so
+ * any NUMA information the kernel tries to get from ACPI will
+ * be meaningless. Prevent it from trying.
+ */
+ acpi_numa = -1;
+#endif
+
pgd = (pgd_t *)xen_start_info->pt_base;
+ __supported_pte_mask |= _PAGE_IOMAP;
+
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1149,6 +1217,10 @@
xen_raw_console_write("mapping kernel into physical memory\n");
pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+ xen_ident_map_ISA();
+
+ /* Allocate and initialize top and mid mfn levels for p2m structure */
+ xen_build_mfn_list_list();
init_mm.pgd = pgd;
@@ -1158,6 +1230,14 @@
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
+ if (xen_initial_domain()) {
+ struct physdev_set_iopl set_iopl;
+ set_iopl.iopl = 1;
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) == -1)
+ BUG();
+ xen_init_apic();
+ }
+
/* set the limit of our address space */
xen_reserve_top();
@@ -1180,6 +1260,16 @@
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
+
+ boot_params.screen_info.orig_video_isVGA = 0;
+ } else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
}
xen_raw_console_write("about to get started...\n");
@@ -1193,3 +1283,128 @@
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
+
+static uint32_t xen_cpuid_base(void)
+{
+ uint32_t base, eax, ebx, ecx, edx;
+ char signature[13];
+
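+ /*
+ * Hypervisor leaves start at 0x40000000; scan upwards in steps
+ * of 0x100 in case Xen's leaves have been displaced by another
+ * hypervisor interface.
+ */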
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &ebx, &ecx, &edx);
+ *(uint32_t *)(signature + 0) = ebx;
+ *(uint32_t *)(signature + 4) = ecx;
+ *(uint32_t *)(signature + 8) = edx;
+ signature[12] = 0;
+
+ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+ return base;
+ }
+
+ return 0;
+}
+
+static int init_hvm_pv_info(int *major, int *minor)
+{
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ if (!base)
+ return -EINVAL;
+
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ *major = eax >> 16;
+ *minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
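+ /* Leaf base+2 reports the MSR used to tell Xen where to place
+ the hypercall page. */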
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info.name = "Xen HVM";
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+
+ return 0;
+}
+
+void xen_hvm_init_shared_info(void)
+{
+ int cpu;
+ struct xen_add_to_physmap xatp;
+ static struct shared_info *shared_info_page = NULL;
+
+ if (!shared_info_page)
+ shared_info_page = (struct shared_info *) alloc_bootmem_pages(PAGE_SIZE);
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+
+ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions. We don't need the vcpu_info placement
+ * optimizations because we don't use any pv_mmu or pv_irq op on
+ * HVM.
+ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_init_shared_info is run at resume time too and
+ * in that case multiple vcpus might be online. */
+ for_each_online_cpu(cpu) {
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ }
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ if (xen_have_vector_callback)
+ xen_init_lock_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+ .notifier_call = xen_hvm_cpu_notify,
+};
+
+void __init xen_hvm_guest_init(void)
+{
+ int r;
+ int major, minor;
+
+ if (xen_pv_domain())
+ return;
+
+ r = init_hvm_pv_info(&major, &minor);
+ if (r < 0)
+ return;
+
+ xen_hvm_init_shared_info();
+
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_have_vector_callback = 1;
+ xen_hvm_smp_init();
+ register_cpu_notifier(&xen_hvm_cpu_notifier);
+ xen_unplug_emulated_devices();
+ have_vcpu_info_placement = 0;
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
+}
+#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f90a2c..18f940e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,6 +42,7 @@
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
+#include <linux/vmalloc.h>
#include <linux/module.h>
#include <asm/pgtable.h>
@@ -50,14 +51,20 @@
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
+#include <asm/e820.h>
#include <asm/linkage.h>
+#include <asm/pat.h>
+#include <asm/init.h>
+#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
+#include <xen/interface/memory.h>
#include <xen/hvc-console.h>
#include "multicalls.h"
@@ -66,6 +73,13 @@
#define MMU_UPDATE_HISTO 30
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct {
@@ -124,7 +138,8 @@
* large enough to allocate page table pages to allocate the rest.
* Each page can map 2MB.
*/
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
@@ -155,49 +170,202 @@
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
-#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
+unsigned long xen_max_p2m_pfn __read_mostly;
-/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
- /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
- { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
- __page_aligned_bss;
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
static inline unsigned p2m_top_index(unsigned long pfn)
{
- BUG_ON(pfn >= MAX_DOMAIN_PAGES);
- return pfn / P2M_ENTRIES_PER_PAGE;
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
static inline unsigned p2m_index(unsigned long pfn)
{
- return pfn % P2M_ENTRIES_PER_PAGE;
+ return pfn % P2M_PER_PAGE;
}
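
The three index helpers above implement the split described in the tree comment; as a concrete illustration (not part of the patch), on 64-bit each level holds 512 entries, so a pfn decomposes and recombines like this standalone sketch:

    /* Demo of the 3-level p2m index split (64-bit: 512 entries per level). */
    #include <stdio.h>

    #define P2M_PER_PAGE     512   /* PAGE_SIZE / sizeof(unsigned long) */
    #define P2M_MID_PER_PAGE 512   /* PAGE_SIZE / sizeof(unsigned long *) */

    int main(void)
    {
        unsigned long pfn = 0x12345;
        unsigned top = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
        unsigned mid = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
        unsigned idx = pfn % P2M_PER_PAGE;
        /* Recombining the indices yields the original pfn. */
        unsigned long back = ((unsigned long)top * P2M_MID_PER_PAGE + mid)
                                 * P2M_PER_PAGE + idx;

        printf("pfn %#lx -> top %u mid %u idx %u (back %#lx)\n",
               pfn, top, mid, idx, back);
        return 0;
    }
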
-/* Build the parallel p2m_top_mfn structures */
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+static int lookup_pte_fn(
+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ uint64_t *ptep = (uint64_t *)data;
+ if (ptep)
+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+ return 0;
+}
+
+int create_lookup_pte_addr(struct mm_struct *mm,
+ unsigned long address,
+ uint64_t *ptep)
+{
+ return apply_to_page_range(mm, address, PAGE_SIZE,
+ lookup_pte_fn, ptep);
+}
+
+EXPORT_SYMBOL(create_lookup_pte_addr);
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should already be completely allocated.
+ */
void xen_build_mfn_list_list(void)
{
- unsigned pfn, idx;
+ unsigned long pfn;
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise; mfns all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing; just update the stored mfn,
+ * since all of the mfns could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
@@ -206,8 +374,8 @@
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn_list);
- HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -215,98 +383,188 @@
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned pfn;
+ unsigned long pfn;
- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
- p2m_top[topidx] = &mfn_list[pfn];
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid);
+
+ p2m_top[topidx] = mid;
+ }
+
+ /*
+ * As long as the mfn_list has enough entries to completely
+ * fill a p2m page, pointing into the array is ok. But if
+ * not, the entries beyond the last pfn will be undefined.
+ */
+ if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
+ unsigned long p2midx;
+
+ p2midx = max_pfn % P2M_PER_PAGE;
+ for ( ; p2midx < P2M_PER_PAGE; p2midx++)
+ mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
+ }
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
}
-
- xen_build_mfn_list_list();
}
unsigned long get_phys_to_machine(unsigned long pfn)
{
- unsigned topidx, idx;
+ unsigned topidx, mididx, idx;
- if (unlikely(pfn >= MAX_DOMAIN_PAGES))
+ if (unlikely(pfn >= MAX_P2M_PFN))
return INVALID_P2M_ENTRY;
topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
- return p2m_top[topidx][idx];
+
+ return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-/* install a new p2m_top page */
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
+static void *alloc_p2m_page(void)
{
- unsigned topidx = p2m_top_index(pfn);
- unsigned long **pfnp, *mfnp;
- unsigned i;
-
- pfnp = &p2m_top[topidx];
- mfnp = &p2m_top_mfn[topidx];
-
- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
- p[i] = INVALID_P2M_ENTRY;
-
- if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
- *mfnp = virt_to_mfn(p);
- return true;
- }
-
- return false;
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}
-static void alloc_p2m(unsigned long pfn)
+static void free_p2m_page(void *p)
{
- unsigned long *p;
+ free_page((unsigned long)p);
+}
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
- if (!install_p2mtop_page(pfn, p))
- free_page((unsigned long)p);
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
}
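
alloc_p2m() uses the same allocate/cmpxchg/free-on-loss idiom at every level; a standalone sketch of the pattern, with C11 atomics standing in for the kernel's cmpxchg (names illustrative):

    /* Install-or-lose-the-race: swing a slot from a shared placeholder
     * to a freshly allocated object, freeing ours if someone else won. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int missing;                     /* plays the p2m_missing role */
    static _Atomic(int *) slot = &missing;

    static void install_level(void)
    {
        int *fresh = malloc(sizeof(*fresh));
        int *expect = &missing;

        if (!fresh)
            return;
        *fresh = 42;
        if (!atomic_compare_exchange_strong(&slot, &expect, fresh))
            free(fresh);                    /* lost the race; use winner's */
    }

    int main(void)
    {
        install_level();
        printf("*slot = %d\n", *atomic_load(&slot));
        return 0;
    }
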
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- unsigned topidx, idx;
+ unsigned topidx, mididx, idx;
- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
topidx = p2m_top_index(pfn);
- if (p2m_top[topidx] == p2m_missing) {
- if (mfn == INVALID_P2M_ENTRY)
- return true;
- return false;
- }
-
+ mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
- p2m_top[topidx][idx] = mfn;
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
return true;
}
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return;
+ return true;
}
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- alloc_p2m(pfn);
+ if (!alloc_p2m(pfn))
+ return false;
if (!__set_phys_to_machine(pfn, mfn))
- BUG();
+ return false;
}
+
+ return true;
}
unsigned long arbitrary_virt_to_mfn(void *vaddr)
@@ -315,6 +573,7 @@
return PFN_DOWN(maddr.maddr);
}
+EXPORT_SYMBOL_GPL(set_phys_to_machine);
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
@@ -345,7 +604,8 @@
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_wrprotect(*pte);
@@ -360,7 +620,8 @@
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_mkwrite(*pte);
@@ -376,6 +637,24 @@
return PagePinned(page);
}
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+
+ /* ptep might be kmapped when using 32-bit HIGHPTE */
+ u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ u->val = pte_val_ma(pteval);
+
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -516,7 +795,34 @@
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
- val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /*
+ * If there's no mfn for the pfn, then just create an
+ * empty non-present pte. Unfortunately this loses
+ * information about the original pfn, so
+ * pte_mfn_to_pfn is asymmetric.
+ */
+ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+ mfn = 0;
+ flags = 0;
+ }
+
+ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+static pteval_t iomap_pte(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+
+ /* We assume the pte frame number is a MFN, so
+ just use it as-is. */
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
return val;
@@ -524,7 +830,18 @@
pteval_t xen_pte_val(pte_t pte)
{
- return pte_mfn_to_pfn(pte.pte);
+ pteval_t pteval = pte.pte;
+
+ /* If this is a WC pte, convert back from Xen WC to Linux WC */
+ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+ WARN_ON(!pat_enabled);
+ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+ }
+
+ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+ return pteval;
+
+ return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -534,9 +851,62 @@
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx  PTE flags        Linux    Xen     Default
+ * 0                     WB       WB      WB
+ * 1    PWT              WC       WT      WT
+ * 2    PCD              UC-      UC-     UC-
+ * 3    PCD PWT          UC       UC      UC
+ * 4    PAT              WB       WC      WB
+ * 5    PAT PWT          WC       WP      WT
+ * 6    PAT PCD          UC-      UC      UC-
+ * 7    PAT PCD PWT      UC       UC      UC
+ */
+
+void xen_set_pat(u64 pat)
+{
+ /* We expect Linux to use a PAT setting of
+ * UC UC- WC WB (ignoring the PAT flag) */
+ WARN_ON(pat != 0x0007010600070106ull);
+}
+
pte_t xen_make_pte(pteval_t pte)
{
- pte = pte_pfn_to_mfn(pte);
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+
+ /* If Linux is trying to set a WC pte, then map to the Xen WC.
+ * If _PAGE_PAT is set, then it probably means it is really
+ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+ * things work out OK...
+ *
+ * (We should never see kernel mappings with _PAGE_PSE set,
+ * but we could see hugetlbfs mappings, I think.)
+ */
+ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+ }
+
+ /*
+ * Unprivileged domains are allowed to do IOMAPpings for
+ * PCI passthrough, but not map ISA space. The ISA
+ * mappings are just dummy local mappings to keep other
+ * parts of the kernel happy.
+ */
+ if (unlikely(pte & _PAGE_IOMAP) &&
+ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+ pte = iomap_pte(pte);
+ } else {
+ pte &= ~_PAGE_IOMAP;
+ pte = pte_pfn_to_mfn(pte);
+ }
+
return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
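
The two conversions above are inverses: the PTE's PAT/PCD/PWT bits form a 3-bit PAT index, and Linux's WC (index 1, PWT) is swapped to Xen's WC slot (index 4, PAT) on the way in and back on the way out. A standalone check, using the real x86 bit positions:

    #include <stdio.h>
    #include <stdint.h>

    #define _PAGE_PWT (1u << 3)
    #define _PAGE_PCD (1u << 4)
    #define _PAGE_PAT (1u << 7)

    static unsigned pat_index(uint64_t pte)  /* PAT<<2 | PCD<<1 | PWT */
    {
        return (!!(pte & _PAGE_PAT) << 2) | (!!(pte & _PAGE_PCD) << 1) |
               !!(pte & _PAGE_PWT);
    }

    int main(void)
    {
        uint64_t pte = _PAGE_PWT;                           /* Linux WC: idx 1 */

        printf("Linux WC index %u\n", pat_index(pte));
        if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)   /* xen_make_pte */
            pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
        printf("Xen WC index   %u\n", pat_index(pte));      /* idx 4 */
        if ((pte & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT)
            pte = (pte & ~_PAGE_PAT) | _PAGE_PWT;           /* xen_pte_val */
        printf("back to Linux  %u\n", pat_index(pte));      /* idx 1 */
        return 0;
    }
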
@@ -934,8 +1304,6 @@
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
- vm_unmap_aliases();
-
xen_mc_batch();
if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1219,7 +1587,7 @@
spin_lock(&mm->page_table_lock);
/* pgd may not be pinned in the error exit path of execve */
- if (xen_page_pinned(mm->pgd))
+ if (xen_page_pinned(mm->pgd) && !mm->context.has_foreign_mappings)
xen_pgd_unpin(mm);
spin_unlock(&mm->page_table_lock);
@@ -1288,12 +1656,19 @@
preempt_enable();
}
+/*
+ * Flush tlb on other cpus. Xen can do this via a single hypercall
+ * rather than explicit IPIs, which has the nice property of avoiding
+ * any cpus which don't actually have dirty tlbs. Unfortunately it
+ * doesn't give us an opportunity to kick out cpus which are in lazy
+ * tlb state, so we may end up reflushing some cpus unnecessarily.
+ */
static void xen_flush_tlb_others(const struct cpumask *cpus,
struct mm_struct *mm, unsigned long va)
{
struct {
struct mmuext_op op;
- DECLARE_BITMAP(mask, NR_CPUS);
+ DECLARE_BITMAP(mask, num_processors);
} *args;
struct multicall_space mcs;
@@ -1417,6 +1792,13 @@
return ret;
}
+void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd)
+{
+ if (xen_page_pinned(pgd))
+ __xen_pgd_unpin(mm, pgd);
+}
+
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
@@ -1445,13 +1827,29 @@
}
#endif
-#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ unsigned long pfn = pte_pfn(pte);
+ pte_t oldpte = *ptep;
+
+ if (pte_flags(oldpte) & _PAGE_PRESENT) {
+ /* Don't allow existing IO mappings to be overridden */
+ if (pte_flags(oldpte) & _PAGE_IOMAP)
+ pte = oldpte;
+
+ /* Don't allow _PAGE_RW to be set on existing pte */
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
+ }
+
+ /*
+ * If the new pfn is within the range of the newly allocated
+ * kernel pagetable, and it isn't being mapped into an
+ * early_ioremap fixmap slot, make sure it is RO.
+ */
+ if (!is_early_ioremap_ptep(ptep) &&
+ pfn >= e820_table_start && pfn < e820_table_end)
+ pte = pte_wrprotect(pte);
return pte;
}
@@ -1464,7 +1862,6 @@
xen_set_pte(ptep, pte);
}
-#endif
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
@@ -1517,7 +1914,6 @@
if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
- vm_unmap_aliases();
if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
@@ -1620,6 +2016,7 @@
return __ka(m2p(maddr));
}
+/* Set the page permissions on identity-mapped pages */
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
@@ -1635,6 +2032,9 @@
unsigned ident_pte;
unsigned long pfn;
+ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
+ PAGE_SIZE);
+
ident_pte = 0;
pfn = 0;
for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1645,7 +2045,7 @@
pte_page = m2v(pmd[pmdidx].pmd);
else {
/* Check for free pte pages */
- if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+ if (ident_pte == LEVEL1_IDENT_ENTRIES)
break;
pte_page = &level1_ident_pgt[ident_pte];
@@ -1677,6 +2077,20 @@
set_page_prot(pmd, PAGE_KERNEL_RO);
}
+void __init xen_setup_machphys_mapping(void)
+{
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ } else {
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ }
+ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+}
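
fls(nr_ents - 1) is the usual round-up-to-power-of-two order trick; a standalone check (assuming 64-bit unsigned long; fls_ mimics the kernel's fls):

    #include <stdio.h>

    static int fls_(unsigned long x)        /* 1-based index of the MSB */
    {
        return x ? (int)(8 * sizeof(x)) - __builtin_clzl(x) : 0;
    }

    int main(void)
    {
        unsigned long n;

        /* (1UL << fls_(n - 1)) is the smallest power of two >= n. */
        for (n = 1; n <= 9; n++)
            printf("nr_ents=%lu -> order %d covers %lu\n",
                   n, fls_(n - 1), 1UL << fls_(n - 1));
        return 0;
    }
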
+
#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
@@ -1768,12 +2182,15 @@
return pgd;
}
#else /* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
+ int i;
+
+ level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
@@ -1785,6 +2202,20 @@
xen_map_identity_early(level2_kernel_pgt, max_pfn);
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+
+ /*
+ * When running a 32 bit domain 0 on a 64 bit hypervisor a
+ * pinned L3 (such as the initial pgd here) contains bits
+ * which are reserved in the PAE layout but not in the 64 bit
+ * layout. Unfortunately some versions of the hypervisor
+ * (incorrectly) validate compat mode guests against the PAE
+ * layout and hence will not allow such a pagetable to be
+ * pinned by the guest. Therefore we mask off only the PFN and
+ * Present bits of the supplied L3.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++)
+ swapper_pg_dir[i].pgd &= (PTE_PFN_MASK | _PAGE_PRESENT);
+
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
@@ -1807,6 +2238,8 @@
}
#endif /* CONFIG_X86_64 */
+static unsigned char dummy_ioapic_mapping[PAGE_SIZE] __page_aligned_bss;
+
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
@@ -1836,9 +2269,26 @@
pte = pfn_pte(phys, prot);
break;
- default:
+#ifdef CONFIG_X86_IO_APIC
+ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+ /*
+ * We just don't map the IO APIC - all access is via
+ * hypercalls. Keep the address in the pte for reference.
+ */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_ioapic_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+ case FIX_PARAVIRT_BOOTMAP:
+ /* This is an MFN, but it isn't an IO mapping from the
+ IO domain */
pte = mfn_pte(phys, prot);
break;
+
+ default:
+ /* By default, set_fixmap is used for hardware mappings */
+ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+ break;
}
__native_set_fixmap(idx, pte);
@@ -1853,6 +2303,29 @@
#endif
}
+__init void xen_ident_map_ISA(void)
+{
+ unsigned long pa;
+
+ /*
+ * If we're dom0, then linear map the ISA machine addresses into
+ * the kernel's address space.
+ */
+ if (!xen_initial_domain())
+ return;
+
+ xen_raw_printk("Xen: setup ISA identity maps\n");
+
+ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
+ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
+
+ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
+ BUG();
+ }
+
+ xen_flush_tlb();
+}
+
static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
@@ -1915,11 +2388,7 @@
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
-#ifdef CONFIG_X86_64
- .set_pte = xen_set_pte,
-#else
.set_pte = xen_set_pte_init,
-#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
@@ -1970,6 +2439,301 @@
pv_mmu_ops = xen_mmu_ops;
}
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+ unsigned long *in_frames,
+ unsigned long *out_frames)
+{
+ int i;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+ mcs = __xen_mc_entry(0);
+
+ if (in_frames)
+ in_frames[i] = virt_to_mfn(vaddr);
+
+ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+ set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+ if (out_frames)
+ out_frames[i] = virt_to_pfn(vaddr);
+ }
+ xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+ unsigned long *mfns,
+ unsigned long first_mfn)
+{
+ unsigned i, limit;
+ unsigned long mfn;
+
+ xen_mc_batch();
+
+ limit = 1u << order;
+ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+ struct multicall_space mcs;
+ unsigned flags;
+
+ mcs = __xen_mc_entry(0);
+ if (mfns)
+ mfn = mfns[i];
+ else
+ mfn = first_mfn + i;
+
+ if (i < (limit - 1))
+ flags = 0;
+ else {
+ if (order == 0)
+ flags = UVMF_INVLPG | UVMF_ALL;
+ else
+ flags = UVMF_TLB_FLUSH | UVMF_ALL;
+ }
+
+ MULTI_update_va_mapping(mcs.mc, vaddr,
+ mfn_pte(mfn, PAGE_KERNEL), flags);
+
+ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ }
+
+ xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment. Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+ unsigned long *pfns_in,
+ unsigned long extents_out, unsigned int order_out,
+ unsigned long *mfns_out,
+ unsigned int address_bits)
+{
+ long rc;
+ int success;
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = extents_in,
+ .extent_order = order_in,
+ .extent_start = pfns_in,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = extents_out,
+ .extent_order = order_out,
+ .extent_start = mfns_out,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ BUG_ON(extents_in << order_in != extents_out << order_out);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == extents_in);
+
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+
+ return success;
+}
+
+int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
+ unsigned int address_bits)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long flags;
+ int success;
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = virt_to_pfn(vstart);
+ success = xen_exchange_memory(1UL << order, 0, in_frames,
+ 1, order, &out_frame,
+ address_bits);
+
+ /* 3. Map the new extent in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+ else
+ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
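
For context (not in this patch), the expected caller is a driver that needs a machine-contiguous, address-limited buffer; a hedged sketch, with the order and address-width values purely illustrative:

    /* Hypothetical caller: make a 64KB buffer machine-contiguous below 4GB. */
    unsigned long buf = __get_free_pages(GFP_KERNEL, 4);    /* order 4 */

    if (!buf)
        return -ENOMEM;
    if (xen_create_contiguous_region(buf, 4, 32)) {         /* 32 address bits */
        free_pages(buf, 4);
        return -ENOMEM;
    }
    /* ... hand virt_to_machine(buf) to the device ... */
    xen_destroy_contiguous_region(buf, 4);                  /* undo, then free */
    free_pages(buf, 4);
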
+
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long flags;
+ int success;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = virt_to_mfn(vstart);
+
+ /* 2. Zap current PTEs. */
+ xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+ 0, out_frames, 0);
+
+ /* 4. Map new pages in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+ else
+ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ unsigned long mfn;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+
+ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long mfn, int nr,
+ pgprot_t prot, unsigned domid)
+{
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ int batch;
+ unsigned long range;
+ int err = 0;
+
+ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+ rmd.mfn = mfn;
+ rmd.prot = prot;
+
+ while (nr) {
+ batch = min(REMAP_BATCH_SIZE, nr);
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_mfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ err = -EFAULT;
+ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
+ goto out;
+
+ nr -= batch;
+ addr += range;
+ }
+
+ err = 0;
+out:
+
+ flush_tlb_all();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
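
Again for context, the natural caller is a foreign-mapping driver's mmap path; a hedged sketch (helper name hypothetical):

    /* Map nr machine frames starting at mfn, owned by domain domid,
     * into the calling process at the start of the vma. */
    static int map_foreign_range(struct vm_area_struct *vma,
                                 unsigned long mfn, int nr, unsigned domid)
    {
        return xen_remap_domain_mfn_range(vma, vma->vm_start, mfn, nr,
                                          vma->vm_page_prot, domid);
    }
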
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc;
+
+ a.domid = DOMID_SELF;
+ a.gpa = __pa(mm->pgd);
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc = 0;
+
+ a.domid = DOMID_SELF;
+ a.gpa = 0x00;
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ if (rc < 0) {
+ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+ return 0;
+ }
+ return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+ if (is_pagetable_dying_supported())
+ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+}
+#endif
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7..537bb9a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,7 +12,6 @@
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
@@ -60,4 +59,5 @@
unsigned long xen_read_cr2_direct(void);
extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 0000000..4d55524
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,52 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+
+#include <asm/xen/hypervisor.h>
+
+int xen_swiotlb __read_mostly;
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+ .mapping_error = xen_swiotlb_dma_mapping_error,
+ .alloc_coherent = xen_swiotlb_alloc_coherent,
+ .free_coherent = xen_swiotlb_free_coherent,
+ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
+ .sync_single_range_for_cpu = xen_swiotlb_sync_single_range_for_cpu,
+ .sync_single_range_for_device = xen_swiotlb_sync_single_range_for_device,
+ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+ .map_sg = xen_swiotlb_map_sg_attrs,
+ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
+ .map_page = xen_swiotlb_map_page,
+ .unmap_page = xen_swiotlb_unmap_page,
+ .dma_supported = xen_swiotlb_dma_supported,
+};
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+ if (xen_pv_domain() && (xen_initial_domain() || swiotlb))
+ xen_swiotlb = 1;
+
+ /* If we are running under Xen, we MUST disable the native SWIOTLB */
+ if (xen_pv_domain())
+ swiotlb = 0;
+
+ return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+ if (xen_swiotlb) {
+ xen_swiotlb_init(1);
+ dma_ops = &xen_swiotlb_dma_ops;
+ }
+}
diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c
new file mode 100644
index 0000000..8ca31f1
--- /dev/null
+++ b/arch/x86/xen/pci.c
@@ -0,0 +1,296 @@
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/slab.h>
+
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
+#include <asm/pci_x86.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/pci.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+
+int xen_register_pirq(u32 gsi, int triggering)
+{
+ int rc, irq;
+ struct physdev_map_pirq map_irq;
+ int shareable = 0;
+ char *name;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ if (triggering == ACPI_EDGE_SENSITIVE) {
+ shareable = 0;
+ name = "ioapic-edge";
+ } else {
+ shareable = 1;
+ name = "ioapic-level";
+ }
+
+ irq = xen_allocate_pirq(gsi, shareable, name);
+
+ printk(KERN_DEBUG "xen: --> irq=%d\n", irq);
+
+ if (irq < 0)
+ goto out;
+
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_GSI;
+ map_irq.index = gsi;
+ map_irq.pirq = irq;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+ return -1;
+ }
+
+out:
+ return irq;
+}
+
+int xen_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+ gsi, triggering, polarity);
+
+ irq = xen_register_pirq(gsi, triggering);
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (rc == -EEXIST)
+ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+ else if (rc) {
+ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+ gsi, rc);
+ }
+
+ return irq;
+}
+
+#ifdef CONFIG_ACPI
+#define BAD_MADT_ENTRY(entry, end) ( \
+ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
+ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
+
+
+static int __init
+xen_acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+ const unsigned long end)
+{
+ struct acpi_madt_interrupt_override *intsrc = NULL;
+
+ intsrc = (struct acpi_madt_interrupt_override *)header;
+
+ if (BAD_MADT_ENTRY(intsrc, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
+ int gsi;
+ int trigger, polarity;
+
+ trigger = intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK;
+ polarity = intsrc->inti_flags & ACPI_MADT_POLARITY_MASK;
+
+ /* Command-line over-ride via acpi_sci= */
+ if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
+ trigger = acpi_sci_flags & ACPI_MADT_TRIGGER_MASK;
+
+ if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
+ polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
+
+ printk("xen: sci override: source_irq=%d global_irq=%d trigger=%x polarity=%x\n",
+ intsrc->source_irq, intsrc->global_irq,
+ trigger, polarity);
+
+ switch (polarity) {
+ case ACPI_MADT_POLARITY_CONFORMS:
+ case ACPI_MADT_POLARITY_ACTIVE_LOW:
+ polarity = ACPI_ACTIVE_LOW;
+ break;
+
+ case ACPI_MADT_POLARITY_ACTIVE_HIGH:
+ polarity = ACPI_ACTIVE_HIGH;
+ break;
+
+ default:
+ return 0;
+ }
+
+ switch (trigger) {
+ case ACPI_MADT_TRIGGER_CONFORMS:
+ case ACPI_MADT_TRIGGER_LEVEL:
+ trigger = ACPI_LEVEL_SENSITIVE;
+ break;
+
+ case ACPI_MADT_TRIGGER_EDGE:
+ trigger = ACPI_EDGE_SENSITIVE;
+ break;
+
+ default:
+ return 0;
+ }
+
+ gsi = xen_register_gsi(intsrc->global_irq,
+ trigger, polarity);
+ /*
+ * stash over-ride to indicate we've been here
+ * and for later update of acpi_gbl_FADT
+ */
+ acpi_sci_override_gsi = gsi;
+
+ printk("xen: acpi sci %d\n", gsi);
+ }
+
+ return 0;
+}
+
+static __init void xen_setup_acpi_sci(void)
+{
+ acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+ xen_acpi_parse_int_src_ovr,
+ nr_irqs);
+}
+#else
+static __init void xen_setup_acpi_sci(void)
+{
+}
+#endif
+
+void __init xen_setup_pirqs(void)
+{
+ int irq;
+
+ if (nr_ioapics == 0) {
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ xen_allocate_pirq(irq, 0, "xt-pic");
+ return;
+ }
+
+ /* Pre-allocate legacy irqs */
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ int trigger, polarity;
+
+ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
+ continue;
+
+ xen_register_pirq(irq,
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+ }
+
+ xen_setup_acpi_sci();
+}
+
+#ifdef CONFIG_PCI_MSI
+int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret;
+ struct msi_desc *msidesc;
+
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_create_msi_irq(dev, msidesc, type);
+ if (irq < 0)
+ return -1;
+
+ ret = set_irq_msi(irq, msidesc);
+ if (ret)
+ goto error;
+ }
+ return 0;
+
+error:
+ xen_destroy_irq(irq);
+ return ret;
+}
+#endif
+
+struct xen_device_domain_owner {
+ domid_t domain;
+ struct pci_dev *dev;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
+
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ list_for_each_entry(owner, &dev_domain_list, list) {
+ if (owner->dev == dev)
+ return owner;
+ }
+ return NULL;
+}
+
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+ int domain = -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (owner)
+ domain = owner->domain;
+ spin_unlock(&dev_domain_list_spinlock);
+ return domain;
+}
+EXPORT_SYMBOL(xen_find_device_domain_owner);
+
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+ struct xen_device_domain_owner *owner;
+
+ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+ if (!owner)
+ return -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ if (find_device(dev)) {
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return -EEXIST;
+ }
+ owner->domain = domain;
+ owner->dev = dev;
+ list_add_tail(&owner->list, &dev_domain_list);
+ spin_unlock(&dev_domain_list_spinlock);
+ return 0;
+}
+EXPORT_SYMBOL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (!owner) {
+ spin_unlock(&dev_domain_list_spinlock);
+ return -ENODEV;
+ }
+ list_del(&owner->list);
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return 0;
+}
+EXPORT_SYMBOL(xen_unregister_device_domain_owner);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 0000000..0f45638
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <xen/platform_pci.h>
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+/* store the value of xen_emul_unplug after the unplug is done */
+int xen_platform_pci_unplug;
+EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
+#ifdef CONFIG_XEN_PVHVM
+static int xen_emul_unplug;
+
+static int __init check_platform_magic(void)
+{
+ short magic;
+ char protocol;
+
+ magic = inw(XEN_IOPORT_MAGIC);
+ if (magic != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ return XEN_PLATFORM_ERR_MAGIC;
+ }
+
+ protocol = inb(XEN_IOPORT_PROTOVER);
+
+ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ protocol);
+
+ switch (protocol) {
+ case 1:
+ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ return XEN_PLATFORM_ERR_BLACKLIST;
+ }
+ break;
+ default:
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+ return XEN_PLATFORM_ERR_PROTOCOL;
+ }
+
+ return 0;
+}
+
+void __init xen_unplug_emulated_devices(void)
+{
+ int r;
+
+ /* user explicitly requested no unplug */
+ if (xen_emul_unplug & XEN_UNPLUG_NEVER)
+ return;
+ /* check the version of the xen platform PCI device */
+ r = check_platform_magic();
+ /* If the version matches enable the Xen platform PCI driver.
+ * Also enable the Xen platform PCI driver if the host does
+ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
+ * but the user told us that unplugging is unnecessary. */
+ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
+ return;
+ /* Set the default value of xen_emul_unplug depending on whether or
+ * not the Xen PV frontends and the Xen platform PCI driver have
+ * been compiled for this kernel (modules or built-in are both OK). */
+ if (!xen_emul_unplug) {
+ if (xen_must_unplug_nics()) {
+ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated NICs.\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ }
+ if (xen_must_unplug_disks()) {
+ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated disks.\n"
+ "You might have to change the root device\n"
+ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+ "in your root= kernel command line option\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ }
+ }
+ /* Now unplug the emulated devices */
+ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
+ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+ xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+ char *p, *q;
+ int l;
+
+ for (p = arg; p; p = q) {
+ q = strchr(p, ',');
+ if (q) {
+ l = q - p;
+ q++;
+ } else {
+ l = strlen(p);
+ }
+ if (!strncmp(p, "all", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL;
+ else if (!strncmp(p, "ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ else if (!strncmp(p, "aux-ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+ else if (!strncmp(p, "nics", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ else if (!strncmp(p, "unnecessary", l))
+ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
+ else if (!strncmp(p, "never", l))
+ xen_emul_unplug |= XEN_UNPLUG_NEVER;
+ else
+ printk(KERN_WARNING "unrecognised option '%s' "
+ "in parameter 'xen_emul_unplug'\n", p);
+ }
+ return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
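
Typical uses of the option parsed above, on the guest kernel command line (illustrative):

    xen_emul_unplug=all                  (unplug every emulated NIC and IDE disk)
    xen_emul_unplug=nics,aux-ide-disks   (keep only the primary-master IDE disk)
    xen_emul_unplug=never                (keep all emulated devices)
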
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f..c011a7d 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,6 +10,7 @@
#include <linux/pm.h>
#include <asm/elf.h>
+#include <asm/hpet.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
@@ -19,7 +20,9 @@
#include <xen/page.h>
#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
@@ -32,25 +35,192 @@
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
+/* Amount of extra memory space we add to the e820 ranges */
+phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+static __init void xen_add_extra_mem(unsigned long pages)
+{
+ u64 size = (u64)pages * PAGE_SIZE;
+ u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
+
+ if (!pages)
+ return;
+
+ e820_add_region(extra_start, size, E820_RAM);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+ reserve_early(extra_start, extra_start + size, "XEN EXTRA");
+
+ xen_extra_mem_size += size;
+
+ xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+}
+
+static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
+ phys_addr_t end_addr)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned long start, end;
+ unsigned long len = 0;
+ unsigned long pfn;
+ int ret;
+
+ start = PFN_UP(start_addr);
+ end = PFN_DOWN(end_addr);
+
+ if (end <= start)
+ return 0;
+
+ printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
+ start, end);
+ for (pfn = start; pfn < end; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ reservation.nr_extents = 1;
+
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
+ start, end, ret);
+ if (ret == 1) {
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ len++;
+ }
+ }
+ printk(KERN_CONT "%ld pages freed\n", len);
+
+ return len;
+}
+
+static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
+ const struct e820map *e820)
+{
+ phys_addr_t max_addr = PFN_PHYS(max_pfn);
+ phys_addr_t last_end = ISA_END_ADDRESS;
+ unsigned long released = 0;
+ int i;
+
+ /* Free any unused memory above the low 1Mbyte. */
+ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
+ phys_addr_t end = e820->map[i].addr;
+ end = min(max_addr, end);
+
+ if (last_end < end)
+ released += xen_release_chunk(last_end, end);
+ last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
+ }
+
+ if (last_end < max_addr)
+ released += xen_release_chunk(last_end, max_addr);
+
+ printk(KERN_INFO "released %ld pages of unused memory\n", released);
+ return released;
+}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
-
char * __init xen_memory_setup(void)
{
+ static struct e820entry map[E820MAX] __initdata;
+
unsigned long max_pfn = xen_start_info->nr_pages;
+ unsigned long long mem_end;
+ int rc;
+ struct xen_memory_map memmap;
+ unsigned long extra_pages = 0;
+ unsigned long extra_limit;
+ int op;
+ int i;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+ mem_end = PFN_PHYS(max_pfn);
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ op = xen_initial_domain() ?
+ XENMEM_machine_memory_map :
+ XENMEM_memory_map;
+ rc = HYPERVISOR_memory_op(op, &memmap);
+ if (rc == -ENOSYS) {
+ BUG_ON(xen_initial_domain());
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = mem_end;
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
e820.nr_map = 0;
+ xen_extra_mem_start = mem_end;
+ for (i = 0; i < memmap.nr_entries; i++) {
+ unsigned long long end;
- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
+ /* Guard against non-page aligned E820 entries. */
+ if (map[i].type == E820_RAM)
+ map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
+
+ end = map[i].addr + map[i].size;
+ if (map[i].type == E820_RAM && end > mem_end) {
+ /* RAM off the end - may be partially included */
+ u64 delta = min(map[i].size, end - mem_end);
+
+ map[i].size -= delta;
+ end -= delta;
+
+ extra_pages += PFN_DOWN(delta);
+ /*
+ * Set RAM below 4GB that is not for us to be unusable.
+ * This prevents "System RAM" address space from being
+ * used as a potential resource for I/O addresses (happens
+ * when 'allocate_resource' is called).
+ */
+ if (delta &&
+ (xen_initial_domain() && end < 0x100000000ULL))
+ e820_add_region(end, delta, E820_UNUSABLE);
+ }
+
+ if (map[i].size > 0 && end > xen_extra_mem_start)
+ xen_extra_mem_start = end;
+
+ /* Add region if any remains */
+ if (map[i].size > 0)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+ }
/*
- * Even though this is normal, usable memory under Xen, reserve
- * ISA memory anyway because too many things think they can poke
+ * In domU, the ISA region is normal, usable memory, but we
+ * reserve ISA memory anyway because too many things poke
* about in there.
+ *
+ * In Dom0, the host E820 information can leave gaps in the
+ * ISA range, which would cause us to release those pages. To
+ * avoid this, we unconditionally reserve them here.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
@@ -67,23 +237,32 @@
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
+
+ /*
+ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
+ * factor the base size. On non-highmem systems, the base
+ * size is the full initial memory allocation; on highmem it
+ * is limited to the max size of lowmem, so that it doesn't
+ * get completely filled.
+ *
+ * In principle there could be a problem in lowmem systems if
+ * the initial memory is also very large with respect to
+ * lowmem, but we won't try to deal with that here.
+ */
+ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ max_pfn + extra_pages);
+
+ if (extra_limit >= max_pfn)
+ extra_pages = extra_limit - max_pfn;
+ else
+ extra_pages = 0;
+
+ xen_add_extra_mem(extra_pages);
+
return "Xen";
}
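
A worked example of the clamp (numbers illustrative, 64-bit so MAXMEM does not bind): a 128MB domU asking for 4GB of ballooned-out extra memory is held to the 10x ratio:

    #include <stdio.h>

    #define EXTRA_MEM_RATIO 10

    int main(void)
    {
        unsigned long max_pfn = 32768;          /* 128MB initial allocation */
        unsigned long extra_pages = 1048576;    /* 4GB beyond nr_pages */
        unsigned long limit = EXTRA_MEM_RATIO * max_pfn;

        if (limit > max_pfn + extra_pages)
            limit = max_pfn + extra_pages;
        extra_pages = limit >= max_pfn ? limit - max_pfn : 0;
        printf("extra clamped to %lu pages (%lu MB)\n",
               extra_pages, extra_pages >> 8);  /* -> 294912 pages, 1152 MB */
        return 0;
    }
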
-static void xen_idle(void)
-{
- local_irq_disable();
-
- if (need_resched())
- local_irq_enable();
- else {
- current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
- safe_halt();
- current_thread_info()->status |= TS_POLLING;
- }
-}
-
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -156,6 +335,8 @@
struct physdev_set_iopl set_iopl;
int rc;
+ xen_panic_handler_init();
+
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
@@ -182,13 +363,21 @@
}
#endif
+ /*
+ * The Xen hypervisor uses the HPET to wake up CPUs from deep
+ * C-states, so HPET use in dom0 must be forbidden.
+ */
+ disable_hpet(NULL);
+
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
- pm_idle = xen_idle;
-
- paravirt_disable_iospace();
+ /* Set up idle, making sure it calls safe_halt() pvop */
+#ifdef CONFIG_X86_32
+ boot_cpu_data.hlt_works_ok = 1;
+#endif
+ pm_idle = default_idle;
fiddle_vdso();
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a96204a..a873872 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -179,6 +179,7 @@
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
+ unsigned int i;
if (skip_ioapic_setup) {
char *m = (max_cpus == 0) ?
@@ -193,6 +194,12 @@
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
+
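+ /*
+ * These cpumasks are normally allocated by the native SMP setup
+ * path, which Xen bypasses; allocate them here instead.
+ */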
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
+ }
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -309,6 +316,8 @@
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
+ cpumask_set_cpu(cpu, cpu_callout_mask);
+
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
@@ -402,6 +411,8 @@
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
+ set_cpu_online(cpu, false);
+
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
@@ -489,3 +500,41 @@
xen_fill_possible_map();
xen_init_spinlocks();
}
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ WARN_ON(xen_smp_intr_init(0));
+
+ if (!xen_have_vector_callback)
+ return;
+ xen_init_lock_cpu(0);
+ xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+ int rc;
+ rc = native_cpu_up(cpu);
+ WARN_ON(xen_smp_intr_init(cpu));
+ return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+ native_cpu_die(cpu);
+}
+
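+/*
+ * PV-on-HVM SMP: keep the native CPU bringup/teardown paths, but
+ * layer Xen's per-cpu event-channel IPIs on top of them.
+ */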
+void __init xen_hvm_smp_init(void)
+{
+ smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+ smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+ smp_ops.cpu_up = xen_hvm_cpu_up;
+ smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+ smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c6611..6c43a90f 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
#include "xen-ops.h"
#include "mmu.h"
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
{
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
@@ -26,7 +26,21 @@
BUG();
}
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
+{
+#ifdef CONFIG_XEN_PVHVM
+ int cpu;
+ xen_hvm_init_shared_info();
+ xen_callback_vector();
+ if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ for_each_online_cpu(cpu) {
+ xen_setup_runstate_info(cpu);
+ }
+ }
+#endif
+}
+
+void xen_arch_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 8f92188..07efc5b 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -19,6 +19,7 @@
#include <asm/xen/hypercall.h>
#include <xen/events.h>
+#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
@@ -155,7 +156,7 @@
}
/* Get the TSC speed from Xen */
-unsigned long xen_tsc_khz(void)
+static unsigned long xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -190,7 +191,7 @@
put_cpu_var(xen_vcpu);
}
-unsigned long xen_get_wallclock(void)
+static unsigned long xen_get_wallclock(void)
{
struct timespec ts;
@@ -198,10 +199,24 @@
return ts.tv_sec;
}
-int xen_set_wallclock(unsigned long now)
+static int xen_set_wallclock(unsigned long now)
{
+ struct xen_platform_op op;
+ int rc;
+
/* do nothing for domU */
- return -1;
+ if (!xen_initial_domain())
+ return -1;
+
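+ /* In dom0 the wallclock is owned by Xen; update it via XENPF_settime. */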
+ op.cmd = XENPF_settime;
+ op.u.settime.secs = now;
+ op.u.settime.nsecs = 0;
+ op.u.settime.system_time = xen_clocksource_read();
+
+ rc = HYPERVISOR_dom0_op(&op);
+ WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
+
+ return rc;
}
static struct clocksource xen_clocksource __read_mostly = {
@@ -405,6 +420,8 @@
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
+
+ xen_setup_runstate_info(cpu);
}
void xen_teardown_timer(int cpu)
@@ -437,7 +454,7 @@
}
}
-__init void xen_time_init(void)
+static __init void xen_time_init(void)
{
int cpu = smp_processor_id();
@@ -461,3 +478,52 @@
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
+
+static const struct pv_time_ops xen_time_ops __initdata = {
+ .sched_clock = xen_clocksource_read,
+};
+
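+/*
+ * PV timekeeping: route sched_clock, TSC calibration, the wallclock
+ * and the per-cpu clockevent setup through Xen's pvclock interfaces.
+ */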
+__init void xen_init_time_ops(void)
+{
+ pv_time_ops = xen_time_ops;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+ int cpu = smp_processor_id();
+ xen_setup_runstate_info(cpu);
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
+
+__init void xen_hvm_init_time_ops(void)
+{
+ /* The vector callback is needed, otherwise we cannot receive
+ * interrupts on cpus > 0; at this point we don't know how many
+ * cpus are available. */
+ if (!xen_have_vector_callback)
+ return;
+ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ printk(KERN_INFO "Xen doesn't support pvclock on HVM; "
+ "disabling pv timer\n");
+ return;
+ }
+
+ pv_time_ops = xen_time_ops;
+ x86_init.timers.setup_percpu_clockev = xen_time_init;
+ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+#endif
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 0000000..1cd7f4d
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1;
+
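+ /*
+ * Validate each mode-specific block of fields against 'size', since
+ * an older hypervisor may pass a shorter dom0_vga_console_info.
+ */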
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a3..e860de7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,10 @@
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
+extern unsigned long xen_max_p2m_pfn;
+
+void xen_set_pat(u64);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
@@ -38,6 +42,10 @@
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void __init xen_unplug_emulated_devices(void);
+
void __init xen_build_dynamic_phys_to_machine(void);
void xen_init_irq_ops(void);
@@ -46,11 +54,8 @@
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
-unsigned long xen_tsc_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
@@ -60,10 +65,12 @@
#ifdef CONFIG_SMP
void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
extern cpumask_var_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
@@ -82,6 +89,23 @@
}
#endif
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{
+}
+#endif
+
+#ifdef CONFIG_XEN_DOM0
+void xen_init_apic(void);
+#else
+static inline void xen_init_apic(void) {}
+#endif
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
@@ -101,4 +125,6 @@
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
+extern int xen_panic_handler_init(void);
+
#endif /* XEN_OPS_H */
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 7702118..1be123c 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -61,6 +61,7 @@
# processor has its own "processor." module_param namespace
processor-y := processor_core.o processor_throttling.o
processor-y += processor_idle.o processor_thermal.o
+processor-y += processor_xen.o
processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 28ccdbc..b0f9ed6 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -31,6 +31,7 @@
#include <linux/types.h>
#include <linux/memory_hotplug.h>
#include <acpi/acpi_drivers.h>
+#include <xen/acpi.h>
#define ACPI_MEMORY_DEVICE_CLASS "memory"
#define ACPI_MEMORY_DEVICE_HID "PNP0C80"
@@ -70,21 +71,6 @@
},
};
-struct acpi_memory_info {
- struct list_head list;
- u64 start_addr; /* Memory Range start physical addr */
- u64 length; /* Memory Range length */
- unsigned short caching; /* memory cache attribute */
- unsigned short write_protect; /* memory read/write attribute */
- unsigned int enabled:1;
-};
-
-struct acpi_memory_device {
- struct acpi_device * device;
- unsigned int state; /* State of the memory device */
- struct list_head res_list;
-};
-
static int acpi_hotmem_initialized;
static acpi_status
@@ -228,6 +214,9 @@
return result;
}
+ if (xen_initial_domain())
+ return xen_hotadd_memory(mem_device);
+
node = acpi_get_node(mem_device->device->handle);
/*
* Tell the VM there is more memory here...
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index cc22f9a..747d96f 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -47,6 +47,9 @@
#include "actables.h"
#include <linux/tboot.h>
+#include <xen/acpi.h>
+#include <asm/xen/hypervisor.h>
+
#define _COMPONENT ACPI_HARDWARE
ACPI_MODULE_NAME("hwsleep")
@@ -346,6 +349,19 @@
tboot_sleep(sleep_state, pm1a_control, pm1b_control);
/* Write #2: Write both SLP_TYP + SLP_EN */
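+ /*
+ * When Xen handles ACPI sleep, the final SLP_TYP/SLP_EN write must
+ * be performed by the hypervisor, so hand off here instead of
+ * touching the PM1 control registers directly.
+ */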
+ if (xen_pv_acpi()) {
+ int err;
+
+ err = acpi_notify_hypervisor_state(sleep_state,
+ pm1a_control, pm1b_control);
+ if (err) {
+ ACPI_DEBUG_PRINT((ACPI_DB_INIT,
+ "Hypervisor failure [%d]\n", err));
+ return_ACPI_STATUS(AE_ERROR);
+ }
+
+ return_ACPI_STATUS(AE_OK);
+ }
status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
if (ACPI_FAILURE(status)) {
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 7102474..2428cc0 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -58,6 +58,7 @@
#include <acpi/acpi_bus.h>
#include <acpi/acpi_drivers.h>
#include <acpi/processor.h>
+#include <xen/acpi.h>
#define PREFIX "ACPI: "
@@ -81,11 +82,9 @@
MODULE_LICENSE("GPL");
static int acpi_processor_add(struct acpi_device *device);
-static int acpi_processor_remove(struct acpi_device *device, int type);
#ifdef CONFIG_ACPI_PROCFS
static int acpi_processor_info_open_fs(struct inode *inode, struct file *file);
#endif
-static void acpi_processor_notify(struct acpi_device *device, u32 event);
static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu);
static int acpi_processor_handle_eject(struct acpi_processor *pr);
@@ -247,7 +246,7 @@
return 0;
}
-static int acpi_processor_errata(struct acpi_processor *pr)
+int acpi_processor_errata(struct acpi_processor *pr)
{
int result = 0;
struct pci_dev *dev = NULL;
@@ -278,7 +277,7 @@
* _PDC is required for a BIOS-OS handshake for most of the newer
* ACPI processor features.
*/
-static int acpi_processor_set_pdc(struct acpi_processor *pr)
+int acpi_processor_set_pdc(struct acpi_processor *pr)
{
struct acpi_object_list *pdc_in = pr->pdc;
acpi_status status = AE_OK;
@@ -347,7 +346,7 @@
PDE(inode)->data);
}
-static int acpi_processor_add_fs(struct acpi_device *device)
+int acpi_processor_add_fs(struct acpi_device *device)
{
struct proc_dir_entry *entry = NULL;
@@ -386,7 +385,7 @@
return -EIO;
return 0;
}
-static int acpi_processor_remove_fs(struct acpi_device *device)
+int acpi_processor_remove_fs(struct acpi_device *device)
{
if (acpi_device_dir(device)) {
@@ -402,15 +401,6 @@
return 0;
}
-#else
-static inline int acpi_processor_add_fs(struct acpi_device *device)
-{
- return 0;
-}
-static inline int acpi_processor_remove_fs(struct acpi_device *device)
-{
- return 0;
-}
#endif
/* Use the acpiid in MADT to map cpus in case of SMP */
@@ -705,7 +695,7 @@
static DEFINE_PER_CPU(void *, processor_device_array);
-static void acpi_processor_notify(struct acpi_device *device, u32 event)
+void acpi_processor_notify(struct acpi_device *device, u32 event)
{
struct acpi_processor *pr = acpi_driver_data(device);
int saved;
@@ -873,7 +863,7 @@
return result;
}
-static int acpi_processor_remove(struct acpi_device *device, int type)
+int acpi_processor_remove(struct acpi_device *device, int type)
{
struct acpi_processor *pr = NULL;
@@ -1148,7 +1138,11 @@
if (result < 0)
goto out_proc;
- result = acpi_bus_register_driver(&acpi_processor_driver);
+ if (xen_initial_domain())
+ result = xen_acpi_processor_init();
+ else
+ result = acpi_bus_register_driver(&acpi_processor_driver);
+
if (result < 0)
goto out_cpuidle;
@@ -1184,7 +1178,10 @@
acpi_processor_uninstall_hotplug_notify();
- acpi_bus_unregister_driver(&acpi_processor_driver);
+ if (xen_initial_domain())
+ xen_acpi_processor_exit();
+ else
+ acpi_bus_unregister_driver(&acpi_processor_driver);
cpuidle_unregister_driver(&acpi_idle_driver);
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a6ad608..3c32e87 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -58,6 +58,7 @@
#include <acpi/acpi_bus.h>
#include <acpi/processor.h>
+#include <xen/acpi.h>
#include <asm/processor.h>
#define PREFIX "ACPI: "
@@ -439,7 +440,8 @@
cx.entry_method = ACPI_CSTATE_HALT;
snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
} else {
- continue;
+ if (!xen_initial_domain())
+ continue;
}
if (cx.type == ACPI_STATE_C1 &&
(idle_halt || idle_nomwait)) {
@@ -477,6 +479,9 @@
cx.power = obj->integer.value;
+ /* Cache the control register so it can be passed on to Xen. */
+ processor_cntl_xen_power_cache(pr->acpi_id, i, reg);
+
current_count++;
memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
@@ -653,7 +658,7 @@
return (working);
}
-static int acpi_processor_get_power_info(struct acpi_processor *pr)
+int acpi_processor_get_power_info(struct acpi_processor *pr)
{
unsigned int i;
int result;
@@ -1223,9 +1228,14 @@
* platforms that only support C1.
*/
if (pr->flags.power) {
- acpi_processor_setup_cpuidle(pr);
- if (cpuidle_register_device(&pr->power.dev))
- return -EIO;
+ if (xen_initial_domain()) {
+ processor_cntl_xen_notify(pr,
+ PROCESSOR_PM_INIT, PM_TYPE_IDLE);
+ } else {
+ acpi_processor_setup_cpuidle(pr);
+ if (cpuidle_register_device(&pr->power.dev))
+ return -EIO;
+ }
}
#ifdef CONFIG_ACPI_PROCFS
/* 'power' [R] */
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 40d395e..7ba143d 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -332,7 +332,7 @@
return result;
}
-static int acpi_processor_get_performance_info(struct acpi_processor *pr)
+int acpi_processor_get_performance_info(struct acpi_processor *pr)
{
int result = 0;
acpi_status status = AE_OK;
@@ -438,7 +438,7 @@
EXPORT_SYMBOL(acpi_processor_notify_smm);
-static int acpi_processor_get_psd(struct acpi_processor *pr)
+int acpi_processor_get_psd(struct acpi_processor *pr)
{
int result = 0;
acpi_status status = AE_OK;
diff --git a/drivers/acpi/processor_xen.c b/drivers/acpi/processor_xen.c
new file mode 100644
index 0000000..305398d
--- /dev/null
+++ b/drivers/acpi/processor_xen.c
@@ -0,0 +1,651 @@
+/*
+ * processor_xen.c - ACPI Processor Driver for xen
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/pm.h>
+#include <linux/cpufreq.h>
+#include <linux/cpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/dmi.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/acpi.h>
+
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#include <acpi/processor.h>
+#include <xen/acpi.h>
+#include <xen/pcpu.h>
+
+#define PREFIX "ACPI: "
+
+#define ACPI_PROCESSOR_CLASS "processor"
+#define ACPI_PROCESSOR_DEVICE_NAME "Processor"
+#define ACPI_PROCESSOR_FILE_INFO "info"
+#define ACPI_PROCESSOR_FILE_THROTTLING "throttling"
+#define ACPI_PROCESSOR_FILE_LIMIT "limit"
+#define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
+#define ACPI_PROCESSOR_NOTIFY_POWER 0x81
+#define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82
+
+#define _COMPONENT ACPI_PROCESSOR_COMPONENT
+ACPI_MODULE_NAME("processor_xen");
+
+static const struct acpi_device_id processor_device_ids[] = {
+ {ACPI_PROCESSOR_OBJECT_HID, 0},
+ {"ACPI0007", 0},
+ {"", 0},
+};
+
+/*
+ * Xen ACPI processor driver
+ */
+
+/* from processor_core.c */
+
+static int xen_acpi_processor_add(struct acpi_device *device);
+static void xen_acpi_processor_notify(struct acpi_device *device, u32 event);
+
+struct acpi_driver xen_acpi_processor_driver = {
+ .name = "processor",
+ .class = ACPI_PROCESSOR_CLASS,
+ .ids = processor_device_ids,
+ .ops = {
+ .add = xen_acpi_processor_add,
+ .remove = acpi_processor_remove,
+ .suspend = acpi_processor_suspend,
+ .resume = acpi_processor_resume,
+ .notify = xen_acpi_processor_notify,
+ },
+};
+
+static int is_processor_present(acpi_handle handle)
+{
+ acpi_status status;
+ unsigned long long sta = 0;
+
+ status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
+
+ if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
+ return 1;
+
+ /*
+ * _STA is mandatory for a processor that supports hot plug
+ */
+ if (status == AE_NOT_FOUND)
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Processor does not support hot plug\n"));
+ else
+ ACPI_EXCEPTION((AE_INFO, status,
+ "Processor Device is not present"));
+ return 0;
+}
+
+static acpi_status
+xen_acpi_processor_hotadd_init(struct acpi_processor *pr, int *p_cpu)
+{
+ if (!is_processor_present(pr->handle))
+ return AE_ERROR;
+
+ if (processor_cntl_xen_notify(pr,
+ PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD))
+ return AE_ERROR;
+
+ return AE_OK;
+}
+
+static int xen_acpi_processor_get_info(struct acpi_device *device)
+{
+ acpi_status status = 0;
+ union acpi_object object = { 0 };
+ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+ struct acpi_processor *pr;
+ int cpu_index, device_declaration = 0;
+ static int cpu0_initialized;
+
+ pr = acpi_driver_data(device);
+ if (!pr)
+ return -EINVAL;
+
+ if (num_online_cpus() > 1)
+ errata.smp = TRUE;
+
+ acpi_processor_errata(pr);
+
+ /*
+ * Check to see if we have bus mastering arbitration control. This
+ * is required for proper C3 usage (to maintain cache coherency).
+ */
+ if (acpi_gbl_FADT.pm2_control_block &&
+ acpi_gbl_FADT.pm2_control_length) {
+ pr->flags.bm_control = 1;
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Bus mastering arbitration control present\n"
+ ));
+ } else
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "No bus mastering arbitration control\n"));
+
+ if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) {
+ /* Declared with "Processor" statement; match ProcessorID */
+ status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
+ if (ACPI_FAILURE(status)) {
+ printk(KERN_ERR PREFIX "Evaluating processor object\n");
+ return -ENODEV;
+ }
+
+ /*
+ * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP.
+ * >>> 'acpi_get_processor_id(acpi_id, &id)' in
+ * arch/xxx/acpi.c
+ */
+ pr->acpi_id = object.processor.proc_id;
+ } else {
+ /*
+ * Declared with "Device" statement; match _UID.
+ * Note that we don't handle string _UIDs yet.
+ */
+ unsigned long long value;
+ status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
+ NULL, &value);
+ if (ACPI_FAILURE(status)) {
+ printk(KERN_ERR PREFIX
+ "Evaluating processor _UID [%#x]\n", status);
+ return -ENODEV;
+ }
+ device_declaration = 1;
+ pr->acpi_id = value;
+ }
+
+ /* TBD: add Xen specific code to query cpu_index */
+ cpu_index = -1;
+
+ /* Handle UP system running SMP kernel, with no LAPIC in MADT */
+ if (!cpu0_initialized && (cpu_index == -1) &&
+ (num_online_cpus() == 1)) {
+ cpu_index = 0;
+ }
+
+ cpu0_initialized = 1;
+
+ pr->id = cpu_index;
+
+ /*
+ * Extra Processor objects may be enumerated on MP systems with
+ * less than the max # of CPUs, or Xen vCPU < pCPU.
+ * They should be ignored _iff they are physically not present.
+ *
+ */
+ if (xen_pcpu_index(pr->acpi_id, 1) == -1) {
+ if (ACPI_FAILURE
+ (xen_acpi_processor_hotadd_init(pr, &pr->id))) {
+ return -ENODEV;
+ }
+ }
+
+ /*
+ * On some boxes several processors use the same processor bus id,
+ * but they are located in different scopes. For example:
+ * \_SB.SCK0.CPU0
+ * \_SB.SCK1.CPU0
+ * Rename the processor device bus id; the new bus id is generated
+ * in the format "CPU" + CPU ID.
+ */
+ sprintf(acpi_device_bid(device), "CPU%X", pr->id);
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
+ pr->acpi_id));
+
+ if (!object.processor.pblk_address)
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n"));
+ else if (object.processor.pblk_length != 6)
+ printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n",
+ object.processor.pblk_length);
+ else {
+ pr->throttling.address = object.processor.pblk_address;
+ pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset;
+ pr->throttling.duty_width = acpi_gbl_FADT.duty_width;
+
+ pr->pblk = object.processor.pblk_address;
+
+ /*
+ * We don't care about error returns - we just try to mark
+ * these reserved so that nobody else is confused into thinking
+ * that this region might be unused.
+ *
+ * (In particular, allocating the IO range for Cardbus)
+ */
+ request_region(pr->throttling.address, 6, "ACPI CPU throttle");
+ }
+
+ /*
+ * If ACPI describes a slot number for this CPU, we can use it to
+ * ensure we get the right value in the "physical id" field
+ * of /proc/cpuinfo
+ */
+ status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
+ if (ACPI_SUCCESS(status))
+ arch_fix_phys_package_id(pr->id, object.integer.value);
+
+ return 0;
+}
+
+static struct acpi_device *processor_device_array[XEN_MAX_ACPI_ID + 1];
+
+static int __cpuinit xen_acpi_processor_add(struct acpi_device *device)
+{
+ struct acpi_processor *pr = NULL;
+ int result = 0;
+ struct sys_device *sysdev;
+
+ pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+ if (!pr)
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
+ kfree(pr);
+ return -ENOMEM;
+ }
+
+ pr->handle = device->handle;
+ strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
+ strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
+ device->driver_data = pr;
+
+ result = xen_acpi_processor_get_info(device);
+ if (result) {
+ /* Processor is physically not present */
+ return 0;
+ }
+
+ /*
+ * Buggy BIOS check:
+ * The ACPI id of a processor can be reported wrongly by the BIOS.
+ * Don't trust it blindly.
+ */
+ if (pr->acpi_id > XEN_MAX_ACPI_ID ||
+ (processor_device_array[pr->acpi_id] != NULL &&
+ processor_device_array[pr->acpi_id] != device)) {
+ printk(KERN_WARNING "BIOS reported wrong ACPI id "
+ "for the processor\n");
+ result = -ENODEV;
+ goto err_free_cpumask;
+ }
+
+ processor_device_array[pr->acpi_id] = device;
+
+ if (pr->id != -1) {
+ per_cpu(processors, pr->id) = pr;
+
+ result = acpi_processor_add_fs(device);
+ if (result)
+ goto err_free_cpumask;
+
+ sysdev = get_cpu_sysdev(pr->id);
+ if (sysdev != NULL && sysfs_create_link(&device->dev.kobj,
+ &sysdev->kobj, "sysdev")) {
+ result = -EFAULT;
+ goto err_remove_fs;
+ }
+ }
+
+ /* _PDC call should be done before doing anything else (if reqd.). */
+ xen_arch_acpi_processor_init_pdc(pr);
+ acpi_processor_set_pdc(pr);
+ arch_acpi_processor_cleanup_pdc(pr);
+
+#ifdef CONFIG_CPU_FREQ
+ xen_acpi_processor_ppc_has_changed(pr);
+ result = xen_acpi_processor_get_performance(pr);
+ if (result)
+ goto err_remove_fs;
+#endif
+
+ if (pr->id != -1) {
+ acpi_processor_get_throttling_info(pr);
+ acpi_processor_get_limit_info(pr);
+ }
+
+ xen_acpi_processor_power_init(pr, device);
+
+ if (pr->id != -1) {
+ pr->cdev = thermal_cooling_device_register("Processor", device,
+ &processor_cooling_ops);
+ if (IS_ERR(pr->cdev)) {
+ result = PTR_ERR(pr->cdev);
+ goto err_power_exit;
+ }
+
+ dev_info(&device->dev, "registered as cooling_device%d\n",
+ pr->cdev->id);
+
+ result = sysfs_create_link(&device->dev.kobj,
+ &pr->cdev->device.kobj,
+ "thermal_cooling");
+ if (result) {
+ printk(KERN_ERR PREFIX "Create sysfs link\n");
+ goto err_thermal_unregister;
+ }
+ result = sysfs_create_link(&pr->cdev->device.kobj,
+ &device->dev.kobj,
+ "device");
+ if (result) {
+ printk(KERN_ERR PREFIX "Create sysfs link\n");
+ goto err_remove_sysfs;
+ }
+ }
+
+ return 0;
+
+err_remove_sysfs:
+ sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
+err_thermal_unregister:
+ thermal_cooling_device_unregister(pr->cdev);
+err_power_exit:
+ acpi_processor_power_exit(pr, device);
+err_remove_fs:
+ acpi_processor_remove_fs(device);
+err_free_cpumask:
+ free_cpumask_var(pr->throttling.shared_cpu_map);
+
+ return result;
+}
+
+static void xen_acpi_processor_notify(struct acpi_device *device, u32 event)
+{
+ struct acpi_processor *pr = acpi_driver_data(device);
+ int saved;
+
+ if (!pr)
+ return;
+
+ switch (event) {
+ case ACPI_PROCESSOR_NOTIFY_PERFORMANCE:
+ saved = pr->performance_platform_limit;
+ xen_acpi_processor_ppc_has_changed(pr);
+ if (saved == pr->performance_platform_limit)
+ break;
+ acpi_bus_generate_proc_event(device, event,
+ pr->performance_platform_limit);
+ acpi_bus_generate_netlink_event(device->pnp.device_class,
+ dev_name(&device->dev), event,
+ pr->performance_platform_limit);
+ break;
+ case ACPI_PROCESSOR_NOTIFY_POWER:
+ xen_acpi_processor_cst_has_changed(pr);
+ acpi_bus_generate_proc_event(device, event, 0);
+ acpi_bus_generate_netlink_event(device->pnp.device_class,
+ dev_name(&device->dev), event, 0);
+ break;
+ case ACPI_PROCESSOR_NOTIFY_THROTTLING:
+ acpi_processor_tstate_has_changed(pr);
+ acpi_bus_generate_proc_event(device, event, 0);
+ acpi_bus_generate_netlink_event(device->pnp.device_class,
+ dev_name(&device->dev), event, 0);
+ break;
+ default:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Unsupported event [0x%x]\n", event));
+ break;
+ }
+
+ return;
+}
+
+/* from processor_idle.c */
+
+static int xen_acpi_processor_get_power_info(struct acpi_processor *pr)
+{
+ int ret;
+ int invalid_pr_id = 0;
+
+ /*
+ * acpi_processor_get_power_info() needs a valid pr->id,
+ * so set pr->id = 0 temporarily.
+ */
+ if (pr->id == -1) {
+ invalid_pr_id = 1;
+ pr->id = 0;
+ }
+
+ ret = acpi_processor_get_power_info(pr);
+
+ if (invalid_pr_id)
+ pr->id = -1;
+
+ return ret;
+}
+
+int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr)
+{
+ if (!pr)
+ return -EINVAL;
+
+ if (!pr->flags.power_setup_done)
+ return -ENODEV;
+
+ xen_acpi_processor_get_power_info(pr);
+
+ processor_cntl_xen_notify(pr,
+ PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
+
+ return 0;
+}
+
+
+int __cpuinit xen_acpi_processor_power_init(struct acpi_processor *pr,
+ struct acpi_device *device)
+{
+ acpi_status status = 0;
+ unsigned int i;
+
+ if (!pr)
+ return -EINVAL;
+
+ if (acpi_gbl_FADT.cst_control) {
+ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+ acpi_gbl_FADT.cst_control, 8);
+ if (ACPI_FAILURE(status)) {
+ ACPI_EXCEPTION((AE_INFO, status,
+ "Notifying BIOS of _CST ability failed"));
+ }
+ }
+
+ xen_acpi_processor_get_power_info(pr);
+
+ pr->flags.power_setup_done = 1;
+
+ if (pr->flags.power) {
+ processor_cntl_xen_notify(pr,
+ PROCESSOR_PM_INIT, PM_TYPE_IDLE);
+
+ printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
+ for (i = 1; i <= pr->power.count; i++)
+ if (pr->power.states[i].valid)
+ printk(" C%d[C%d]", i,
+ pr->power.states[i].type);
+ printk(")\n");
+ }
+
+ return 0;
+}
+
+/* from processor_perflib.c */
+
+#ifdef CONFIG_CPU_FREQ
+static int xen_processor_notify_smm(void)
+{
+ acpi_status status;
+ static int is_done;
+
+ /* We only need to notify the BIOS successfully once; avoid a
+ * double notification, which may lead to unexpected results. */
+ if (is_done)
+ return 0;
+
+ /* Can't write pstate_cnt to smi_cmd if either value is zero */
+ if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) {
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_cnt\n"));
+ return 0;
+ }
+
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n",
+ acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+
+ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+ (u32) acpi_gbl_FADT.pstate_control, 8);
+ if (ACPI_FAILURE(status))
+ return status;
+
+ is_done = 1;
+
+ return 0;
+}
+
+static int xen_acpi_processor_get_platform_limit(struct acpi_processor *pr)
+{
+ acpi_status status = 0;
+ unsigned long long ppc = 0;
+
+ if (!pr)
+ return -EINVAL;
+
+ /*
+ * _PPC indicates the maximum state currently supported by the platform
+ * (e.g. 0 = states 0..n; 1 = states 1..n; etc.)
+ */
+ status = acpi_evaluate_integer(pr->handle, "_PPC", NULL, &ppc);
+
+ if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
+ ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PPC"));
+ return -ENODEV;
+ }
+
+ pr->performance_platform_limit = (int)ppc;
+
+ return 0;
+}
+
+int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
+{
+ int ret;
+
+ ret = xen_acpi_processor_get_platform_limit(pr);
+
+ if (ret < 0)
+ return ret;
+ else
+ return processor_cntl_xen_notify(pr,
+ PROCESSOR_PM_CHANGE, PM_TYPE_PERF);
+}
+
+/*
+ * The existing ACPI code parses performance states when the
+ * acpi-cpufreq driver is loaded, which we want disabled to avoid
+ * conflicts with Xen's PM logic. So we collect the raw performance
+ * information here, when the ACPI processor object is found and
+ * started.
+ */
+int xen_acpi_processor_get_performance(struct acpi_processor *pr)
+{
+ int ret;
+ struct acpi_processor_performance *perf;
+ struct acpi_psd_package *pdomain;
+
+ if (pr->performance)
+ return -EBUSY;
+
+ perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL);
+ if (!perf)
+ return -ENOMEM;
+
+ pr->performance = perf;
+ /* Get basic performance state information */
+ ret = acpi_processor_get_performance_info(pr);
+ if (ret < 0)
+ goto err_out;
+
+ /*
+ * Retrieve performance dependency information from the _PSD
+ * object. The existing interface is not used because it builds
+ * bitmaps keyed by Linux cpu id, whereas we want to decouple
+ * ACPI processor objects from Linux cpu ids. For example, even
+ * when Linux is configured as UP, we still want to hand all ACPI
+ * processor objects to Xen, so the ACPI ID is preferred here.
+ */
+ pdomain = &pr->performance->domain_info;
+ pdomain->num_processors = 0;
+ ret = acpi_processor_get_psd(pr);
+ if (ret < 0) {
+ /*
+ * _PSD is optional - assume no coordination if absent (or
+ * broken), matching native kernels' behavior.
+ */
+ pdomain->num_entries = ACPI_PSD_REV0_ENTRIES;
+ pdomain->revision = ACPI_PSD_REV0_REVISION;
+ pdomain->domain = pr->acpi_id;
+ pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL;
+ pdomain->num_processors = 1;
+ }
+
+ /* Some sanity check */
+ if ((pdomain->revision != ACPI_PSD_REV0_REVISION) ||
+ (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) ||
+ ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) &&
+ (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) &&
+ (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ /* Last step is to notify BIOS that xen exists */
+ xen_processor_notify_smm();
+
+ processor_cntl_xen_notify(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF);
+
+ return 0;
+err_out:
+ pr->performance = NULL;
+ kfree(perf);
+ return ret;
+}
+#endif /* CONFIG_CPU_FREQ */
+
+/* init and exit */
+
+int xen_acpi_processor_init(void)
+{
+ return acpi_bus_register_driver(&xen_acpi_processor_driver);
+}
+
+void xen_acpi_processor_exit(void)
+{
+ acpi_bus_unregister_driver(&xen_acpi_processor_driver);
+}
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 0458094..85a1308 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -19,6 +19,8 @@
#include <asm/io.h>
+#include <xen/acpi.h>
+
#include <acpi/acpi_bus.h>
#include <acpi/acpi_drivers.h>
@@ -200,6 +202,21 @@
return error;
}
+static void do_suspend(void)
+{
+ if (!xen_pv_acpi()) {
+ do_suspend_lowlevel();
+ return;
+ }
+
+ /*
+ * Xen will save and restore CPU context, so
+ * we can skip that and just go straight to
+ * the suspend.
+ */
+ acpi_enter_sleep_state(ACPI_STATE_S3);
+}
+
/**
* acpi_suspend_enter - Actually enter a sleep state.
* @pm_state: ignored
@@ -233,7 +250,7 @@
break;
case ACPI_STATE_S3:
- do_suspend_lowlevel();
+ do_suspend();
break;
}
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e0..f4a2b10 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -462,6 +462,7 @@
tristate "Xen virtual block device support"
depends on XEN
default y
+ select XEN_XENBUS_FRONTEND
help
This driver implements the front-end of the Xen virtual
block device driver. It communicates with a back-end driver
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a2e8977..6147a4e 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -42,10 +42,12 @@
#include <linux/module.h>
#include <linux/scatterlist.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>
+#include <xen/platform_pci.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
@@ -67,7 +69,7 @@
static const struct block_device_operations xlvbd_block_fops;
-#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
/*
* We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -76,6 +78,7 @@
*/
struct blkfront_info
{
+ struct mutex mutex;
struct xenbus_device *xbdev;
struct gendisk *gd;
int vdevice;
@@ -85,6 +88,7 @@
struct blkif_front_ring ring;
struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int evtchn, irq;
+ struct tasklet_struct tasklet;
struct request_queue *rq;
struct work_struct work;
struct gnttab_free_callback callback;
@@ -93,14 +97,12 @@
int feature_barrier;
int is_ready;
- /**
- * The number of people holding this device open. We won't allow a
- * hot-unplug unless this is 0.
- */
- int users;
+ spinlock_t io_lock;
};
-static DEFINE_SPINLOCK(blkif_io_lock);
+static unsigned int nr_minors;
+static unsigned long *minors;
+static DEFINE_SPINLOCK(minor_lock);
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
@@ -116,6 +118,10 @@
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
#define DEV_NAME "xvd" /* name in /dev */
@@ -136,6 +142,55 @@
info->shadow_free = id;
}
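+/*
+ * Allocated minors are tracked in a dynamically sized bitmap so that
+ * several vbds can share the xvd major without colliding; the bitmap
+ * is grown outside minor_lock and swapped in under it.
+ */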
+static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
+{
+ unsigned int end = minor + nr;
+ int rc;
+
+ if (end > nr_minors) {
+ unsigned long *bitmap, *old;
+
+ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+ GFP_KERNEL);
+ if (bitmap == NULL)
+ return -ENOMEM;
+
+ spin_lock(&minor_lock);
+ if (end > nr_minors) {
+ old = minors;
+ memcpy(bitmap, minors,
+ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
+ minors = bitmap;
+ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
+ } else
+ old = bitmap;
+ spin_unlock(&minor_lock);
+ kfree(old);
+ }
+
+ spin_lock(&minor_lock);
+ if (find_next_bit(minors, end, minor) >= end) {
+ for (; minor < end; ++minor)
+ __set_bit(minor, minors);
+ rc = 0;
+ } else
+ rc = -EBUSY;
+ spin_unlock(&minor_lock);
+
+ return rc;
+}
+
+static void xlbd_release_minors(unsigned int minor, unsigned int nr)
+{
+ unsigned int end = minor + nr;
+
+ BUG_ON(end > nr_minors);
+ spin_lock(&minor_lock);
+ for (; minor < end; ++minor)
+ __clear_bit(minor, minors);
+ spin_unlock(&minor_lock);
+}
+
static void blkif_restart_queue_callback(void *arg)
{
struct blkfront_info *info = (struct blkfront_info *)arg;
@@ -333,11 +388,12 @@
flush_requests(info);
}
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static int xlvbd_init_blk_queue(struct blkfront_info *info,
+ struct gendisk *gd, u16 sector_size)
{
struct request_queue *rq;
- rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+ rq = blk_init_queue(do_blkif_request, &info->io_lock);
if (rq == NULL)
return -1;
@@ -370,20 +426,84 @@
static int xlvbd_barrier(struct blkfront_info *info)
{
int err;
+ const char *barrier;
- err = blk_queue_ordered(info->rq,
- info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
- NULL);
+ switch (info->feature_barrier) {
+ case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
+ case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
+ case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
+ default: return -EINVAL;
+ }
+
+ err = blk_queue_ordered(info->rq, info->feature_barrier, NULL);
if (err)
return err;
printk(KERN_INFO "blkfront: %s: barriers %s\n",
- info->gd->disk_name,
- info->feature_barrier ? "enabled" : "disabled");
+ info->gd->disk_name, barrier);
return 0;
}
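+/*
+ * Map vbds exposed with emulated hdN/sdN device numbers onto the xvd
+ * name/minor space, each device class at its own fixed offset.
+ */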
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+ int major;
+ major = BLKIF_MAJOR(vdevice);
+ *minor = BLKIF_MINOR(vdevice);
+ switch (major) {
+ case XEN_IDE0_MAJOR:
+ *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = ((*minor / 64) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_IDE1_MAJOR:
+ *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK0_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK1_MAJOR:
+ case XEN_SCSI_DISK2_MAJOR:
+ case XEN_SCSI_DISK3_MAJOR:
+ case XEN_SCSI_DISK4_MAJOR:
+ case XEN_SCSI_DISK5_MAJOR:
+ case XEN_SCSI_DISK6_MAJOR:
+ case XEN_SCSI_DISK7_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK8_MAJOR:
+ case XEN_SCSI_DISK9_MAJOR:
+ case XEN_SCSI_DISK10_MAJOR:
+ case XEN_SCSI_DISK11_MAJOR:
+ case XEN_SCSI_DISK12_MAJOR:
+ case XEN_SCSI_DISK13_MAJOR:
+ case XEN_SCSI_DISK14_MAJOR:
+ case XEN_SCSI_DISK15_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XENVBD_MAJOR:
+ *offset = *minor / PARTS_PER_DISK;
+ break;
+ default:
+ printk(KERN_WARNING "blkfront: your disk configuration is "
+ "incorrect, please use an xvd device instead\n");
+ return -ENODEV;
+ }
+ return 0;
+}
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info,
@@ -391,7 +511,7 @@
{
struct gendisk *gd;
int nr_minors = 1;
- int err = -ENODEV;
+ int err;
unsigned int offset;
int minor;
int nr_parts;
@@ -406,21 +526,32 @@
}
if (!VDEV_IS_EXTENDED(info->vdevice)) {
- minor = BLKIF_MINOR(info->vdevice);
- nr_parts = PARTS_PER_DISK;
+ err = xen_translate_vdev(info->vdevice, &minor, &offset);
+ if (err)
+ return err;
+ nr_parts = PARTS_PER_DISK;
} else {
minor = BLKIF_MINOR_EXT(info->vdevice);
nr_parts = PARTS_PER_EXT_DISK;
+ offset = minor / nr_parts;
+ if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+ printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+ "emulated IDE disks,\n\t choose an xvd device name "
+ "from xvde on\n", info->vdevice);
}
+ err = -ENODEV;
if ((minor % nr_parts) == 0)
nr_minors = nr_parts;
+ err = xlbd_reserve_minors(minor, nr_minors);
+ if (err)
+ goto out;
+ err = -ENODEV;
+
gd = alloc_disk(nr_minors);
if (gd == NULL)
- goto out;
-
- offset = minor / nr_parts;
+ goto release;
if (nr_minors > 1) {
if (offset < 26)
@@ -447,16 +578,15 @@
gd->driverfs_dev = &(info->xbdev->dev);
set_capacity(gd, capacity);
- if (xlvbd_init_blk_queue(gd, sector_size)) {
+ if (xlvbd_init_blk_queue(info, gd, sector_size)) {
del_gendisk(gd);
- goto out;
+ goto release;
}
info->rq = gd->queue;
info->gd = gd;
- if (info->feature_barrier)
- xlvbd_barrier(info);
+ xlvbd_barrier(info);
if (vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);
@@ -469,10 +599,45 @@
return 0;
+ release:
+ xlbd_release_minors(minor, nr_minors);
out:
return err;
}
+static void xlvbd_release_gendisk(struct blkfront_info *info)
+{
+ unsigned int minor, nr_minors;
+ unsigned long flags;
+
+ if (info->rq == NULL)
+ return;
+
+ spin_lock_irqsave(&info->io_lock, flags);
+
+ /* No more blkif_request(). */
+ blk_stop_queue(info->rq);
+
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&info->callback);
+ spin_unlock_irqrestore(&info->io_lock, flags);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_scheduled_work();
+
+ del_gendisk(info->gd);
+
+ minor = info->gd->first_minor;
+ nr_minors = info->gd->minors;
+ xlbd_release_minors(minor, nr_minors);
+
+ blk_cleanup_queue(info->rq);
+ info->rq = NULL;
+
+ put_disk(info->gd);
+ info->gd = NULL;
+}
+
static void kick_pending_request_queues(struct blkfront_info *info)
{
if (!RING_FULL(&info->ring)) {
@@ -487,16 +652,16 @@
{
struct blkfront_info *info = container_of(work, struct blkfront_info, work);
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
if (info->connected == BLKIF_STATE_CONNECTED)
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
}
static void blkif_free(struct blkfront_info *info, int suspend)
{
/* Prevent new requests being issued until we fix things up. */
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
@@ -504,7 +669,7 @@
blk_stop_queue(info->rq);
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
/* Flush gnttab callback work. Must be done with no locks held. */
flush_scheduled_work();
@@ -529,21 +694,20 @@
gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
}
-static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+static void
+blkif_do_interrupt(unsigned long data)
{
+ struct blkfront_info *info = (struct blkfront_info *)data;
struct request *req;
struct blkif_response *bret;
RING_IDX i, rp;
unsigned long flags;
- struct blkfront_info *info = (struct blkfront_info *)dev_id;
int error;
- spin_lock_irqsave(&blkif_io_lock, flags);
+ spin_lock_irqsave(&info->io_lock, flags);
- if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
- spin_unlock_irqrestore(&blkif_io_lock, flags);
- return IRQ_HANDLED;
- }
+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+ goto out;
again:
rp = info->ring.sring->rsp_prod;
@@ -567,7 +731,7 @@
printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
info->gd->disk_name);
error = -EOPNOTSUPP;
- info->feature_barrier = 0;
+ info->feature_barrier = QUEUE_ORDERED_NONE;
xlvbd_barrier(info);
}
/* fall through */
@@ -596,7 +760,17 @@
kick_pending_request_queues(info);
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+out:
+ spin_unlock_irqrestore(&info->io_lock, flags);
+}
+
+
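+/*
+ * The hard interrupt handler only schedules the tasklet; the response
+ * ring is drained by blkif_do_interrupt() outside hard-irq context.
+ */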
+static irqreturn_t
+blkif_interrupt(int irq, void *dev_id)
+{
+ struct blkfront_info *info = (struct blkfront_info *)dev_id;
+
+ tasklet_schedule(&info->tasklet);
return IRQ_HANDLED;
}
@@ -650,7 +824,7 @@
/* Common code used when first setting up, and when resuming. */
-static int talk_to_backend(struct xenbus_device *dev,
+static int talk_to_blkback(struct xenbus_device *dev,
struct blkfront_info *info)
{
const char *message = NULL;
@@ -710,7 +884,6 @@
return err;
}
-
/**
* Entry point to this code when a new device is created. Allocate the basic
* structures and the ring buffer for communication with the backend, and
@@ -736,16 +909,48 @@
}
}
+ if (xen_hvm_domain()) {
+ char *type;
+ int len;
+ /* no unplug has been done: do not hook devices != xen vbds */
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
+ int major;
+
+ if (!VDEV_IS_EXTENDED(vdevice))
+ major = BLKIF_MAJOR(vdevice);
+ else
+ major = XENVBD_MAJOR;
+
+ if (major != XENVBD_MAJOR) {
+ printk(KERN_INFO
+ "%s: HVM does not support vbd %d as xen block device\n",
+ __FUNCTION__, vdevice);
+ return -ENODEV;
+ }
+ }
+ /* do not create a PV cdrom device if we are an HVM guest */
+ type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
+ if (IS_ERR(type))
+ return -ENODEV;
+ if (strncmp(type, "cdrom", 5) == 0) {
+ kfree(type);
+ return -ENODEV;
+ }
+ kfree(type);
+ }
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
return -ENOMEM;
}
+ mutex_init(&info->mutex);
info->xbdev = dev;
info->vdevice = vdevice;
info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
+ spin_lock_init(&info->io_lock);
+ tasklet_init(&info->tasklet, blkif_do_interrupt, (unsigned long)info);
for (i = 0; i < BLK_RING_SIZE; i++)
info->shadow[i].req.id = i+1;
@@ -755,7 +960,7 @@
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
dev_set_drvdata(&dev->dev, info);
- err = talk_to_backend(dev, info);
+ err = talk_to_blkback(dev, info);
if (err) {
kfree(info);
dev_set_drvdata(&dev->dev, NULL);
@@ -819,7 +1024,7 @@
xenbus_switch_state(info->xbdev, XenbusStateConnected);
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
@@ -830,7 +1035,7 @@
/* Kick any other new requests queued since we resumed */
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
return 0;
}
@@ -850,13 +1055,50 @@
blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
- err = talk_to_backend(dev, info);
+ err = talk_to_blkback(dev, info);
if (info->connected == BLKIF_STATE_SUSPENDED && !err)
err = blkif_recover(info);
return err;
}
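+/*
+ * Acknowledge the backend's Closing request, but only release the
+ * gendisk once no one holds the block device open any longer.
+ */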
+static void
+blkfront_closing(struct blkfront_info *info)
+{
+ struct xenbus_device *xbdev = info->xbdev;
+ struct block_device *bdev = NULL;
+
+ mutex_lock(&info->mutex);
+
+ if (xbdev->state == XenbusStateClosing) {
+ mutex_unlock(&info->mutex);
+ return;
+ }
+
+ if (info->gd)
+ bdev = bdget_disk(info->gd, 0);
+
+ mutex_unlock(&info->mutex);
+
+ if (!bdev) {
+ xenbus_frontend_closed(xbdev);
+ return;
+ }
+
+ mutex_lock(&bdev->bd_mutex);
+
+ if (bdev->bd_openers) {
+ xenbus_dev_error(xbdev, -EBUSY,
+ "Device in use; refusing to close");
+ xenbus_switch_state(xbdev, XenbusStateClosing);
+ } else {
+ xlvbd_release_gendisk(info);
+ xenbus_frontend_closed(xbdev);
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+}
/*
* Invoked when the backend is finally 'ready' (and has told produced
@@ -868,11 +1110,31 @@
unsigned long sector_size;
unsigned int binfo;
int err;
+ int barrier;
- if ((info->connected == BLKIF_STATE_CONNECTED) ||
- (info->connected == BLKIF_STATE_SUSPENDED) )
+ switch (info->connected) {
+ case BLKIF_STATE_CONNECTED:
+ /*
+ * Potentially, the back-end may be signalling
+ * a capacity change; update the capacity.
+ */
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "sectors", "%Lu", &sectors);
+ if (XENBUS_EXIST_ERR(err))
+ return;
+ printk(KERN_INFO "Setting capacity to %Lu\n",
+ sectors);
+ set_capacity(info->gd, sectors);
+ revalidate_disk(info->gd);
+
+ /* fall through */
+ case BLKIF_STATE_SUSPENDED:
return;
+ default:
+ break;
+ }
+
dev_dbg(&info->xbdev->dev, "%s:%s.\n",
__func__, info->xbdev->otherend);
@@ -889,10 +1151,26 @@
}
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-barrier", "%d", &info->feature_barrier,
+ "feature-barrier", "%d", &barrier,
NULL);
+
+ /*
+ * If there's no "feature-barrier" defined, then it means
+ * we're dealing with a very old backend which writes
+ * synchronously; draining will do what needs to get done.
+ *
+ * If there are barriers, then we can do full queued writes
+ * with tagged barriers.
+ *
+ * If barriers are not supported, then there's not much we
+ * can do, so just set the ordering to NONE.
+ */
if (err)
- info->feature_barrier = 0;
+ info->feature_barrier = QUEUE_ORDERED_DRAIN;
+ else if (barrier)
+ info->feature_barrier = QUEUE_ORDERED_TAG;
+ else
+ info->feature_barrier = QUEUE_ORDERED_NONE;
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
@@ -904,10 +1182,10 @@
xenbus_switch_state(info->xbdev, XenbusStateConnected);
/* Kick pending requests. */
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
info->connected = BLKIF_STATE_CONNECTED;
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
add_disk(info->gd);
@@ -915,57 +1193,21 @@
}
/**
- * Handle the change of state of the backend to Closing. We must delete our
- * device-layer structures now, to ensure that writes are flushed through to
- * the backend. Once is this done, we can switch to Closed in
- * acknowledgement.
- */
-static void blkfront_closing(struct xenbus_device *dev)
-{
- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
- unsigned long flags;
-
- dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
-
- if (info->rq == NULL)
- goto out;
-
- spin_lock_irqsave(&blkif_io_lock, flags);
-
- /* No more blkif_request(). */
- blk_stop_queue(info->rq);
-
- /* No more gnttab callback work. */
- gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irqrestore(&blkif_io_lock, flags);
-
- /* Flush gnttab callback work. Must be done with no locks held. */
- flush_scheduled_work();
-
- blk_cleanup_queue(info->rq);
- info->rq = NULL;
-
- del_gendisk(info->gd);
-
- out:
- xenbus_frontend_closed(dev);
-}
-
-/**
* Callback received when the backend's state changes.
*/
-static void backend_changed(struct xenbus_device *dev,
+static void blkback_changed(struct xenbus_device *dev,
enum xenbus_state backend_state)
{
struct blkfront_info *info = dev_get_drvdata(&dev->dev);
- struct block_device *bd;
- dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
+ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
switch (backend_state) {
case XenbusStateInitialising:
case XenbusStateInitWait:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
case XenbusStateUnknown:
case XenbusStateClosed:
break;
@@ -975,35 +1217,56 @@
break;
case XenbusStateClosing:
- if (info->gd == NULL) {
- xenbus_frontend_closed(dev);
- break;
- }
- bd = bdget_disk(info->gd, 0);
- if (bd == NULL)
- xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-
- mutex_lock(&bd->bd_mutex);
- if (info->users > 0)
- xenbus_dev_error(dev, -EBUSY,
- "Device in use; refusing to close");
- else
- blkfront_closing(dev);
- mutex_unlock(&bd->bd_mutex);
- bdput(bd);
+ blkfront_closing(info);
break;
}
}
-static int blkfront_remove(struct xenbus_device *dev)
+static int blkfront_remove(struct xenbus_device *xbdev)
{
- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+ struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
+ struct block_device *bdev = NULL;
+ struct gendisk *disk;
- dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
+ dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
blkif_free(info, 0);
- kfree(info);
+ mutex_lock(&info->mutex);
+
+ disk = info->gd;
+ if (disk)
+ bdev = bdget_disk(disk, 0);
+
+ info->xbdev = NULL;
+ mutex_unlock(&info->mutex);
+
+ if (!bdev) {
+ kfree(info);
+ return 0;
+ }
+
+ /*
+ * The xbdev was removed before we reached the Closed
+ * state. See if it's safe to remove the disk. If the bdev
+ * isn't closed yet, we let release take care of it.
+ */
+
+ mutex_lock(&bdev->bd_mutex);
+ info = disk->private_data;
+
+ dev_warn(disk_to_dev(disk),
+ "%s was hot-unplugged, %d stale handles\n",
+ xbdev->nodename, bdev->bd_openers);
+
+ if (info && !bdev->bd_openers) {
+ xlvbd_release_gendisk(info);
+ disk->private_data = NULL;
+ kfree(info);
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
return 0;
}
@@ -1012,30 +1275,68 @@
{
struct blkfront_info *info = dev_get_drvdata(&dev->dev);
- return info->is_ready;
+ return info->is_ready && info->xbdev;
}
static int blkif_open(struct block_device *bdev, fmode_t mode)
{
- struct blkfront_info *info = bdev->bd_disk->private_data;
- info->users++;
- return 0;
+ struct gendisk *disk = bdev->bd_disk;
+ struct blkfront_info *info;
+ int err = 0;
+
+ info = disk->private_data;
+ if (!info)
+ /* xbdev gone */
+ return -ERESTARTSYS;
+
+ mutex_lock(&info->mutex);
+
+ if (!info->gd)
+ /* xbdev is closed */
+ err = -ERESTARTSYS;
+
+ mutex_unlock(&info->mutex);
+
+ return err;
}
static int blkif_release(struct gendisk *disk, fmode_t mode)
{
struct blkfront_info *info = disk->private_data;
- info->users--;
- if (info->users == 0) {
- /* Check whether we have been instructed to close. We will
- have ignored this request initially, as the device was
- still mounted. */
- struct xenbus_device *dev = info->xbdev;
- enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+ struct block_device *bdev;
+ struct xenbus_device *xbdev;
- if (state == XenbusStateClosing && info->is_ready)
- blkfront_closing(dev);
+ bdev = bdget_disk(disk, 0);
+
+ if (bdev->bd_openers) {
+ /* Drop our reference; the remaining openers keep the disk alive. */
+ bdput(bdev);
+ return 0;
+ }
+
+ /*
+ * Check if we have been instructed to close. We will have
+ * deferred this request, because the bdev was still open.
+ */
+
+ mutex_lock(&info->mutex);
+ xbdev = info->xbdev;
+
+ if (xbdev && xbdev->state == XenbusStateClosing) {
+ /* pending switch to state closed */
+ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+ xlvbd_release_gendisk(info);
+ xenbus_frontend_closed(info->xbdev);
}
+
+ mutex_unlock(&info->mutex);
+
+ if (!xbdev) {
+ /* sudden device removal */
+ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+ xlvbd_release_gendisk(info);
+ disk->private_data = NULL;
+ kfree(info);
+ }
+
+ bdput(bdev);
+
return 0;
}
@@ -1061,7 +1362,7 @@
.probe = blkfront_probe,
.remove = blkfront_remove,
.resume = blkfront_resume,
- .otherend_changed = backend_changed,
+ .otherend_changed = blkback_changed,
.is_ready = blkfront_is_ready,
};
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index c496c8a..4064d95 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -18,6 +18,8 @@
#include <asm/k8.h>
#include <asm/gart.h>
#include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
/* NVIDIA K8 registers */
#define NVIDIA_X86_64_0_APBASE 0x10
@@ -78,8 +80,21 @@
}
for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+ phys_addr_t phys = page_to_phys(mem->pages[i]);
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(mem->pages[i])));
+ if (phys != xen_phys) {
+ printk(KERN_ERR "Fixing up GART: (0x%lx->0x%lx)." \
+ " CODE UNTESTED!\n",
+ (unsigned long)phys,
+ (unsigned long)xen_phys);
+ WARN_ON_ONCE(phys != xen_phys);
+ phys = xen_phys;
+ }
+ }
tmp = agp_bridge->driver->mask_memory(agp_bridge,
- page_to_phys(mem->pages[i]),
+ phys,
mask_type);
BUG_ON(tmp & 0xffffff0000000ffcULL);
@@ -181,6 +196,20 @@
unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
int i;
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ virt_to_pfn(agp_bridge->gatt_table_real)));
+ /* Future thoughts: Perhaps use the gatt_table_bus that
+ * agp_generic_create_gatt_table has setup instead of
+ * doing the virt_to_phys once more? */
+ if (gatt_bus != xen_phys) {
+ printk(KERN_ERR "Fixing up GATT: (0x%lx->0x%lx)." \
+ " CODE UNTESTED!\n", gatt_bus,
+ (unsigned long)xen_phys);
+ WARN_ON_ONCE(gatt_bus != xen_phys);
+ gatt_bus = xen_phys;
+ }
+ }
/* Configure AGP regs in each x86-64 host bridge. */
for (i = 0; i < num_k8_northbridges; i++) {
agp_bridge->gart_bus_addr =
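The same fixup recurs throughout these AGP hunks: under a Xen PV domain,
page_to_phys() returns a pseudo-physical address, while the GART must be
programmed with the machine address, so the frame number is routed through
pfn_to_mfn(). A minimal sketch of the shared pattern (the helper name is
ours, not part of the patch):

	static phys_addr_t xen_agp_bus_addr(struct page *page)
	{
		phys_addr_t phys = page_to_phys(page);

		if (xen_pv_domain()) {
			/* Translate the pseudo-physical frame to the machine frame. */
			phys_addr_t mach = PFN_PHYS(pfn_to_mfn(page_to_pfn(page)));

			if (mach != phys)
				phys = mach;
		}
		return phys;
	}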
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index a56ca08..30fc4b6 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -38,6 +38,8 @@
#include <linux/vmalloc.h>
#include <asm/io.h>
#include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
/* Due to XFree86 brain-damage, we can't go to 1.0 until they
* fix some real stupidity. It's only by chance we can bump
@@ -160,8 +162,13 @@
}
} else {
bridge->scratch_page_dma = page_to_phys(page);
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(page)));
+ if (bridge->scratch_page_dma != xen_phys)
+ bridge->scratch_page_dma = xen_phys;
+ }
}
-
bridge->scratch_page = bridge->driver->mask_memory(bridge,
bridge->scratch_page_dma, 0);
}
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 0d8d60c..c20ef5c 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -42,6 +42,8 @@
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
__u32 *agp_gatt_table;
int agp_memory_reserved;
@@ -1008,6 +1010,14 @@
return -ENOMEM;
}
bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real);
+ /* KRW: virt_to_phys under Xen is not safe. */
+ if (xen_pv_domain()) {
+ /* Use back-door to get the "real" PFN. */
+ phys_addr_t pfn = virt_to_pfn(bridge->gatt_table_real);
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(pfn));
+ if (bridge->gatt_bus_addr != xen_phys)
+ bridge->gatt_bus_addr = xen_phys;
+ }
/* AK: bogus, should encode addresses > 4GB */
for (i = 0; i < num_entries; i++) {
@@ -1147,8 +1157,17 @@
}
for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+ phys_addr_t phys = page_to_phys(mem->pages[i]);
+
+ /* HACK: Via a back-door we get the bus address. */
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(mem->pages[i])));
+ if (phys != xen_phys)
+ phys = xen_phys;
+ }
writel(bridge->driver->mask_memory(bridge,
- page_to_phys(mem->pages[i]),
+ phys,
mask_type),
bridge->gatt_table+j);
}
@@ -1246,7 +1265,16 @@
int i, ret = -ENOMEM;
for (i = 0; i < num_pages; i++) {
- page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+ if (xen_pv_domain()) {
+ void *addr;
+ dma_addr_t _d;
+
+ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
+ if (!addr)
+ goto out;
+ page = virt_to_page(addr);
+ } else
+ page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
/* agp_free_memory() needs gart address */
if (page == NULL)
goto out;
@@ -1274,7 +1302,17 @@
{
struct page * page;
- page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+ if (xen_pv_domain()) {
+ void *addr;
+ dma_addr_t _d;
+
+ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
+ if (!addr)
+ return NULL;
+ page = virt_to_page(addr);
+ } else
+ page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+
if (page == NULL)
return NULL;
@@ -1305,7 +1343,12 @@
unmap_page_from_agp(page);
#endif
put_page(page);
- __free_page(page);
+ if (xen_pv_domain()) {
+ void *addr = page_address(page);
+ dma_free_coherent(NULL, PAGE_SIZE, addr,
+ virt_to_bus(addr));
+ } else
+ __free_page(page);
atomic_dec(&agp_bridge->current_memory_agp);
mem->pages[i] = NULL;
}
@@ -1322,7 +1365,12 @@
if (flags & AGP_PAGE_DESTROY_FREE) {
put_page(page);
- __free_page(page);
+ if (xen_pv_domain()) {
+ void *addr = page_address(page);
+ dma_free_coherent(NULL, PAGE_SIZE, addr,
+ virt_to_bus(addr));
+ } else
+ __free_page(page);
atomic_dec(&agp_bridge->current_memory_agp);
}
}
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index b8e0219..7a62c3c 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -10,14 +10,20 @@
#include <linux/agp_backend.h>
#include <asm/smp.h>
#include "agp.h"
+#include <xen/page.h>
+#include <asm/xen/page.h>
/*
* If we have Intel graphics, we're not going to have anything other than
* an Intel IOMMU. So make the correct use of the PCI DMA API contingent
* on the Intel IOMMU support (CONFIG_DMAR).
* Only newer chipsets need to bother with this, of course.
+ *
+ * Xen guests accessing graphics hardware also need proper translation
+ * between pseudo-physical addresses and real machine addresses, which
+ * is also achieved by using the DMA API.
*/
-#ifdef CONFIG_DMAR
+#if defined(CONFIG_DMAR) || defined(CONFIG_XEN)
#define USE_PCI_DMA_API 1
#endif
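A hedged illustration of why the DMA API is sufficient here: with
USE_PCI_DMA_API defined, the mapping paths are assumed to go through
pci_map_page() and friends, which already return translated bus addresses,
so no explicit pfn_to_mfn() fixup is needed before writing GTT entries:

	/* Assumed pattern when USE_PCI_DMA_API is defined: */
	dma_addr_t bus = pci_map_page(intel_private.pcidev, page, 0,
				      PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
	/* 'bus' is already a machine (or IO-virtual) address suitable
	 * for the GTT; Xen or the IOMMU has done the translation. */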
@@ -296,8 +302,20 @@
int i, j;
for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+ phys_addr_t phys = page_to_phys(mem->pages[i]);
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(mem->pages[i])));
+ if (xen_phys != phys) {
+ printk(KERN_ERR "Compile kernel with " \
+ "CONFIG_DMAR to get rid of this " \
+ "warning!\n");
+ WARN_ON_ONCE(xen_phys != phys);
+ /* Fixup: */
+ phys = xen_phys;
+ }
writel(agp_bridge->driver->mask_memory(agp_bridge,
- page_to_phys(mem->pages[i]), mask_type),
+ phys, mask_type),
intel_private.gtt+j);
}
@@ -395,15 +413,19 @@
/* Exists to support ARGB cursors */
static struct page *i8xx_alloc_pages(void)
{
+ void *addr;
+ dma_addr_t _d;
struct page *page;
- page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
- if (page == NULL)
+ addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
+ if (addr == NULL)
return NULL;
+ page = virt_to_page(addr);
+
if (set_pages_uc(page, 4) < 0) {
set_pages_wb(page, 4);
- __free_pages(page, 2);
+ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
return NULL;
}
get_page(page);
@@ -413,12 +435,17 @@
static void i8xx_destroy_pages(struct page *page)
{
+ void *addr;
+
if (page == NULL)
return;
set_pages_wb(page, 4);
put_page(page);
- __free_pages(page, 2);
+
+ addr = page_address(page);
+
+ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr));
atomic_dec(&agp_bridge->current_memory_agp);
}
@@ -478,8 +505,16 @@
if (!mem->is_flushed)
global_cache_flush();
for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+ phys_addr_t phys = page_to_phys(mem->pages[i]);
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(mem->pages[i])));
+ /* Fixup: */
+ if (xen_phys != phys)
+ phys = xen_phys;
+ }
writel(agp_bridge->driver->mask_memory(agp_bridge,
- page_to_phys(mem->pages[i]), mask_type),
+ phys, mask_type),
intel_private.registers+I810_PTE_BASE+(j*4));
}
readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
@@ -552,6 +587,12 @@
new->num_scratch_pages = pg_count;
new->type = AGP_PHYS_MEMORY;
new->physical = page_to_phys(new->pages[0]);
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(new->pages[0])));
+ if (xen_phys != new->physical)
+ new->physical = xen_phys;
+ }
return new;
}
@@ -992,8 +1033,16 @@
global_cache_flush();
for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
+ phys_addr_t phys = page_to_phys(mem->pages[i]);
+ if (xen_pv_domain()) {
+ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
+ page_to_pfn(mem->pages[i])));
+ /* Fixup: */
+ if (xen_phys != phys)
+ phys = xen_phys;
+ }
writel(agp_bridge->driver->mask_memory(agp_bridge,
- page_to_phys(mem->pages[i]), mask_type),
+ phys, mask_type),
intel_private.registers+I810_PTE_BASE+(j*4));
}
readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
index a6ee32b..5620de5 100644
--- a/drivers/char/hvc_xen.c
+++ b/drivers/char/hvc_xen.c
@@ -25,6 +25,8 @@
#include <linux/types.h>
#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>
#include <xen/interface/io/console.h>
@@ -72,11 +74,12 @@
wmb(); /* write ring before updating pointer */
intf->out_prod = prod;
- notify_daemon();
+ if (sent)
+ notify_daemon();
return sent;
}
-static int write_console(uint32_t vtermno, const char *data, int len)
+static int domU_write_console(uint32_t vtermno, const char *data, int len)
{
int ret = len;
@@ -99,7 +102,7 @@
return ret;
}
-static int read_console(uint32_t vtermno, char *buf, int len)
+static int domU_read_console(uint32_t vtermno, char *buf, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
@@ -120,28 +123,65 @@
return recv;
}
-static struct hv_ops hvc_ops = {
- .get_chars = read_console,
- .put_chars = write_console,
+static struct hv_ops domU_hvc_ops = {
+ .get_chars = domU_read_console,
+ .put_chars = domU_write_console,
.notifier_add = notifier_add_irq,
.notifier_del = notifier_del_irq,
.notifier_hangup = notifier_hangup_irq,
};
-static int __init xen_init(void)
+static int dom0_read_console(uint32_t vtermno, char *buf, int len)
+{
+ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf);
+}
+
+/*
+ * Either for a dom0 to write to the system console, or a domU with a
+ * debug version of Xen
+ */
+static int dom0_write_console(uint32_t vtermno, const char *str, int len)
+{
+ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
+ if (rc < 0)
+ return 0;
+
+ return len;
+}
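HYPERVISOR_console_io() may complete fewer bytes than requested; the
raw_console_write() helper removed further down looped until the buffer
drained. A minimal sketch of such a wrapper, reusing the removed loop
(the function name is ours):

	static void dom0_write_console_all(const char *str, int len)
	{
		while (len > 0) {
			int rc = HYPERVISOR_console_io(CONSOLEIO_write, len,
						       (char *)str);
			if (rc <= 0)
				break;
			str += rc;
			len -= rc;
		}
	}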
+
+static struct hv_ops dom0_hvc_ops = {
+ .get_chars = dom0_read_console,
+ .put_chars = dom0_write_console,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
+ .notifier_hangup = notifier_hangup_irq,
+};
+
+static int __init xen_hvc_init(void)
{
struct hvc_struct *hp;
+ struct hv_ops *ops;
- if (!xen_pv_domain() ||
- xen_initial_domain() ||
- !xen_start_info->console.domU.evtchn)
+ if (!xen_pv_domain())
return -ENODEV;
- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
+ if (xen_initial_domain()) {
+ ops = &dom0_hvc_ops;
+ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
+ } else {
+ if (!xen_start_info->console.domU.evtchn)
+ return -ENODEV;
+
+ ops = &domU_hvc_ops;
+ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
+ }
+
if (xencons_irq < 0)
xencons_irq = 0; /* NO_IRQ */
+ else
+ set_irq_noprobe(xencons_irq);
- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
+ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256);
if (IS_ERR(hp))
return PTR_ERR(hp);
@@ -158,7 +198,7 @@
rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq);
}
-static void __exit xen_fini(void)
+static void __exit xen_hvc_fini(void)
{
if (hvc)
hvc_remove(hvc);
@@ -166,29 +206,24 @@
static int xen_cons_init(void)
{
+ struct hv_ops *ops;
+
if (!xen_pv_domain())
return 0;
- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
+ ops = &domU_hvc_ops;
+ if (xen_initial_domain())
+ ops = &dom0_hvc_ops;
+
+ hvc_instantiate(HVC_COOKIE, 0, ops);
+
return 0;
}
-module_init(xen_init);
-module_exit(xen_fini);
+module_init(xen_hvc_init);
+module_exit(xen_hvc_fini);
console_initcall(xen_cons_init);
-static void raw_console_write(const char *str, int len)
-{
- while(len > 0) {
- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
- if (rc <= 0)
- break;
-
- str += rc;
- len -= rc;
- }
-}
-
#ifdef CONFIG_EARLY_PRINTK
static void xenboot_write_console(struct console *console, const char *string,
unsigned len)
@@ -196,19 +231,22 @@
unsigned int linelen, off = 0;
const char *pos;
- raw_console_write(string, len);
+ dom0_write_console(0, string, len);
- write_console(0, "(early) ", 8);
+ if (xen_initial_domain())
+ return;
+
+ domU_write_console(0, "(early) ", 8);
while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
linelen = pos-string+off;
if (off + linelen > len)
break;
- write_console(0, string+off, linelen);
- write_console(0, "\r\n", 2);
+ domU_write_console(0, string+off, linelen);
+ domU_write_console(0, "\r\n", 2);
off += linelen + 1;
}
if (off < len)
- write_console(0, string+off, len-off);
+ domU_write_console(0, string+off, len-off);
}
struct console xenboot_console = {
@@ -220,7 +258,7 @@
void xen_raw_console_write(const char *str)
{
- raw_console_write(str, strlen(str));
+ dom0_write_console(0, str, strlen(str));
}
void xen_raw_printk(const char *fmt, ...)
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index cbaf420..163459d 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -8,7 +8,6 @@
#include <linux/bug.h>
#include <linux/device.h>
-#include <linux/ethtool.h>
#include <linux/firewire.h>
#include <linux/firewire-constants.h>
#include <linux/highmem.h>
@@ -1333,17 +1332,6 @@
return 0;
}
-static void fwnet_get_drvinfo(struct net_device *net,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, KBUILD_MODNAME);
- strcpy(info->bus_info, "ieee1394");
-}
-
-static const struct ethtool_ops fwnet_ethtool_ops = {
- .get_drvinfo = fwnet_get_drvinfo,
-};
-
static const struct net_device_ops fwnet_netdev_ops = {
.ndo_open = fwnet_open,
.ndo_stop = fwnet_stop,
@@ -1362,7 +1350,6 @@
net->hard_header_len = FWNET_HLEN;
net->type = ARPHRD_IEEE1394;
net->tx_queue_len = 10;
- SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops);
}
/* caller must hold fwnet_device_mutex */
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 0e27d98..f5e2572 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -201,7 +201,7 @@
}
if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
!drm_core_check_feature(dev, DRIVER_MODESET)) {
- drm_sg_cleanup(dev->sg);
+ drm_sg_cleanup(dev, dev->sg);
dev->sg = NULL;
}
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 8bf3770..dde5f66 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -539,7 +539,7 @@
vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
vma->vm_ops = obj->dev->driver->gem_vm_ops;
vma->vm_private_data = map->handle;
- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
/* Take a ref for this mapping of the object, so that the fault
* handler can dereference the mmap offset's pointer to the object.
diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
index c7823c8..95ffb8a 100644
--- a/drivers/gpu/drm/drm_scatter.c
+++ b/drivers/gpu/drm/drm_scatter.c
@@ -32,20 +32,73 @@
*/
#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "drmP.h"
#define DEBUG_SCATTER 0
-static inline void *drm_vmalloc_dma(unsigned long size)
+static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size)
{
#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE);
#else
- return vmalloc_32(size);
+ struct device *dev = &drmdev->pdev->dev;
+ struct page **pages;
+ void *addr;
+ const int npages = PFN_UP(size);
+ int i;
+
+ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ goto fail;
+
+ for (i = 0; i < npages; i++) {
+ dma_addr_t phys;
+ void *addr;
+ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL);
+ if (addr == NULL)
+ goto out_free_pages;
+
+ pages[i] = virt_to_page(addr);
+ }
+
+ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
+
+ kfree(pages);
+
+ return addr;
+
+out_free_pages:
+ while (i > 0) {
+ void *addr = page_address(pages[--i]);
+ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
+ }
+
+ kfree(pages);
+
+fail:
+ return NULL;
#endif
}
-void drm_sg_cleanup(struct drm_sg_mem * entry)
+static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages,
+ struct page **pages)
+{
+#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
+ vfree(addr);
+#else
+ struct device *dev = &drmdev->pdev->dev;
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ void *addr = page_address(pages[i]);
+ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
+ }
+ vunmap(addr);
+#endif
+}
+
+void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry)
{
struct page *page;
int i;
@@ -56,7 +109,7 @@
ClearPageReserved(page);
}
- vfree(entry->virtual);
+ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist);
kfree(entry->busaddr);
kfree(entry->pagelist);
@@ -107,7 +160,7 @@
}
memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr));
- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
+ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT);
if (!entry->virtual) {
kfree(entry->busaddr);
kfree(entry->pagelist);
@@ -180,7 +233,7 @@
return 0;
failed:
- drm_sg_cleanup(entry);
+ drm_sg_cleanup(dev, entry);
return -ENOMEM;
}
EXPORT_SYMBOL(drm_sg_alloc);
@@ -212,7 +265,7 @@
DRM_DEBUG("virtual = %p\n", entry->virtual);
- drm_sg_cleanup(entry);
+ drm_sg_cleanup(dev, entry);
return 0;
}
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 1c040d0..e3555bf 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -87,6 +87,9 @@
bool is_iomem;
unsigned long address = (unsigned long)vmf->virtual_address;
int retval = VM_FAULT_NOPAGE;
+ bool vm_io = !!(vma->vm_flags & VM_IO);
+ bool pte_iomap = !!(pgprot_val(vma->vm_page_prot) & _PAGE_IOMAP);
/*
* Work around locking order reversal in fault / nopfn
@@ -158,11 +161,30 @@
if (is_iomem) {
vma->vm_page_prot = ttm_io_prot(bo->mem.placement,
vma->vm_page_prot);
+ if (!vm_io || !pte_iomap) {
+ vma->vm_flags |= VM_IO;
+ pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
+ }
} else {
ttm = bo->ttm;
vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
vm_get_page_prot(vma->vm_flags) :
ttm_io_prot(bo->mem.placement, vma->vm_page_prot);
+ /*
+ * During PCI suspend the graphics cards purge their VRAM and
+ * move their graphics objects to the TT. They also unmap all
+ * of the objects, meaning that when a user application is
+ * unfrozen it will re-fault and call here.
+ *
+ * What this means is that the VMA for the graphics object might
+ * have been set up for VRAM TTM but is now backed by the TT
+ * (normal RAM), meaning that vma->vm_flags could be
+ * inappropriate (say, VM_IO on TT - no good).
+ */
+ if (vm_io || pte_iomap) {
+ vma->vm_flags &= ~VM_IO;
+ pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP;
+ }
}
/*
@@ -239,6 +261,7 @@
{
struct ttm_bo_driver *driver;
struct ttm_buffer_object *bo;
+ struct ttm_mem_type_manager *man;
int ret;
read_lock(&bdev->vm_lock);
@@ -271,7 +294,11 @@
*/
vma->vm_private_data = bo;
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
+ vma->vm_flags |= VM_RESERVED | VM_MIXEDMAP | VM_DONTEXPAND;
+ man = &bdev->man[bo->mem.mem_type];
+ if (man->flags & TTM_MEMTYPE_FLAG_NEEDS_IOREMAP)
+ vma->vm_flags |= VM_IO;
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
return 0;
out_unref:
ttm_bo_unref(&bo);
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 3d5b8b0..8b05e38 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -38,7 +38,8 @@
#include "ttm/ttm_module.h"
#include "ttm/ttm_bo_driver.h"
#include "ttm/ttm_placement.h"
-
+#include <linux/dma-mapping.h>
+#include <xen/xen.h>
static int ttm_tt_swapin(struct ttm_tt *ttm);
/**
@@ -84,6 +85,16 @@
else
gfp_flags |= __GFP_HIGHMEM;
+ if ((page_flags & TTM_PAGE_FLAG_DMA32) && xen_pv_domain()) {
+ void *addr;
+ dma_addr_t _d;
+
+ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
+ if (addr == NULL)
+ return NULL;
+ return virt_to_page(addr);
+ }
return alloc_page(gfp_flags);
}
@@ -286,6 +297,7 @@
int i;
struct page *cur_page;
struct ttm_backend *be = ttm->be;
+ void *addr;
if (be)
be->func->clear(be);
@@ -300,7 +312,16 @@
"Leaking pages.\n");
ttm_mem_global_free_page(ttm->glob->mem_glob,
cur_page);
- __free_page(cur_page);
+
+ if ((ttm->page_flags & TTM_PAGE_FLAG_DMA32) &&
+ xen_pv_domain()) {
+ addr = page_address(cur_page);
+ WARN_ON(!addr);
+ if (addr)
+ dma_free_coherent(NULL, PAGE_SIZE, addr,
+ virt_to_bus(addr));
+ } else
+ __free_page(cur_page);
}
}
ttm->state = tt_unpopulated;
diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index a4e9dcb..62ab09e 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -58,7 +58,6 @@
#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <linux/bitops.h>
-#include <linux/ethtool.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/unaligned.h>
@@ -173,8 +172,6 @@
struct net_device *dev);
static void ether1394_iso(struct hpsb_iso *iso);
-static const struct ethtool_ops ethtool_ops;
-
static int ether1394_write(struct hpsb_host *host, int srcid, int destid,
quadlet_t *data, u64 addr, size_t len, u16 flags);
static void ether1394_add_host(struct hpsb_host *host);
@@ -525,8 +522,6 @@
dev->header_ops = &ether1394_header_ops;
dev->netdev_ops = &ether1394_netdev_ops;
- SET_ETHTOOL_OPS(dev, &ethtool_ops);
-
dev->watchdog_timeo = ETHER1394_TIMEOUT;
dev->flags = IFF_BROADCAST | IFF_MULTICAST;
dev->features = NETIF_F_HIGHDMA;
@@ -1698,17 +1693,6 @@
return NETDEV_TX_OK;
}
-static void ether1394_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, driver_name);
- strcpy(info->bus_info, "ieee1394"); /* FIXME provide more detail? */
-}
-
-static const struct ethtool_ops ethtool_ops = {
- .get_drvinfo = ether1394_get_drvinfo
-};
-
static int __init ether1394_init_module(void)
{
int err;
diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
index 480d5d2..546838f 100644
--- a/drivers/input/xen-kbdfront.c
+++ b/drivers/input/xen-kbdfront.c
@@ -21,7 +21,10 @@
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/input.h>
+
#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/interface/io/fbif.h>
@@ -286,6 +289,8 @@
switch (backend_state) {
case XenbusStateInitialising:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
case XenbusStateUnknown:
case XenbusStateClosed:
break;
@@ -348,7 +353,7 @@
static int __init xenkbd_init(void)
{
- if (!xen_domain())
+ if (!xen_domain() || xen_hvm_domain())
return -ENODEV;
/* Nothing to do if running in dom0. */
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b2f71f7..b7feb84 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2787,6 +2787,7 @@
config XEN_NETDEV_FRONTEND
tristate "Xen network device frontend driver"
depends on XEN
+ select XEN_XENBUS_FRONTEND
default y
help
The network device frontend driver allows the kernel to
diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c
index 406f064..c063b53 100644
--- a/drivers/net/bmac.c
+++ b/drivers/net/bmac.c
@@ -1236,15 +1236,8 @@
}
spin_unlock_irqrestore(&bp->lock, flags);
}
-static void bmac_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
-{
- struct bmac_data *bp = netdev_priv(dev);
- strcpy(info->driver, "bmac");
- strcpy(info->bus_info, dev_name(&bp->mdev->ofdev.dev));
-}
static const struct ethtool_ops bmac_ethtool_ops = {
- .get_drvinfo = bmac_get_drvinfo,
.get_link = ethtool_op_get_link,
};
diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c
index 66dace6..8238fa2 100644
--- a/drivers/net/fec_mpc52xx.c
+++ b/drivers/net/fec_mpc52xx.c
@@ -772,11 +772,6 @@
/* ethtool interface */
-static void mpc52xx_fec_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, DRIVER_NAME);
-}
static int mpc52xx_fec_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
@@ -811,7 +806,6 @@
}
static const struct ethtool_ops mpc52xx_fec_ethtool_ops = {
- .get_drvinfo = mpc52xx_fec_get_drvinfo,
.get_settings = mpc52xx_fec_get_settings,
.set_settings = mpc52xx_fec_set_settings,
.get_link = ethtool_op_get_link,
diff --git a/drivers/net/pasemi_mac_ethtool.c b/drivers/net/pasemi_mac_ethtool.c
index 28a8622..29ff9ad 100644
--- a/drivers/net/pasemi_mac_ethtool.c
+++ b/drivers/net/pasemi_mac_ethtool.c
@@ -77,21 +77,6 @@
return phy_ethtool_gset(phydev, cmd);
}
-static void
-pasemi_mac_ethtool_get_drvinfo(struct net_device *netdev,
- struct ethtool_drvinfo *drvinfo)
-{
- struct pasemi_mac *mac;
- mac = netdev_priv(netdev);
-
- /* clear and fill out info */
- memset(drvinfo, 0, sizeof(struct ethtool_drvinfo));
- strncpy(drvinfo->driver, "pasemi_mac", 12);
- strcpy(drvinfo->version, "N/A");
- strcpy(drvinfo->fw_version, "N/A");
- strncpy(drvinfo->bus_info, pci_name(mac->pdev), 32);
-}
-
static u32
pasemi_mac_ethtool_get_msglevel(struct net_device *netdev)
{
@@ -150,7 +135,6 @@
const struct ethtool_ops pasemi_mac_ethtool_ops = {
.get_settings = pasemi_mac_ethtool_get_settings,
- .get_drvinfo = pasemi_mac_ethtool_get_drvinfo,
.get_msglevel = pasemi_mac_ethtool_get_msglevel,
.set_msglevel = pasemi_mac_ethtool_set_msglevel,
.get_link = ethtool_op_get_link,
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index b58965a..7f9a4f4 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -83,7 +83,6 @@
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/ioport.h>
-#include <linux/ethtool.h>
#include <linux/bitops.h>
#include <linux/mii.h>
@@ -249,7 +248,6 @@
static int el3_close(struct net_device *dev);
static void el3_tx_timeout(struct net_device *dev);
static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-static const struct ethtool_ops netdev_ethtool_ops;
static void set_rx_mode(struct net_device *dev);
static void set_multicast_list(struct net_device *dev);
@@ -300,7 +298,6 @@
link->conf.ConfigIndex = 1;
dev->netdev_ops = &el3_netdev_ops;
- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;
return tc574_config(link);
@@ -1083,16 +1080,6 @@
return worklimit;
}
-static void netdev_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, "3c574_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
- .get_drvinfo = netdev_get_drvinfo,
-};
-
/* Provide ioctl() calls to examine the MII xcvr state. */
static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 3131a59..40e5e7c 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -33,7 +33,6 @@
#include <linux/timer.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
-#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/crc32.h>
@@ -98,7 +97,6 @@
static struct net_device_stats *get_stats(struct net_device *dev);
static void set_multicast_list(struct net_device *dev);
static void axnet_tx_timeout(struct net_device *dev);
-static const struct ethtool_ops netdev_ethtool_ops;
static irqreturn_t ei_irq_wrapper(int irq, void *dev_id);
static void ei_watchdog(u_long arg);
static void axnet_reset_8390(struct net_device *dev);
@@ -186,7 +184,6 @@
dev->netdev_ops = &axnet_netdev_ops;
- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;
return axnet_config(link);
@@ -683,16 +680,6 @@
add_timer(&info->watchdog);
}
-static void netdev_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, "axnet_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
- .get_drvinfo = netdev_get_drvinfo,
-};
-
/*====================================================================*/
static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c
index 06618af..db0c890 100644
--- a/drivers/net/pcmcia/ibmtr_cs.c
+++ b/drivers/net/pcmcia/ibmtr_cs.c
@@ -52,7 +52,6 @@
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/module.h>
-#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/trdevice.h>
#include <linux/ibmtr.h>
@@ -120,16 +119,6 @@
struct tok_info *ti;
} ibmtr_dev_t;
-static void netdev_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, "ibmtr_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
- .get_drvinfo = netdev_get_drvinfo,
-};
-
/*======================================================================
ibmtr_attach() creates an "instance" of the driver, allocating
@@ -170,8 +159,6 @@
link->irq.Instance = info->dev = dev;
- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
-
return ibmtr_config(link);
} /* ibmtr_attach */
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 4696844..02e2d05 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -36,7 +36,6 @@
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/delay.h>
-#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/log2.h>
#include <linux/etherdevice.h>
@@ -111,7 +110,6 @@
static int pcnet_open(struct net_device *dev);
static int pcnet_close(struct net_device *dev);
static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-static const struct ethtool_ops netdev_ethtool_ops;
static irqreturn_t ei_irq_wrapper(int irq, void *dev_id);
static void ei_watchdog(u_long arg);
static void pcnet_reset_8390(struct net_device *dev);
@@ -654,8 +652,6 @@
ei_status.word16 = 1;
ei_status.reset_8390 = &pcnet_reset_8390;
- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
-
if (info->flags & (IS_DL10019|IS_DL10022))
mii_phy_probe(dev);
@@ -1175,18 +1171,6 @@
/*====================================================================*/
-static void netdev_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, "pcnet_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
- .get_drvinfo = netdev_get_drvinfo,
-};
-
-/*====================================================================*/
-
static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c
index 8d60300..0926832 100644
--- a/drivers/net/sc92031.c
+++ b/drivers/net/sc92031.c
@@ -1255,16 +1255,6 @@
return 0;
}
-static void sc92031_ethtool_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *drvinfo)
-{
- struct sc92031_priv *priv = netdev_priv(dev);
- struct pci_dev *pdev = priv->pdev;
-
- strcpy(drvinfo->driver, SC92031_NAME);
- strcpy(drvinfo->bus_info, pci_name(pdev));
-}
-
static void sc92031_ethtool_get_wol(struct net_device *dev,
struct ethtool_wolinfo *wolinfo)
{
@@ -1386,7 +1376,6 @@
static const struct ethtool_ops sc92031_ethtool_ops = {
.get_settings = sc92031_ethtool_get_settings,
.set_settings = sc92031_ethtool_set_settings,
- .get_drvinfo = sc92031_ethtool_get_drvinfo,
.get_wol = sc92031_ethtool_get_wol,
.set_wol = sc92031_ethtool_set_wol,
.nway_reset = sc92031_ethtool_nway_reset,
diff --git a/drivers/net/tulip/xircom_cb.c b/drivers/net/tulip/xircom_cb.c
index 0f2ca598..44159be 100644
--- a/drivers/net/tulip/xircom_cb.c
+++ b/drivers/net/tulip/xircom_cb.c
@@ -27,7 +27,6 @@
#include <linux/skbuff.h>
#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/ethtool.h>
#include <linux/bitops.h>
#include <asm/uaccess.h>
@@ -179,19 +178,6 @@
}
#endif
-static void netdev_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- struct xircom_private *private = netdev_priv(dev);
-
- strcpy(info->driver, "xircom_cb");
- strcpy(info->bus_info, pci_name(private->pdev));
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
- .get_drvinfo = netdev_get_drvinfo,
-};
-
static const struct net_device_ops netdev_ops = {
.ndo_open = xircom_open,
.ndo_stop = xircom_close,
@@ -277,7 +263,6 @@
setup_descriptors(private);
dev->netdev_ops = &netdev_ops;
- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
pci_set_drvdata(pdev, dev);
if (register_netdev(dev)) {
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index f450bc9..2109514 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -820,17 +820,7 @@
return NETDEV_TX_OK;
}
-static void hso_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info)
-{
- struct hso_net *odev = netdev_priv(net);
-
- strncpy(info->driver, driver_name, ETHTOOL_BUSINFO_LEN);
- strncpy(info->version, DRIVER_VERSION, ETHTOOL_BUSINFO_LEN);
- usb_make_path(odev->parent->usb, info->bus_info, sizeof info->bus_info);
-}
-
static const struct ethtool_ops ops = {
- .get_drvinfo = hso_get_drvinfo,
.get_link = ethtool_op_get_link
};
diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c
index e391ef9..47d1926 100644
--- a/drivers/net/usb/kaweth.c
+++ b/drivers/net/usb/kaweth.c
@@ -767,14 +767,6 @@
return 0;
}
-static void kaweth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
-{
- struct kaweth_device *kaweth = netdev_priv(dev);
-
- strlcpy(info->driver, driver_name, sizeof(info->driver));
- usb_make_path(kaweth->dev, info->bus_info, sizeof (info->bus_info));
-}
-
static u32 kaweth_get_link(struct net_device *dev)
{
struct kaweth_device *kaweth = netdev_priv(dev);
@@ -783,7 +775,6 @@
}
static const struct ethtool_ops ops = {
- .get_drvinfo = kaweth_get_drvinfo,
.get_link = kaweth_get_link
};
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 1c88c2e..2e65100 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -44,7 +44,6 @@
#include <linux/if_arp.h>
#include <linux/ioport.h>
#include <linux/skbuff.h>
-#include <linux/ethtool.h>
#include <linux/ieee80211.h>
#include <pcmcia/cs_types.h>
@@ -101,8 +100,6 @@
static struct net_device_stats *ray_get_stats(struct net_device *dev);
static int ray_dev_init(struct net_device *dev);
-static const struct ethtool_ops netdev_ethtool_ops;
-
static int ray_open(struct net_device *dev);
static netdev_tx_t ray_dev_start_xmit(struct sk_buff *skb,
struct net_device *dev);
@@ -362,7 +359,6 @@
/* Raylink entries in the device structure */
dev->netdev_ops = &ray_netdev_ops;
- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->wireless_handlers = &ray_handler_def;
#ifdef WIRELESS_SPY
local->wireless_data.spy_data = &local->spy_data;
@@ -1106,18 +1102,6 @@
}
} /* end encapsulate_frame */
-/*===========================================================================*/
-
-static void netdev_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *info)
-{
- strcpy(info->driver, "ray_cs");
-}
-
-static const struct ethtool_ops netdev_ethtool_ops = {
- .get_drvinfo = netdev_get_drvinfo,
-};
-
/*====================================================================*/
/*------------------------------------------------------------------*/
diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c
index 4f1e0cf..22b2b43 100644
--- a/drivers/net/wireless/wl3501_cs.c
+++ b/drivers/net/wireless/wl3501_cs.c
@@ -29,7 +29,6 @@
#include <linux/delay.h>
#include <linux/types.h>
-#include <linux/ethtool.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/in.h>
@@ -1436,15 +1435,6 @@
return wstats;
}
-static void wl3501_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
-{
- strlcpy(info->driver, wl3501_dev_info, sizeof(info->driver));
-}
-
-static const struct ethtool_ops ops = {
- .get_drvinfo = wl3501_get_drvinfo
-};
-
/**
* wl3501_detach - deletes a driver "instance"
* @link - FILL_IN
@@ -1936,7 +1926,6 @@
this->p_dev = p_dev;
dev->wireless_data = &this->wireless_data;
dev->wireless_handlers = &wl3501_handler_def;
- SET_ETHTOOL_OPS(dev, &ops);
netif_stop_queue(dev);
p_dev->priv = p_dev->irq.Instance = dev;
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 1a11d95..3f71199 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -42,6 +42,7 @@
#include <linux/mm.h>
#include <net/ip.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
@@ -53,19 +54,36 @@
static const struct ethtool_ops xennet_ethtool_ops;
+static int use_smartpoll = 0;
+module_param(use_smartpoll, int, 0600);
+MODULE_PARM_DESC(use_smartpoll, "Use smartpoll mechanism if available");
+
struct netfront_cb {
struct page *page;
unsigned offset;
};
+#define MICRO_SECOND 1000000UL
+#define NANO_SECOND 1000000000UL
+#define DEFAULT_SMART_POLL_FREQ 1000UL
+
+struct netfront_smart_poll {
+ struct hrtimer timer;
+ struct net_device *netdev;
+ unsigned int smart_poll_freq;
+ unsigned int feature_smart_poll;
+ unsigned int active;
+ unsigned long counter;
+};
+
#define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb))
#define RX_COPY_THRESHOLD 256
#define GRANT_INVALID_REF 0
-#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
-#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
+#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
struct netfront_info {
@@ -104,7 +122,7 @@
/* Receive-ring batched refills. */
#define RX_MIN_TARGET 8
-#define RX_DFL_MIN_TARGET 64
+#define RX_DFL_MIN_TARGET 80
#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
unsigned rx_min_target, rx_max_target, rx_target;
struct sk_buff_head rx_batch;
@@ -118,6 +136,8 @@
unsigned long rx_pfn_array[NET_RX_RING_SIZE];
struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+
+ struct netfront_smart_poll smart_poll;
};
struct netfront_rx_info {
@@ -337,15 +357,17 @@
return 0;
}
-static void xennet_tx_buf_gc(struct net_device *dev)
+static int xennet_tx_buf_gc(struct net_device *dev)
{
RING_IDX cons, prod;
+ RING_IDX cons_begin, cons_end;
unsigned short id;
struct netfront_info *np = netdev_priv(dev);
struct sk_buff *skb;
BUG_ON(!netif_carrier_ok(dev));
+ cons_begin = np->tx.rsp_cons;
do {
prod = np->tx.sring->rsp_prod;
rmb(); /* Ensure we see responses up to 'rp'. */
@@ -390,7 +412,11 @@
mb(); /* update shared area */
} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
+ cons_end = np->tx.rsp_cons;
+
xennet_maybe_wake_tx(dev);
+
+ return (cons_begin == cons_end);
}
static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
@@ -1267,6 +1293,14 @@
info->rx.sring = NULL;
}
+static int netfront_suspend(struct xenbus_device *dev, pm_message_t state)
+{
+ struct netfront_info *info = dev_get_drvdata(&dev->dev);
+ struct hrtimer *timer = &info->smart_poll.timer;
+ hrtimer_cancel(timer);
+ return 0;
+}
+
/**
* We are reconnecting to the backend, due to a suspend/resume, or a backend
* driver restart. We tear down our netif structure and recreate it, but
@@ -1305,6 +1339,59 @@
return 0;
}
+static enum hrtimer_restart smart_poll_function(struct hrtimer *timer)
+{
+ struct netfront_smart_poll *psmart_poll;
+ struct net_device *dev;
+ struct netfront_info *np;
+ unsigned long flags;
+ unsigned int tx_active = 0, rx_active = 0;
+
+ psmart_poll = container_of(timer, struct netfront_smart_poll, timer);
+ dev = psmart_poll->netdev;
+ np = netdev_priv(dev);
+
+ spin_lock_irqsave(&np->tx_lock, flags);
+
+ if (!np->rx.sring)
+ goto end;
+
+ np->smart_poll.counter++;
+
+ if (likely(netif_carrier_ok(dev))) {
+ tx_active = !(xennet_tx_buf_gc(dev));
+ /* Under tx_lock: protects access to rx shared-ring indexes. */
+ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
+ rx_active = 1;
+ napi_schedule(&np->napi);
+ }
+ }
+
+ np->smart_poll.active |= (tx_active || rx_active);
+ if (np->smart_poll.counter %
+ (np->smart_poll.smart_poll_freq / 10) == 0) {
+ if (!np->smart_poll.active) {
+ np->rx.sring->private.netif.smartpoll_active = 0;
+ goto end;
+ }
+ np->smart_poll.active = 0;
+ }
+
+ if (np->rx.sring->private.netif.smartpoll_active) {
+ if (hrtimer_start(timer,
+ ktime_set(0, NANO_SECOND/psmart_poll->smart_poll_freq),
+ HRTIMER_MODE_REL)) {
+ printk(KERN_DEBUG "Failed to start hrtimer, "
+ "use interrupt mode for this packet\n");
+ np->rx.sring->private.netif.smartpoll_active = 0;
+ }
+ }
+
+end:
+ spin_unlock_irqrestore(&np->tx_lock, flags);
+ return HRTIMER_NORESTART;
+}
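The re-arm period follows directly from the constants above; a worked
example at the default polling frequency:

	period = NANO_SECOND / smart_poll_freq
	       = 1000000000 / 1000 (DEFAULT_SMART_POLL_FREQ)
	       = 1000000 ns = 1 ms

so while smartpoll_active is set, the frontend polls the shared rings at
roughly 1 kHz instead of taking an event-channel interrupt per packet.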
+
static irqreturn_t xennet_interrupt(int irq, void *dev_id)
{
struct net_device *dev = dev_id;
@@ -1320,6 +1407,16 @@
napi_schedule(&np->napi);
}
+ if (np->smart_poll.feature_smart_poll) {
+ if (hrtimer_start(&np->smart_poll.timer,
+ ktime_set(0, NANO_SECOND/np->smart_poll.smart_poll_freq),
+ HRTIMER_MODE_REL)) {
+ printk(KERN_DEBUG "Failed to start hrtimer, "
+ "use interrupt mode for this packet\n");
+ np->rx.sring->private.netif.smartpoll_active = 0;
+ }
+ }
+
spin_unlock_irqrestore(&np->tx_lock, flags);
return IRQ_HANDLED;
@@ -1393,7 +1490,7 @@
}
/* Common code used when first setting up, and when resuming. */
-static int talk_to_backend(struct xenbus_device *dev,
+static int talk_to_netback(struct xenbus_device *dev,
struct netfront_info *info)
{
const char *message;
@@ -1456,6 +1553,12 @@
goto abort_transaction;
}
+ err = xenbus_printf(xbt, dev->nodename, "feature-smart-poll", "%d", use_smartpoll);
+ if (err) {
+ message = "writing feature-smart-poll";
+ goto abort_transaction;
+ }
+
err = xenbus_transaction_end(xbt, 0);
if (err) {
if (err == -EAGAIN)
@@ -1543,7 +1646,26 @@
return -ENODEV;
}
- err = talk_to_backend(np->xbdev, np);
+ np->smart_poll.feature_smart_poll = 0;
+ if (use_smartpoll) {
+ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+ "feature-smart-poll", "%u",
+ &np->smart_poll.feature_smart_poll);
+ if (err != 1)
+ np->smart_poll.feature_smart_poll = 0;
+ }
+
+ hrtimer_init(&np->smart_poll.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ if (np->smart_poll.feature_smart_poll) {
+ np->smart_poll.timer.function = smart_poll_function;
+ np->smart_poll.netdev = dev;
+ np->smart_poll.smart_poll_freq = DEFAULT_SMART_POLL_FREQ;
+ np->smart_poll.active = 0;
+ np->smart_poll.counter = 0;
+ }
+
+ err = talk_to_netback(np->xbdev, np);
if (err)
return err;
@@ -1597,7 +1719,7 @@
/**
* Callback received when the backend's state changes.
*/
-static void backend_changed(struct xenbus_device *dev,
+static void netback_changed(struct xenbus_device *dev,
enum xenbus_state backend_state)
{
struct netfront_info *np = dev_get_drvdata(&dev->dev);
@@ -1608,6 +1730,8 @@
switch (backend_state) {
case XenbusStateInitialising:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
case XenbusStateConnected:
case XenbusStateUnknown:
case XenbusStateClosed:
@@ -1628,12 +1752,30 @@
}
}
+static int xennet_get_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *ec)
+{
+ struct netfront_info *np = netdev_priv(netdev);
+ ec->rx_coalesce_usecs = MICRO_SECOND / np->smart_poll.smart_poll_freq;
+ return 0;
+}
+
+static int xennet_set_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *ec)
+{
+ struct netfront_info *np = netdev_priv(netdev);
+
+ /* Reject a zero interval; it would divide by zero below. */
+ if (!ec->rx_coalesce_usecs)
+ return -EINVAL;
+
+ np->smart_poll.smart_poll_freq = MICRO_SECOND / ec->rx_coalesce_usecs;
+ return 0;
+}
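These hooks expose the polling frequency through the standard ethtool
coalescing interface; rx_coalesce_usecs and smart_poll_freq are simply
reciprocals of one another. For example (interface name hypothetical):

	# ethtool -C eth0 rx-usecs 500

stores smart_poll_freq = MICRO_SECOND / 500 = 2000, i.e. a 2 kHz poll
rate, and `ethtool -c eth0` reads it back as 1000000 / 2000 = 500 usecs.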
+
static const struct ethtool_ops xennet_ethtool_ops =
{
.set_tx_csum = ethtool_op_set_tx_csum,
.set_sg = xennet_set_sg,
.set_tso = xennet_set_tso,
.get_link = ethtool_op_get_link,
+ .get_coalesce = xennet_get_coalesce,
+ .set_coalesce = xennet_set_coalesce,
};
#ifdef CONFIG_SYSFS
@@ -1798,8 +1940,9 @@
.ids = netfront_ids,
.probe = netfront_probe,
.remove = __devexit_p(xennet_remove),
+ .suspend = netfront_suspend,
.resume = netfront_resume,
- .otherend_changed = backend_changed,
+ .otherend_changed = netback_changed,
};
static int __init netif_init(void)
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index fdc864f..7802fcd 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -51,6 +51,16 @@
When in doubt, say N.
+config XEN_PCIDEV_FRONTEND
+ tristate "Xen PCI Frontend"
+ depends on XEN && PCI && X86
+ select HOTPLUG
+ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ The PCI device frontend driver allows the kernel to import arbitrary
+ PCI devices from a PCI backend to support PCI driver domains.
+
config HT_IRQ
bool "Interrupts on hypertransport devices"
default y
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 4a7f11d..b70aa4d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,8 @@
# Build Intel IOMMU support
obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+# Build Xen IOMMU support
+obj-$(CONFIG_PCI_XEN) += xen-iommu.o
obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
obj-$(CONFIG_PCI_IOV) += iov.o
@@ -60,6 +62,8 @@
obj-$(CONFIG_PCI_STUB) += pci-stub.o
+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
+
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index cef28a7..1940183 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -249,6 +249,7 @@
up_read(&pci_bus_sem);
}
+EXPORT_SYMBOL_GPL(pci_walk_bus);
EXPORT_SYMBOL(pci_bus_alloc_resource);
EXPORT_SYMBOL_GPL(pci_bus_add_device);
EXPORT_SYMBOL(pci_bus_add_devices);
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 91d0390..24f6f28 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -673,10 +673,13 @@
"x2apic and Intr-remapping.\n");
#endif
#ifdef CONFIG_DMAR
- if (ret && !no_iommu && !iommu_detected && !swiotlb &&
- !dmar_disabled)
+ if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
iommu_detected = 1;
#endif
+#ifdef CONFIG_X86
+ if (ret)
+ x86_init.iommu.iommu_init = intel_iommu_init;
+#endif
}
early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
dmar_tbl = NULL;
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 73e7d8e..1ebd547 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -1018,6 +1018,13 @@
list_for_each_entry(func, &slot->funcs, sibling)
acpiphp_configure_ioapics(func->handle);
pci_enable_bridges(bus);
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ /* Assume that newly added devices are powered on already. */
+ if (!dev->is_added)
+ dev->current_state = PCI_D0;
+ }
+
pci_bus_add_devices(bus);
list_for_each (l, &slot->funcs) {
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 5b680df..02bf548 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3284,7 +3284,7 @@
* Check the need for DMA-remapping initialization now.
* Above initialization will also be used by Interrupt-remapping.
*/
- if (no_iommu || swiotlb || dmar_disabled)
+ if (no_iommu || dmar_disabled)
return -ENODEV;
iommu_init_mempool();
@@ -3305,7 +3305,9 @@
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
init_timer(&unmap_timer);
- force_iommu = 1;
+#ifdef CONFIG_SWIOTLB
+ swiotlb = 0;
+#endif
dma_ops = &intel_dma_ops;
init_iommu_sysfs();
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index e03fe98..f9db891 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -706,6 +706,21 @@
}
EXPORT_SYMBOL_GPL(pci_sriov_migration);
+/**
+ * pci_num_vf - return number of VFs associated with a PF
+ * @dev: the PCI device
+ *
+ * Returns number of VFs, or 0 if SR-IOV is not enabled.
+ */
+int pci_num_vf(struct pci_dev *dev)
+{
+ if (!dev || !dev->is_physfn)
+ return 0;
+ else
+ return dev->sriov->nr_virtfn;
+}
+EXPORT_SYMBOL_GPL(pci_num_vf);
+
static int ats_alloc_one(struct pci_dev *dev, int ps)
{
int pos;
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 0fb1d05..c7e8a69 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -19,6 +19,9 @@
#include <linux/errno.h>
#include <linux/io.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
#include "pci.h"
#include "msi.h"
@@ -391,6 +394,20 @@
void pci_restore_msi_state(struct pci_dev *dev)
{
+ if (xen_initial_domain()) {
+ struct physdev_restore_msi physdev;
+
+ if (!dev->msi_enabled && !dev->msix_enabled)
+ return;
+
+ pci_intx_for_msi(dev, 0);
+
+ physdev.bus = dev->bus->number;
+ physdev.devfn = dev->devfn;
+ HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
+
+ return;
+ }
__pci_restore_msi_state(dev);
__pci_restore_msix_state(dev);
}
diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
new file mode 100644
index 0000000..ac6bcdb
--- /dev/null
+++ b/drivers/pci/xen-iommu.c
@@ -0,0 +1,271 @@
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/scatterlist.h>
+#include <linux/io.h>
+#include <linux/bug.h>
+
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include <asm/iommu.h>
+#include <asm/swiotlb.h>
+#include <asm/tlbflush.h>
+
+#define IOMMU_BUG_ON(test) \
+do { \
+ if (unlikely(test)) { \
+ printk(KERN_ALERT "Fatal DMA error! " \
+ "Please use 'swiotlb=force'\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+/* Print address range with message */
+#define PAR(msg, addr, size) \
+do { \
+ printk(msg "[%#llx - %#llx]\n", \
+ (unsigned long long)addr, \
+ (unsigned long long)addr + size); \
+} while (0)
+
+static inline int address_needs_mapping(struct device *hwdev,
+ dma_addr_t addr)
+{
+ dma_addr_t mask = DMA_BIT_MASK(32);
+ int ret;
+
+ /* If the device has a mask, use it, otherwise default to 32 bits */
+ if (hwdev)
+ mask = *hwdev->dma_mask;
+
+ ret = (addr & ~mask) != 0;
+
+ if (ret) {
+ printk(KERN_ERR "dma address needs mapping\n");
+ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
+ }
+ return ret;
+}
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
+
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
+ }
+ return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+ unsigned long pfn = PFN_DOWN(p);
+ unsigned int offset = p & ~PAGE_MASK;
+
+ if (offset + size <= PAGE_SIZE)
+ return 0;
+ if (check_pages_physically_contiguous(pfn, offset, size))
+ return 0;
+ return 1;
+}
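A buffer only needs the contiguity check once it crosses a page boundary;
a worked example, assuming a 4 KiB PAGE_SIZE:

	/* p = 0x1ff0, size = 0x40:
	 *   pfn    = PFN_DOWN(0x1ff0)    = 1
	 *   offset = 0x1ff0 & ~PAGE_MASK = 0xff0
	 *   offset + size = 0x1030 > PAGE_SIZE, so the buffer spans
	 *   pfns 1 and 2 and straddles a machine page boundary unless
	 *   pfn_to_mfn(2) == pfn_to_mfn(1) + 1.
	 */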
+
+static inline void xen_dma_unmap_page(struct page *page)
+{
+ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
+ * to deal with foreign pages. We'll need similar logic here at
+ * some point.
+ */
+}
+
+/* Gets dma address of a page */
+static inline dma_addr_t xen_dma_map_page(struct page *page)
+{
+ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal
+ * with foreign pages. We'll need similar logic here at some
+ * point.
+ */
+ return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT;
+}
+
+static int xen_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *s;
+ struct page *page;
+ int i, rc;
+
+ BUG_ON(direction == DMA_NONE);
+ WARN_ON(nents == 0 || sg[0].length == 0);
+
+ for_each_sg(sg, s, nents, i) {
+ BUG_ON(!sg_page(s));
+ page = sg_page(s);
+ s->dma_address = xen_dma_map_page(page) + s->offset;
+ s->dma_length = s->length;
+ IOMMU_BUG_ON(range_straddles_page_boundary(
+ page_to_phys(page), s->length));
+ }
+
+ rc = nents;
+
+ flush_write_buffers();
+ return rc;
+}
+
+static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *s;
+ struct page *page;
+ int i;
+
+ for_each_sg(sg, s, nents, i) {
+ page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address)));
+ xen_dma_unmap_page(page);
+ }
+}
+
+static void *xen_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ void *ret;
+ unsigned int order = get_order(size);
+ unsigned long vstart;
+ u64 mask;
+
+ /* ignore region specifiers */
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
+ return ret;
+
+ if (dev == NULL || (dev->coherent_dma_mask < DMA_BIT_MASK(32)))
+ gfp |= GFP_DMA;
+
+ vstart = __get_free_pages(gfp, order);
+ ret = (void *)vstart;
+
+ if (dev != NULL && dev->coherent_dma_mask)
+ mask = dev->coherent_dma_mask;
+ else
+ mask = DMA_BIT_MASK(32);
+
+ if (ret != NULL) {
+ if (xen_create_contiguous_region(vstart, order,
+ fls64(mask)) != 0) {
+ free_pages(vstart, order);
+ return NULL;
+ }
+ memset(ret, 0, size);
+ *dma_handle = virt_to_machine(ret).maddr;
+ }
+ return ret;
+}
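Callers never invoke this directly; dma_alloc_coherent() dispatches through
the installed dma_ops. A usage sketch (the device pointer is illustrative):

	dma_addr_t bus;
	void *buf = dma_alloc_coherent(dev, PAGE_SIZE, &bus, GFP_KERNEL);
	/* With xen_dma_ops installed this lands in xen_alloc_coherent():
	 * the pages are exchanged for a machine-contiguous region via
	 * xen_create_contiguous_region() and 'bus' carries the machine
	 * address from virt_to_machine(). */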
+
+static void xen_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_addr)
+{
+ int order = get_order(size);
+
+ if (dma_release_from_coherent(dev, order, vaddr))
+ return;
+
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
+}
+
+static dma_addr_t xen_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ dma_addr_t dma;
+
+ BUG_ON(direction == DMA_NONE);
+
+ WARN_ON(size == 0);
+
+ dma = xen_dma_map_page(page) + offset;
+
+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
+ flush_write_buffers();
+ return dma;
+}
+
+static void xen_unmap_page(struct device *dev, dma_addr_t dma_addr,
+ size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ BUG_ON(direction == DMA_NONE);
+ xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr))));
+}
+
+static struct dma_map_ops xen_dma_ops = {
+ .dma_supported = NULL,
+
+ .alloc_coherent = xen_alloc_coherent,
+ .free_coherent = xen_free_coherent,
+
+ .map_page = xen_map_page,
+ .unmap_page = xen_unmap_page,
+
+ .map_sg = xen_map_sg,
+ .unmap_sg = xen_unmap_sg,
+
+ .mapping_error = NULL,
+
+ .is_phys = 0,
+};
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+ .dma_supported = swiotlb_dma_supported,
+
+ .alloc_coherent = xen_alloc_coherent,
+ .free_coherent = xen_free_coherent,
+
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
+
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
+
+ .mapping_error = swiotlb_dma_mapping_error,
+
+ .is_phys = 0,
+};
+
+void __init xen_iommu_init(void)
+{
+ if (!xen_pv_domain())
+ return;
+
+ printk(KERN_INFO "Xen: Initializing Xen DMA ops\n");
+
+ force_iommu = 0;
+ dma_ops = &xen_dma_ops;
+
+ if (swiotlb) {
+ printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n");
+ dma_ops = &xen_swiotlb_dma_ops;
+ }
+}
+
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
new file mode 100644
index 0000000..76d0bdd
--- /dev/null
+++ b/drivers/pci/xen-pcifront.c
@@ -0,0 +1,1157 @@
+/*
+ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/pciif.h>
+#include <asm/xen/pci.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/time.h>
+
+
+#ifndef __init_refok
+#define __init_refok
+#endif
+
+#define INVALID_GRANT_REF (0)
+#define INVALID_EVTCHN (-1)
+
+struct pci_bus_entry {
+ struct list_head list;
+ struct pci_bus *bus;
+};
+
+#define _PDEVB_op_active (0)
+#define PDEVB_op_active (1 << (_PDEVB_op_active))
+
+struct pcifront_device {
+ struct xenbus_device *xdev;
+ struct list_head root_buses;
+
+ int evtchn;
+ int gnt_ref;
+
+ int irq;
+
+ /* Lock this when doing any operations in sh_info */
+ spinlock_t sh_info_lock;
+ struct xen_pci_sharedinfo *sh_info;
+ struct work_struct op_work;
+ unsigned long flags;
+
+};
+
+struct pcifront_sd {
+ int domain;
+ struct pcifront_device *pdev;
+};
+
+static inline struct pcifront_device *
+pcifront_get_pdev(struct pcifront_sd *sd)
+{
+ return sd->pdev;
+}
+
+static inline void pcifront_init_sd(struct pcifront_sd *sd,
+ unsigned int domain, unsigned int bus,
+ struct pcifront_device *pdev)
+{
+ sd->domain = domain;
+ sd->pdev = pdev;
+}
+
+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
+ struct pcifront_sd *sd)
+{
+}
+
+
+DEFINE_SPINLOCK(pcifront_dev_lock);
+static struct pcifront_device *pcifront_dev;
+
+static int verbose_request;
+module_param(verbose_request, int, 0644);
+
+static int errno_to_pcibios_err(int errno)
+{
+ switch (errno) {
+ case XEN_PCI_ERR_success:
+ return PCIBIOS_SUCCESSFUL;
+
+ case XEN_PCI_ERR_dev_not_found:
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ case XEN_PCI_ERR_invalid_offset:
+ case XEN_PCI_ERR_op_failed:
+ return PCIBIOS_BAD_REGISTER_NUMBER;
+
+ case XEN_PCI_ERR_not_implemented:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+
+ case XEN_PCI_ERR_access_denied:
+ return PCIBIOS_SET_FAILED;
+ }
+ return errno;
+}
+
+static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev)
+{
+ if (test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+ && !test_and_set_bit(_PDEVB_op_active, &pdev->flags)) {
+ dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n");
+ schedule_work(&pdev->op_work);
+ }
+}
+
+static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
+{
+ int err = 0;
+ struct xen_pci_op *active_op = &pdev->sh_info->op;
+ unsigned long irq_flags;
+ evtchn_port_t port = pdev->evtchn;
+ unsigned irq = pdev->irq;
+ s64 ns, ns_timeout;
+ struct timeval tv;
+
+ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
+
+ memcpy(active_op, op, sizeof(struct xen_pci_op));
+
+ /* Go */
+ wmb();
+ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_evtchn(port);
+
+ /*
+ * We set a poll timeout of 3 seconds but give up on return after
+ * 2 seconds. It is better to time out too late rather than too early
+ * (in the latter case we end up continually re-executing poll() with a
+ * timeout in the past). 1s difference gives plenty of slack for error.
+ */
+ do_gettimeofday(&tv);
+ ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+
+ xen_clear_irq_pending(irq);
+
+ while (test_bit(_XEN_PCIF_active,
+ (unsigned long *)&pdev->sh_info->flags)) {
+ xen_poll_irq_timeout(irq, jiffies + 3*HZ);
+ xen_clear_irq_pending(irq);
+ do_gettimeofday(&tv);
+ ns = timeval_to_ns(&tv);
+ if (ns > ns_timeout) {
+ dev_err(&pdev->xdev->dev,
+ "pciback not responding!!!\n");
+ clear_bit(_XEN_PCIF_active,
+ (unsigned long *)&pdev->sh_info->flags);
+ err = XEN_PCI_ERR_dev_not_found;
+ goto out;
+ }
+ }
+
+ /*
+	 * We might lose a backend service request since we reuse the same
+	 * evtchn for pci_conf backend responses, so reschedule the aer
+	 * pcifront service.
+ */
+ if (test_bit(_XEN_PCIB_active,
+ (unsigned long *)&pdev->sh_info->flags)) {
+ dev_err(&pdev->xdev->dev,
+ "schedule aer pcifront service\n");
+ schedule_pcifront_aer_op(pdev);
+ }
+
+ memcpy(op, active_op, sizeof(struct xen_pci_op));
+
+ err = op->err;
+out:
+ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
+ return err;
+}
+
+/* Access to this function is spinlocked in drivers/pci/access.c */
+static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 *val)
+{
+ int err = 0;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_conf_read,
+ .domain = pci_domain_nr(bus),
+ .bus = bus->number,
+ .devfn = devfn,
+ .offset = where,
+ .size = size,
+ };
+ struct pcifront_sd *sd = bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ if (verbose_request)
+ dev_info(&pdev->xdev->dev,
+ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
+ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
+ PCI_FUNC(devfn), where, size);
+
+ err = do_pci_op(pdev, &op);
+
+ if (likely(!err)) {
+ if (verbose_request)
+ dev_info(&pdev->xdev->dev, "read got back value %x\n",
+ op.value);
+
+ *val = op.value;
+ } else if (err == -ENODEV) {
+ /* No device here, pretend that it just returned 0 */
+ err = 0;
+ *val = 0;
+ }
+
+ return errno_to_pcibios_err(err);
+}
+
+/* Access to this function is spinlocked in drivers/pci/access.c */
+static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_conf_write,
+ .domain = pci_domain_nr(bus),
+ .bus = bus->number,
+ .devfn = devfn,
+ .offset = where,
+ .size = size,
+ .value = val,
+ };
+ struct pcifront_sd *sd = bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ if (verbose_request)
+ dev_info(&pdev->xdev->dev,
+ "write dev=%04x:%02x:%02x.%01x - "
+ "offset %x size %d val %x\n",
+ pci_domain_nr(bus), bus->number,
+ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
+
+ return errno_to_pcibios_err(do_pci_op(pdev, &op));
+}
+
+struct pci_ops pcifront_bus_ops = {
+ .read = pcifront_bus_read,
+ .write = pcifront_bus_write,
+};
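As a usage sketch (assuming the generic PCI accessors of this kernel; the wrapper name is invented): the PCI core reaches the ops above through pci_bus_read_config_dword() and friends, so a config read on a pcifront bus flows pcifront_bus_read() -> do_pci_op() over the shared ring.

/* Sketch: read the vendor/device ID dword of devfn 0 on bus b. */
static int sketch_read_vendor(struct pci_bus *b, u32 *id)
{
	/* Routed via b->ops == &pcifront_bus_ops */
	return pci_bus_read_config_dword(b, PCI_DEVFN(0, 0),
					 PCI_VENDOR_ID, id);
}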
+
+#ifdef CONFIG_PCI_MSI
+static int pci_frontend_enable_msix(struct pci_dev *dev,
+ int **vector, int nvec)
+{
+ int err;
+ int i;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_enable_msix,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ .value = nvec,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+ struct msi_desc *entry;
+
+ if (nvec > SH_INFO_MAX_VEC) {
+		dev_err(&dev->dev, "too many vectors for pci frontend: %x."
+ " Increase SH_INFO_MAX_VEC.\n", nvec);
+ return -EINVAL;
+ }
+
+ i = 0;
+ list_for_each_entry(entry, &dev->msi_list, list) {
+ op.msix_entries[i].entry = entry->msi_attrib.entry_nr;
+ /* Vector is useless at this point. */
+ op.msix_entries[i].vector = -1;
+ i++;
+ }
+
+ err = do_pci_op(pdev, &op);
+
+ if (likely(!err)) {
+ if (likely(!op.value)) {
+ /* we get the result */
+ for (i = 0; i < nvec; i++)
+ *(*vector+i) = op.msix_entries[i].vector;
+ return 0;
+ } else {
+ printk(KERN_DEBUG "enable msix get value %x\n",
+ op.value);
+ return op.value;
+ }
+ } else {
+ dev_err(&dev->dev, "enable msix get err %x\n", err);
+ return err;
+ }
+}
+
+static void pci_frontend_disable_msix(struct pci_dev *dev)
+{
+ int err;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_disable_msix,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ err = do_pci_op(pdev, &op);
+
+	/* What should we do on error? */
+ if (err)
+ dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
+}
+
+static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+{
+ int err;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_enable_msi,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ err = do_pci_op(pdev, &op);
+ if (likely(!err)) {
+ *(*vector) = op.value;
+ } else {
+ dev_err(&dev->dev, "pci frontend enable msi failed for dev "
+			"%x:%x\n", op.bus, op.devfn);
+ err = -EINVAL;
+ }
+ return err;
+}
+
+static void pci_frontend_disable_msi(struct pci_dev *dev)
+{
+ int err;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_disable_msi,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ err = do_pci_op(pdev, &op);
+ if (err == XEN_PCI_ERR_dev_not_found) {
+ /* XXX No response from backend, what shall we do? */
+		printk(KERN_DEBUG "got no response from backend for disable MSI\n");
+ return;
+ }
+ if (err)
+		/* how should pciback notify us of a failure? */
+		printk(KERN_DEBUG "got fake response from backend\n");
+}
+
+static struct xen_pci_frontend_ops pci_frontend_ops = {
+ .enable_msi = pci_frontend_enable_msi,
+ .disable_msi = pci_frontend_disable_msi,
+ .enable_msix = pci_frontend_enable_msix,
+ .disable_msix = pci_frontend_disable_msix,
+};
+
+static void pci_frontend_registrar(int enable)
+{
+ if (enable)
+ xen_pci_frontend = &pci_frontend_ops;
+ else
+ xen_pci_frontend = NULL;
+}
+#else
+static inline void pci_frontend_registrar(int enable) { }
+#endif /* CONFIG_PCI_MSI */
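For context, a hedged sketch of the consumer side: the arch MSI setup code is expected to dispatch through the xen_pci_frontend hook registered by pci_frontend_registrar() above (the hook is declared in asm/xen/pci.h in this series; the caller shown here is illustrative, not from this hunk).

static int sketch_setup_msi(struct pci_dev *dev, int *vector)
{
	if (xen_pci_frontend && xen_pci_frontend->enable_msi)
		return xen_pci_frontend->enable_msi(dev, &vector);
	return -ENODEV;
}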
+
+/* Claim resources for the PCI frontend as-is, backend won't allow changes */
+static int pcifront_claim_resource(struct pci_dev *dev, void *data)
+{
+ struct pcifront_device *pdev = data;
+ int i;
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &dev->resource[i];
+
+ if (!r->parent && r->start && r->flags) {
+ dev_info(&pdev->xdev->dev, "claiming resource %s/%d\n",
+ pci_name(dev), i);
+ if (pci_claim_resource(dev, i)) {
+ dev_err(&pdev->xdev->dev, "Could not claim "
+ "resource %s/%d! Device offline. Try "
+ "giving less than 4GB to domain.\n",
+ pci_name(dev), i);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int __devinit pcifront_scan_bus(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus,
+ struct pci_bus *b)
+{
+ struct pci_dev *d;
+ unsigned int devfn;
+
+	/* Scan the bus for functions and add them.
+ * We omit handling of PCI bridge attachment because pciback prevents
+ * bridges from being exported.
+ */
+ for (devfn = 0; devfn < 0x100; devfn++) {
+ d = pci_get_slot(b, devfn);
+ if (d) {
+ /* Device is already known. */
+ pci_dev_put(d);
+ continue;
+ }
+
+ d = pci_scan_single_device(b, devfn);
+ if (d)
+ dev_info(&pdev->xdev->dev, "New device on "
+ "%04x:%02x:%02x.%02x found.\n", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ }
+
+ return 0;
+}
+
+int __devinit pcifront_scan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ struct pci_bus *b;
+ struct pcifront_sd *sd = NULL;
+ struct pci_bus_entry *bus_entry = NULL;
+ int err = 0;
+
+#ifndef CONFIG_PCI_DOMAINS
+ if (domain != 0) {
+ dev_err(&pdev->xdev->dev,
+ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
+ dev_err(&pdev->xdev->dev,
+ "Please compile with CONFIG_PCI_DOMAINS\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+#endif
+
+ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
+ domain, bus);
+
+ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
+ sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+ if (!bus_entry || !sd) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ pcifront_init_sd(sd, domain, bus, pdev);
+
+ b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
+ &pcifront_bus_ops, sd);
+ if (!b) {
+ dev_err(&pdev->xdev->dev,
+ "Error creating PCI Frontend Bus!\n");
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ pcifront_setup_root_resources(b, sd);
+ bus_entry->bus = b;
+
+ list_add(&bus_entry->list, &pdev->root_buses);
+
+	/* pci_scan_bus_parented() skips devices which do not have a
+	 * devfn == 0. pcifront_scan_bus() enumerates all devfns. */
+ err = pcifront_scan_bus(pdev, domain, bus, b);
+
+ /* Claim resources before going "live" with our devices */
+ pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+ /* Create SysFS and notify udev of the devices. Aka: "going live" */
+ pci_bus_add_devices(b);
+
+ return err;
+
+err_out:
+ kfree(bus_entry);
+ kfree(sd);
+
+ return err;
+}
+
+int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ int err;
+ struct pci_bus *b;
+
+#ifndef CONFIG_PCI_DOMAINS
+ if (domain != 0) {
+ dev_err(&pdev->xdev->dev,
+ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
+ dev_err(&pdev->xdev->dev,
+ "Please compile with CONFIG_PCI_DOMAINS\n");
+ return -EINVAL;
+ }
+#endif
+
+ dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
+ domain, bus);
+
+ b = pci_find_bus(domain, bus);
+ if (!b)
+ /* If the bus is unknown, create it. */
+ return pcifront_scan_root(pdev, domain, bus);
+
+ err = pcifront_scan_bus(pdev, domain, bus, b);
+
+ /* Claim resources before going "live" with our devices */
+ pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+ /* Create SysFS and notify udev of the devices. Aka: "going live" */
+ pci_bus_add_devices(b);
+
+ return err;
+}
+
+static void free_root_bus_devs(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ while (!list_empty(&bus->devices)) {
+ dev = container_of(bus->devices.next, struct pci_dev,
+ bus_list);
+ dev_dbg(&dev->dev, "removing device\n");
+ pci_remove_bus_device(dev);
+ }
+}
+
+void pcifront_free_roots(struct pcifront_device *pdev)
+{
+ struct pci_bus_entry *bus_entry, *t;
+
+ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
+
+ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
+ list_del(&bus_entry->list);
+
+ free_root_bus_devs(bus_entry->bus);
+
+ kfree(bus_entry->bus->sysdata);
+
+ device_unregister(bus_entry->bus->bridge);
+ pci_remove_bus(bus_entry->bus);
+
+ kfree(bus_entry);
+ }
+}
+
+static pci_ers_result_t pcifront_common_process(int cmd,
+ struct pcifront_device *pdev,
+ pci_channel_state_t state)
+{
+ pci_ers_result_t result;
+ struct pci_driver *pdrv;
+ int bus = pdev->sh_info->aer_op.bus;
+ int devfn = pdev->sh_info->aer_op.devfn;
+ struct pci_dev *pcidev;
+ int flag = 0;
+
+ dev_dbg(&pdev->xdev->dev,
+ "pcifront AER process: cmd %x (bus:%x, devfn%x)",
+ cmd, bus, devfn);
+ result = PCI_ERS_RESULT_NONE;
+
+ pcidev = pci_get_bus_and_slot(bus, devfn);
+ if (!pcidev || !pcidev->driver) {
+		dev_err(&pdev->xdev->dev,
+ "device or driver is NULL\n");
+ return result;
+ }
+ pdrv = pcidev->driver;
+
+ if (get_driver(&pdrv->driver)) {
+ if (pdrv->err_handler && pdrv->err_handler->error_detected) {
+ dev_dbg(&pcidev->dev,
+ "trying to call AER service\n");
+ if (pcidev) {
+ flag = 1;
+ switch (cmd) {
+ case XEN_PCI_OP_aer_detected:
+ result = pdrv->err_handler->
+ error_detected(pcidev, state);
+ break;
+ case XEN_PCI_OP_aer_mmio:
+ result = pdrv->err_handler->
+ mmio_enabled(pcidev);
+ break;
+ case XEN_PCI_OP_aer_slotreset:
+ result = pdrv->err_handler->
+ slot_reset(pcidev);
+ break;
+ case XEN_PCI_OP_aer_resume:
+ pdrv->err_handler->resume(pcidev);
+ break;
+ default:
+ dev_err(&pdev->xdev->dev,
+ "bad request in aer recovery "
+ "operation!\n");
+
+ }
+ }
+ }
+ put_driver(&pdrv->driver);
+ }
+ if (!flag)
+ result = PCI_ERS_RESULT_NONE;
+
+ return result;
+}
+
+
+void pcifront_do_aer(struct work_struct *data)
+{
+ struct pcifront_device *pdev =
+ container_of(data, struct pcifront_device, op_work);
+ int cmd = pdev->sh_info->aer_op.cmd;
+ pci_channel_state_t state =
+ (pci_channel_state_t)pdev->sh_info->aer_op.err;
+
+	/* If a pci_conf op is in progress, we have to wait until it is
+	 * done before servicing the aer op. */
+ dev_dbg(&pdev->xdev->dev,
+ "pcifront service aer bus %x devfn %x\n",
+ pdev->sh_info->aer_op.bus, pdev->sh_info->aer_op.devfn);
+
+ pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
+
+ wmb();
+ clear_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_evtchn(pdev->evtchn);
+
+	/* in case we lost an aer request in the small window above */
+ smp_mb__before_clear_bit();
+ clear_bit(_PDEVB_op_active, &pdev->flags);
+ smp_mb__after_clear_bit();
+
+ schedule_pcifront_aer_op(pdev);
+
+}
+
+irqreturn_t pcifront_handler_aer(int irq, void *dev)
+{
+ struct pcifront_device *pdev = dev;
+ schedule_pcifront_aer_op(pdev);
+ return IRQ_HANDLED;
+}
+
+int pcifront_connect(struct pcifront_device *pdev)
+{
+ int err = 0;
+
+ spin_lock(&pcifront_dev_lock);
+
+ if (!pcifront_dev) {
+ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
+ pcifront_dev = pdev;
+ } else {
+ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
+ err = -EEXIST;
+ }
+
+ spin_unlock(&pcifront_dev_lock);
+
+ return err;
+}
+
+void pcifront_disconnect(struct pcifront_device *pdev)
+{
+ spin_lock(&pcifront_dev_lock);
+
+ if (pdev == pcifront_dev) {
+ dev_info(&pdev->xdev->dev,
+ "Disconnecting PCI Frontend Buses\n");
+ pcifront_dev = NULL;
+ }
+
+ spin_unlock(&pcifront_dev_lock);
+}
+
+static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct pcifront_device *pdev;
+
+ pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+
+ pdev->sh_info =
+ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
+ if (pdev->sh_info == NULL) {
+ kfree(pdev);
+ pdev = NULL;
+ goto out;
+ }
+ pdev->sh_info->flags = 0;
+
+	/* Flag for registering the PV AER handler */
+ set_bit(_XEN_PCIB_AERHANDLER, (void *)&pdev->sh_info->flags);
+
+ dev_set_drvdata(&xdev->dev, pdev);
+ pdev->xdev = xdev;
+
+ INIT_LIST_HEAD(&pdev->root_buses);
+
+ spin_lock_init(&pdev->sh_info_lock);
+
+ pdev->evtchn = INVALID_EVTCHN;
+ pdev->gnt_ref = INVALID_GRANT_REF;
+ pdev->irq = -1;
+
+ INIT_WORK(&pdev->op_work, pcifront_do_aer);
+
+ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
+ pdev, pdev->sh_info);
+out:
+ return pdev;
+}
+
+static void free_pdev(struct pcifront_device *pdev)
+{
+ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
+
+ pcifront_free_roots(pdev);
+
+	/* For the PCIe AER error-handling job */
+ flush_scheduled_work();
+ unbind_from_irqhandler(pdev->irq, pdev);
+
+ if (pdev->evtchn != INVALID_EVTCHN)
+ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
+
+ if (pdev->gnt_ref != INVALID_GRANT_REF)
+ gnttab_end_foreign_access(pdev->gnt_ref, 0 /* r/w page */,
+ (unsigned long)pdev->sh_info);
+
+ dev_set_drvdata(&pdev->xdev->dev, NULL);
+ kfree(pdev);
+}
+
+static int pcifront_publish_info(struct pcifront_device *pdev)
+{
+ int err = 0;
+ struct xenbus_transaction trans;
+
+ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
+ if (err < 0)
+ goto out;
+
+ pdev->gnt_ref = err;
+
+ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
+ if (err)
+ goto out;
+
+ err = bind_evtchn_to_irqhandler(pdev->evtchn, pcifront_handler_aer,
+ 0, "pcifront", pdev);
+ if (err < 0) {
+ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
+ xenbus_dev_fatal(pdev->xdev, err, "Failed to bind evtchn to "
+ "irqhandler.\n");
+ return err;
+ }
+ pdev->irq = err;
+
+do_publish:
+ err = xenbus_transaction_start(&trans);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error writing configuration for backend "
+ "(start transaction)");
+ goto out;
+ }
+
+ err = xenbus_printf(trans, pdev->xdev->nodename,
+ "pci-op-ref", "%u", pdev->gnt_ref);
+ if (!err)
+ err = xenbus_printf(trans, pdev->xdev->nodename,
+ "event-channel", "%u", pdev->evtchn);
+ if (!err)
+ err = xenbus_printf(trans, pdev->xdev->nodename,
+ "magic", XEN_PCI_MAGIC);
+
+ if (err) {
+ xenbus_transaction_end(trans, 1);
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error writing configuration for backend");
+ goto out;
+ } else {
+ err = xenbus_transaction_end(trans, 0);
+ if (err == -EAGAIN)
+ goto do_publish;
+ else if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error completing transaction "
+ "for backend");
+ goto out;
+ }
+ }
+
+ xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+
+ dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
+
+out:
+ return err;
+}
+
+static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
+{
+ int err = -EFAULT;
+ int i, num_roots, len;
+ char str[64];
+ unsigned int domain, bus;
+
+ /* Only connect once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ err = pcifront_connect(pdev);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error connecting PCI Frontend");
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ "root_num", "%d", &num_roots);
+ if (err == -ENOENT) {
+ xenbus_dev_error(pdev->xdev, err,
+ "No PCI Roots found, trying 0000:00");
+ err = pcifront_scan_root(pdev, 0, 0);
+ num_roots = 0;
+ } else if (err != 1) {
+ if (err == 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of PCI roots");
+ goto out;
+ }
+
+ for (i = 0; i < num_roots; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%x:%x", &domain, &bus);
+ if (err != 2) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading PCI root %d", i);
+ goto out;
+ }
+
+ err = pcifront_scan_root(pdev, domain, bus);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error scanning PCI root %04x:%02x",
+ domain, bus);
+ goto out;
+ }
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+out:
+ return err;
+}
+
+static int pcifront_try_disconnect(struct pcifront_device *pdev)
+{
+ int err = 0;
+ enum xenbus_state prev_state;
+
+ prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
+
+ if (prev_state >= XenbusStateClosing)
+ goto out;
+
+ if (prev_state == XenbusStateConnected) {
+ pcifront_free_roots(pdev);
+ pcifront_disconnect(pdev);
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateClosed);
+
+out:
+
+ return err;
+}
+
+static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
+{
+ int err = -EFAULT;
+ int i, num_roots, len;
+ unsigned int domain, bus;
+ char str[64];
+
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateReconfiguring)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ "root_num", "%d", &num_roots);
+ if (err == -ENOENT) {
+ xenbus_dev_error(pdev->xdev, err,
+ "No PCI Roots found, trying 0000:00");
+ err = pcifront_rescan_root(pdev, 0, 0);
+ num_roots = 0;
+ } else if (err != 1) {
+ if (err == 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of PCI roots");
+ goto out;
+ }
+
+ for (i = 0; i < num_roots; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%x:%x", &domain, &bus);
+ if (err != 2) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading PCI root %d", i);
+ goto out;
+ }
+
+ err = pcifront_rescan_root(pdev, domain, bus);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error scanning PCI root %04x:%02x",
+ domain, bus);
+ goto out;
+ }
+ }
+
+ xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+out:
+ return err;
+}
+
+static int pcifront_detach_devices(struct pcifront_device *pdev)
+{
+ int err = 0;
+ int i, num_devs;
+ unsigned int domain, bus, slot, func;
+ struct pci_bus *pci_bus;
+ struct pci_dev *pci_dev;
+ char str[64];
+
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateConnected)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of PCI devices");
+ goto out;
+ }
+
+ /* Find devices being detached and remove them. */
+ for (i = 0; i < num_devs; i++) {
+ int l, state;
+ l = snprintf(str, sizeof(str), "state-%d", i);
+ if (unlikely(l >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d",
+ &state);
+ if (err != 1)
+ state = XenbusStateUnknown;
+
+ if (state != XenbusStateClosing)
+ continue;
+
+ /* Remove device. */
+ l = snprintf(str, sizeof(str), "vdev-%d", i);
+ if (unlikely(l >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+ if (err != 4) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading PCI device %d", i);
+ goto out;
+ }
+
+ pci_bus = pci_find_bus(domain, bus);
+ if (!pci_bus) {
+ dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n",
+ domain, bus);
+ continue;
+ }
+ pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func));
+ if (!pci_dev) {
+ dev_dbg(&pdev->xdev->dev,
+ "Cannot get PCI device %04x:%02x:%02x.%02x\n",
+ domain, bus, slot, func);
+ continue;
+ }
+ pci_remove_bus_device(pci_dev);
+ pci_dev_put(pci_dev);
+
+ dev_dbg(&pdev->xdev->dev,
+ "PCI device %04x:%02x:%02x.%02x removed.\n",
+ domain, bus, slot, func);
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
+
+out:
+ return err;
+}
+
+static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
+ enum xenbus_state be_state)
+{
+ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
+
+ switch (be_state) {
+ case XenbusStateUnknown:
+ case XenbusStateInitialising:
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
+ case XenbusStateClosed:
+ break;
+
+ case XenbusStateConnected:
+ pcifront_try_connect(pdev);
+ break;
+
+ case XenbusStateClosing:
+ dev_warn(&xdev->dev, "backend going away!\n");
+ pcifront_try_disconnect(pdev);
+ break;
+
+ case XenbusStateReconfiguring:
+ pcifront_detach_devices(pdev);
+ break;
+
+ case XenbusStateReconfigured:
+ pcifront_attach_devices(pdev);
+ break;
+ }
+}
+
+static int pcifront_xenbus_probe(struct xenbus_device *xdev,
+ const struct xenbus_device_id *id)
+{
+ int err = 0;
+ struct pcifront_device *pdev = alloc_pdev(xdev);
+
+ if (pdev == NULL) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(xdev, err,
+ "Error allocating pcifront_device struct");
+ goto out;
+ }
+
+ err = pcifront_publish_info(pdev);
+
+out:
+ return err;
+}
+
+static int pcifront_xenbus_remove(struct xenbus_device *xdev)
+{
+ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
+
+ if (pdev)
+ free_pdev(pdev);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+ {"pci"},
+ {""},
+};
+
+static struct xenbus_driver xenbus_pcifront_driver = {
+ .name = "pcifront",
+ .owner = THIS_MODULE,
+ .ids = xenpci_ids,
+ .probe = pcifront_xenbus_probe,
+ .remove = pcifront_xenbus_remove,
+ .otherend_changed = pcifront_backend_changed,
+};
+
+static int __init pcifront_init(void)
+{
+ if (!xen_domain())
+ return -ENODEV;
+
+ pci_frontend_registrar(1 /* enable */);
+
+ return xenbus_register_frontend(&xenbus_pcifront_driver);
+}
+
+static void __exit pcifront_cleanup(void)
+{
+ xenbus_unregister_driver(&xenbus_pcifront_driver);
+ pci_frontend_registrar(0 /* disable */);
+}
+module_init(pcifront_init);
+module_exit(pcifront_cleanup);
+
+MODULE_DESCRIPTION("Xen PCI passthrough frontend.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xen:pci");
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 188e1ba..efac9e3 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -2063,6 +2063,7 @@
select FB_SYS_IMAGEBLIT
select FB_SYS_FOPS
select FB_DEFERRED_IO
+ select XEN_XENBUS_FRONTEND
default y
help
This driver implements the front-end of the Xen virtual
diff --git a/drivers/video/broadsheetfb.c b/drivers/video/broadsheetfb.c
index 509cb92..df9ccb9 100644
--- a/drivers/video/broadsheetfb.c
+++ b/drivers/video/broadsheetfb.c
@@ -470,7 +470,7 @@
par->read_reg = broadsheet_read_reg;
init_waitqueue_head(&par->waitq);
- info->flags = FBINFO_FLAG_DEFAULT;
+ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
info->fbdefio = &broadsheetfb_defio;
fb_deferred_io_init(info);
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index c27ab1e..94414fc 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -144,7 +144,9 @@
static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
{
vma->vm_ops = &fb_deferred_io_vm_ops;
- vma->vm_flags |= ( VM_IO | VM_RESERVED | VM_DONTEXPAND );
+ vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND );
+ if (!(info->flags & FBINFO_VIRTFB))
+ vma->vm_flags |= VM_IO;
vma->vm_private_data = info;
return 0;
}
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 99bbd28..057433a 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1362,6 +1362,7 @@
vma->vm_pgoff = off >> PAGE_SHIFT;
/* This is an IO map - tell maydump to skip this VMA */
vma->vm_flags |= VM_IO | VM_RESERVED;
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
fb_pgprotect(file, vma, off);
if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
vma->vm_end - vma->vm_start, vma->vm_page_prot))
diff --git a/drivers/video/hecubafb.c b/drivers/video/hecubafb.c
index 0b4bffb..f9d77ad 100644
--- a/drivers/video/hecubafb.c
+++ b/drivers/video/hecubafb.c
@@ -253,7 +253,7 @@
par->send_command = apollo_send_command;
par->send_data = apollo_send_data;
- info->flags = FBINFO_FLAG_DEFAULT;
+ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
info->fbdefio = &hecubafb_defio;
fb_deferred_io_init(info);
diff --git a/drivers/video/metronomefb.c b/drivers/video/metronomefb.c
index df1f757..661bfd2 100644
--- a/drivers/video/metronomefb.c
+++ b/drivers/video/metronomefb.c
@@ -700,7 +700,7 @@
if (retval < 0)
goto err_free_irq;
- info->flags = FBINFO_FLAG_DEFAULT;
+ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
info->fbdefio = &metronomefb_defio;
fb_deferred_io_init(info);
diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
index 54cd916..dc72563 100644
--- a/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@ -25,7 +25,10 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
+
#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/interface/io/fbif.h>
@@ -440,7 +443,7 @@
fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
fb_info->fix.accel = FB_ACCEL_NONE;
- fb_info->flags = FBINFO_FLAG_DEFAULT;
+ fb_info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
if (ret < 0) {
@@ -627,6 +630,8 @@
switch (backend_state) {
case XenbusStateInitialising:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
case XenbusStateUnknown:
case XenbusStateClosed:
break;
@@ -680,7 +685,7 @@
static int __init xenfb_init(void)
{
- if (!xen_domain())
+ if (!xen_domain() || xen_hvm_domain())
return -ENODEV;
/* Nothing to do if running in dom0. */
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 3711b88..4fcb4c5 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -975,6 +975,16 @@
# XTENSA Architecture
+# Xen Architecture
+
+config XEN_WDT
+ tristate "Xen Watchdog support"
+ depends on XEN
+ help
+ Say Y here to support the hypervisor watchdog capability provided
+ by Xen 4.0 and newer. The watchdog timeout period is normally one
+ minute but can be changed with a boot-time parameter.
+
#
# ISA-based Watchdog Cards
#
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 699199b..2f6739a 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -141,6 +141,9 @@
# XTENSA Architecture
+# Xen
+obj-$(CONFIG_XEN_WDT) += xen_wdt.o
+
# Architecture Independant
obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o
obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o
diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c
new file mode 100644
index 0000000..bcfaafb
--- /dev/null
+++ b/drivers/watchdog/xen_wdt.c
@@ -0,0 +1,359 @@
+/*
+ * Xen Watchdog Driver
+ *
+ * (c) Copyright 2010 Novell, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DRV_NAME "wdt"
+#define DRV_VERSION "0.01"
+#define PFX DRV_NAME ": "
+
+#include <linux/bug.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/watchdog.h>
+#include <xen/xen.h>
+#include <asm/xen/hypercall.h>
+#include <xen/interface/sched.h>
+
+static struct platform_device *platform_device;
+static DEFINE_SPINLOCK(wdt_lock);
+static struct sched_watchdog wdt;
+static __kernel_time_t wdt_expires;
+static bool is_active, expect_release;
+
+#define WATCHDOG_TIMEOUT 60 /* in seconds */
+static unsigned int timeout = WATCHDOG_TIMEOUT;
+module_param(timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds "
+ "(default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, S_IRUGO);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
+ "(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+static inline __kernel_time_t set_timeout(void)
+{
+ wdt.timeout = timeout;
+ return ktime_to_timespec(ktime_get()).tv_sec + timeout;
+}
+
+static int xen_wdt_start(void)
+{
+ __kernel_time_t expires;
+ int err;
+
+ spin_lock(&wdt_lock);
+
+ expires = set_timeout();
+ if (!wdt.id)
+ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
+ else
+ err = -EBUSY;
+ if (err > 0) {
+ wdt.id = err;
+ wdt_expires = expires;
+ err = 0;
+ } else
+ BUG_ON(!err);
+
+ spin_unlock(&wdt_lock);
+
+ return err;
+}
+
+static int xen_wdt_stop(void)
+{
+ int err = 0;
+
+ spin_lock(&wdt_lock);
+
+ wdt.timeout = 0;
+ if (wdt.id)
+ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
+ if (!err)
+ wdt.id = 0;
+
+ spin_unlock(&wdt_lock);
+
+ return err;
+}
+
+static int xen_wdt_kick(void)
+{
+ __kernel_time_t expires;
+ int err;
+
+ spin_lock(&wdt_lock);
+
+ expires = set_timeout();
+ if (wdt.id)
+ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
+ else
+ err = -ENXIO;
+ if (!err)
+ wdt_expires = expires;
+
+ spin_unlock(&wdt_lock);
+
+ return err;
+}
+
+static int xen_wdt_open(struct inode *inode, struct file *file)
+{
+ int err;
+
+ /* /dev/watchdog can only be opened once */
+ if (xchg(&is_active, true))
+ return -EBUSY;
+
+ err = xen_wdt_start();
+ if (err == -EBUSY)
+ err = xen_wdt_kick();
+ return err ?: nonseekable_open(inode, file);
+}
+
+static int xen_wdt_release(struct inode *inode, struct file *file)
+{
+ if (expect_release)
+ xen_wdt_stop();
+ else {
+ printk(KERN_CRIT PFX
+ "unexpected close, not stopping watchdog!\n");
+ xen_wdt_kick();
+ }
+ is_active = false;
+ expect_release = false;
+ return 0;
+}
+
+static ssize_t xen_wdt_write(struct file *file, const char __user *data,
+ size_t len, loff_t *ppos)
+{
+ /* See if we got the magic character 'V' and reload the timer */
+ if (len) {
+ if (!nowayout) {
+ size_t i;
+
+ /* in case it was set long ago */
+ expect_release = false;
+
+ /* scan to see whether or not we got the magic
+ character */
+ for (i = 0; i != len; i++) {
+ char c;
+ if (get_user(c, data + i))
+ return -EFAULT;
+ if (c == 'V')
+ expect_release = true;
+ }
+ }
+
+ /* someone wrote to us, we should reload the timer */
+ xen_wdt_kick();
+ }
+ return len;
+}
+
+static long xen_wdt_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int new_options, retval = -EINVAL;
+ int new_timeout;
+ int __user *argp = (void __user *)arg;
+ static const struct watchdog_info ident = {
+ .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
+ .firmware_version = 0,
+ .identity = DRV_NAME,
+ };
+
+ switch (cmd) {
+ case WDIOC_GETSUPPORT:
+ return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0;
+
+ case WDIOC_GETSTATUS:
+ case WDIOC_GETBOOTSTATUS:
+ return put_user(0, argp);
+
+ case WDIOC_SETOPTIONS:
+ if (get_user(new_options, argp))
+ return -EFAULT;
+
+ if (new_options & WDIOS_DISABLECARD)
+ retval = xen_wdt_stop();
+ if (new_options & WDIOS_ENABLECARD) {
+ retval = xen_wdt_start();
+ if (retval == -EBUSY)
+ retval = xen_wdt_kick();
+ }
+ return retval;
+
+ case WDIOC_KEEPALIVE:
+ xen_wdt_kick();
+ return 0;
+
+ case WDIOC_SETTIMEOUT:
+ if (get_user(new_timeout, argp))
+ return -EFAULT;
+ if (!new_timeout)
+ return -EINVAL;
+ timeout = new_timeout;
+ xen_wdt_kick();
+ /* fall through */
+ case WDIOC_GETTIMEOUT:
+ return put_user(timeout, argp);
+
+ case WDIOC_GETTIMELEFT:
+ retval = wdt_expires - ktime_to_timespec(ktime_get()).tv_sec;
+ return put_user(retval, argp);
+ }
+
+ return -ENOTTY;
+}
+
+static const struct file_operations xen_wdt_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .write = xen_wdt_write,
+ .unlocked_ioctl = xen_wdt_ioctl,
+ .open = xen_wdt_open,
+ .release = xen_wdt_release,
+};
+
+static struct miscdevice xen_wdt_miscdev = {
+ .minor = WATCHDOG_MINOR,
+ .name = "watchdog",
+ .fops = &xen_wdt_fops,
+};
+
+static int __devinit xen_wdt_probe(struct platform_device *dev)
+{
+ struct sched_watchdog wd = { .id = ~0 };
+ int ret = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);
+
+ switch (ret) {
+ case -EINVAL:
+ if (!timeout) {
+ timeout = WATCHDOG_TIMEOUT;
+ printk(KERN_INFO PFX
+ "timeout value invalid, using %d\n", timeout);
+ }
+
+ ret = misc_register(&xen_wdt_miscdev);
+ if (ret) {
+ printk(KERN_ERR PFX
+ "cannot register miscdev on minor=%d (%d)\n",
+ WATCHDOG_MINOR, ret);
+ break;
+ }
+
+ printk(KERN_INFO PFX
+ "initialized (timeout=%ds, nowayout=%d)\n",
+ timeout, nowayout);
+ break;
+
+ case -ENOSYS:
+ printk(KERN_INFO PFX "not supported\n");
+ ret = -ENODEV;
+ break;
+
+ default:
+ printk(KERN_INFO PFX "bogus return value %d\n", ret);
+ break;
+ }
+
+ return ret;
+}
+
+static int __devexit xen_wdt_remove(struct platform_device *dev)
+{
+ /* Stop the timer before we leave */
+ if (!nowayout)
+ xen_wdt_stop();
+
+ misc_deregister(&xen_wdt_miscdev);
+
+ return 0;
+}
+
+static void xen_wdt_shutdown(struct platform_device *dev)
+{
+ xen_wdt_stop();
+}
+
+static int xen_wdt_suspend(struct platform_device *dev, pm_message_t state)
+{
+ return xen_wdt_stop();
+}
+
+static int xen_wdt_resume(struct platform_device *dev)
+{
+ return xen_wdt_start();
+}
+
+static struct platform_driver xen_wdt_driver = {
+ .probe = xen_wdt_probe,
+ .remove = __devexit_p(xen_wdt_remove),
+ .shutdown = xen_wdt_shutdown,
+ .suspend = xen_wdt_suspend,
+ .resume = xen_wdt_resume,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = DRV_NAME,
+ },
+};
+
+static int __init xen_wdt_init_module(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ printk(KERN_INFO PFX "Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
+
+ err = platform_driver_register(&xen_wdt_driver);
+ if (err)
+ return err;
+
+ platform_device = platform_device_register_simple(DRV_NAME,
+ -1, NULL, 0);
+ if (IS_ERR(platform_device)) {
+ err = PTR_ERR(platform_device);
+ platform_driver_unregister(&xen_wdt_driver);
+ }
+
+ return err;
+}
+
+static void __exit xen_wdt_cleanup_module(void)
+{
+ platform_device_unregister(platform_device);
+ platform_driver_unregister(&xen_wdt_driver);
+ printk(KERN_INFO PFX "module unloaded\n");
+}
+
+module_init(xen_wdt_init_module);
+module_exit(xen_wdt_cleanup_module);
+
+MODULE_AUTHOR("Jan Beulich <jbeulich@novell.com>");
+MODULE_DESCRIPTION("Xen WatchDog Timer Driver");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
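Since the driver implements the stock /dev/watchdog character-device ABI, a userspace consumer looks like the sketch below (not part of the patch; the WDIOC_* ioctls and the 'V' magic-close byte are the generic watchdog interface, and error handling is omitted for brevity).

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
	int fd = open("/dev/watchdog", O_WRONLY); /* xen_wdt_open(): arms the dog */
	int timeout = 30;

	ioctl(fd, WDIOC_SETTIMEOUT, &timeout);	/* re-arms via xen_wdt_kick() */
	ioctl(fd, WDIOC_KEEPALIVE, 0);		/* periodic heartbeat */
	write(fd, "V", 1);			/* magic close: sets expect_release */
	close(fd);				/* xen_wdt_release(): stops the dog */
	return 0;
}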
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index cab100a..fa9982e 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -28,6 +28,110 @@
firing.
If in doubt, say yes.
+config XEN_BACKEND
+ bool "Backend driver support"
+ depends on XEN_DOM0
+ default y
+ help
+ Support for backend device drivers that provide I/O services
+ to other virtual machines.
+
+config XEN_NETDEV_BACKEND
+ tristate "Xen backend network device"
+ depends on XEN_BACKEND && NET
+ help
+ Implement the network backend driver, which passes packets
+ from the guest domain's frontend drivers to the network.
+
+config XEN_BLKDEV_BACKEND
+ tristate "Block-device backend driver"
+ depends on XEN_BACKEND && BLOCK
+ help
+ The block-device backend driver allows the kernel to export its
+ block devices to other guests via a high-performance shared-memory
+ interface.
+
+config XEN_BLKDEV_TAP
+ tristate "Block-device tap backend driver"
+ depends on XEN_BACKEND && BLOCK
+ help
+ The block tap driver is an alternative to the block back driver
+ and allows VM block requests to be redirected to userspace through
+ a device interface. The tap allows user-space development of
+ high-performance block backends, where disk images may be implemented
+ as files, in memory, or on other hosts across the network. This
+ driver can safely coexist with the existing blockback driver.
+
+config XEN_BLKBACK_PAGEMAP
+ tristate
+ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n
+ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP
+
+config XEN_PCIDEV_BACKEND
+ tristate "PCI-device backend driver"
+ depends on PCI && XEN_BACKEND
+ default XEN_BACKEND
+ help
+ The PCI device backend driver allows the kernel to export arbitrary
+ PCI devices to other guests. If you select this to be a module, you
+ will need to make sure no other driver has bound to the device(s)
+ you want to make visible to other guests.
+
+choice
+ prompt "PCI Backend Mode"
+ depends on XEN_PCIDEV_BACKEND
+ default XEN_PCIDEV_BACKEND_VPCI if !IA64
+ default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
+
+config XEN_PCIDEV_BACKEND_VPCI
+ bool "Virtual PCI"
+ ---help---
+ This PCI Backend hides the true PCI topology and makes the frontend
+ think there is a single PCI bus with only the exported devices on it.
+ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
+ second device at 02:1a.1 will be re-assigned to 00:01.1.
+
+config XEN_PCIDEV_BACKEND_PASS
+ bool "Passthrough"
+ ---help---
+ This PCI Backend provides a real view of the PCI topology to the
+ frontend (for example, a device at 06:01.b will still appear at
+ 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
+ PCI devices to its driver domains. This may be required for drivers
+	  which depend on finding their hardware in certain bus/slot
+ locations.
+
+config XEN_PCIDEV_BACKEND_SLOT
+ bool "Slot"
+ ---help---
+ This PCI Backend hides the true PCI topology and makes the frontend
+ think there is a single PCI bus with only the exported devices on it.
+ Contrary to the virtual PCI backend, a function becomes a new slot.
+ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
+ second device at 02:1a.1 will be re-assigned to 00:01.0.
+
+config XEN_PCIDEV_BACKEND_CONTROLLER
+ bool "Controller"
+ depends on IA64
+ ---help---
+ This PCI backend virtualizes the PCI bus topology by providing a
+ virtual bus per PCI root device. Devices which are physically under
+ the same root bus will appear on the same virtual bus. For systems
+ with complex I/O addressing, this is the only backend which supports
+ extended I/O port spaces and MMIO translation offsets. This backend
+ also supports slot virtualization. For example, a device at
+ 0000:01:02.1 will be re-assigned to 0000:00:00.0. A second device
+ at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
+ re-assigned to 0000:00:01.0. A third device at 0000:16:05.0 (under
+ a different PCI root bus) will be re-assigned to 0000:01:00.0.
+
+endchoice
+
+config XEN_PCIDEV_BE_DEBUG
+ bool "PCI Backend Debugging"
+ depends on XEN_PCIDEV_BACKEND
+
config XENFS
tristate "Xen filesystem"
depends on XEN
@@ -60,4 +164,37 @@
Create entries under /sys/hypervisor describing the Xen
hypervisor environment. When running native or in another
virtual environment, /sys/hypervisor will still be present,
- but will have no xen contents.
\ No newline at end of file
+ but will have no xen contents.
+
+config XEN_MCE
+ def_bool y
+ depends on XEN_DOM0 && X86_64 && X86_MCE_INTEL
+
+config XEN_XENBUS_FRONTEND
+ tristate
+
+config XEN_GNTDEV
+ tristate "userspace grant access device driver"
+ depends on XEN
+ select MMU_NOTIFIER
+ help
+	  Allows userspace processes to use grants.
+
+config XEN_S3
+ def_bool y
+ depends on XEN_DOM0 && ACPI
+
+config ACPI_PROCESSOR_XEN
+ tristate
+ depends on XEN_DOM0 && ACPI_PROCESSOR && CPU_FREQ
+ default y
+
+config XEN_PLATFORM_PCI
+ tristate "xen platform pci device driver"
+ depends on XEN_PVHVM
+ default m
+ help
+ Driver for the Xen PCI Platform device: it is responsible for
+ initializing xenbus and grant_table when running in a Xen HVM
+ domain. As a consequence this driver is required to run any Xen PV
+ frontend on Xen HVM.
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 7c28434..ef1ea63 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,12 +1,27 @@
-obj-y += grant-table.o features.o events.o manage.o
+obj-y += grant-table.o features.o events.o manage.o biomerge.o pcpu.o
obj-y += xenbus/
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_features.o := $(nostackp)
-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
-obj-$(CONFIG_XEN_BALLOON) += balloon.o
-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
-obj-$(CONFIG_XENFS) += xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file
+obj-$(CONFIG_PCI) += pci.o
+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+obj-$(CONFIG_XEN_BALLOON) += balloon.o
+obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
+obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
+obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
+obj-$(CONFIG_XENFS) += xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+obj-$(CONFIG_XEN_MCE) += mce.o
+
+obj-$(CONFIG_XEN_S3) += acpi.o
+obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o
+obj-$(CONFIG_ACPI_HOTPLUG_MEMORY) += xen_acpi_memhotplug.o
+obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
+
+xen-evtchn-y := evtchn.o
+xen-gntdev-y := gntdev.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
new file mode 100644
index 0000000..e6d3d0e
--- /dev/null
+++ b/drivers/xen/acpi.c
@@ -0,0 +1,23 @@
+#include <xen/acpi.h>
+
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+int acpi_notify_hypervisor_state(u8 sleep_state,
+ u32 pm1a_cnt, u32 pm1b_cnt)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_enter_acpi_sleep,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u = {
+ .enter_acpi_sleep = {
+ .pm1a_cnt_val = (u16)pm1a_cnt,
+ .pm1b_cnt_val = (u16)pm1b_cnt,
+ .sleep_state = sleep_state,
+ },
+ },
+ };
+
+ return HYPERVISOR_dom0_op(&op);
+}
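A hedged sketch of the expected caller (an assumption, not shown in this series' hunks): the ACPI sleep path hands the helper above the S-state and the PM1a/PM1b control values it parsed, so that Xen performs the actual sleep entry on dom0's behalf.

static int sketch_enter_s3(u32 pm1a_cnt, u32 pm1b_cnt)
{
	/* 3 == ACPI S3; the control values come from ACPI's _Sx parsing */
	return acpi_notify_hypervisor_state(3, pm1a_cnt, pm1b_cnt);
}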
diff --git a/drivers/xen/acpi_processor.c b/drivers/xen/acpi_processor.c
new file mode 100644
index 0000000..e83b615
--- /dev/null
+++ b/drivers/xen/acpi_processor.c
@@ -0,0 +1,417 @@
+/*
+ * acpi_processor.c - interface to notify Xen on acpi processor object
+ * info parsing
+ *
+ * Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+
+#include <linux/cpufreq.h>
+#include <acpi/processor.h>
+#include <xen/acpi.h>
+#include <xen/pcpu.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event);
+
+static struct processor_cntl_xen_ops xen_ops = {
+ .hotplug = xen_hotplug_notifier,
+};
+
+static struct acpi_power_register *power_registers[XEN_MAX_ACPI_ID + 1];
+
+int processor_cntl_xen_power_cache(int cpu, int cx,
+ struct acpi_power_register *reg)
+{
+ struct acpi_power_register *buf;
+
+ if (cpu < 0 || cpu > XEN_MAX_ACPI_ID ||
+ cx < 1 || cx > ACPI_PROCESSOR_MAX_POWER) {
+ return -EINVAL;
+ }
+
+ if (power_registers[cpu] == NULL) {
+ buf = kzalloc(ACPI_PROCESSOR_MAX_POWER *
+			sizeof(struct acpi_power_register), GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ power_registers[cpu] = buf;
+ }
+
+ memcpy(power_registers[cpu]+cx-1, reg, sizeof(*reg));
+
+ return 0;
+}
+EXPORT_SYMBOL(processor_cntl_xen_power_cache);
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+static int xen_get_apic_id(acpi_handle handle)
+{
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *obj;
+ struct acpi_madt_local_apic *lapic;
+ u8 physid;
+
+ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+ return -EINVAL;
+
+ if (!buffer.length || !buffer.pointer)
+ return -EINVAL;
+
+ obj = buffer.pointer;
+ if (obj->type != ACPI_TYPE_BUFFER ||
+ obj->buffer.length < sizeof(*lapic)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
+
+ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
+ !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ physid = lapic->id;
+ kfree(buffer.pointer);
+ buffer.length = ACPI_ALLOCATE_BUFFER;
+ buffer.pointer = NULL;
+
+ return physid;
+}
+#else
+static int xen_get_apic_id(acpi_handle handle)
+{
+ return -1;
+}
+#endif
+
+int processor_cntl_xen_notify(struct acpi_processor *pr, int event, int type)
+{
+ int ret = -EINVAL;
+
+ switch (event) {
+ case PROCESSOR_PM_INIT:
+ case PROCESSOR_PM_CHANGE:
+ if ((type >= PM_TYPE_MAX) ||
+ !xen_ops.pm_ops[type])
+ break;
+
+ ret = xen_ops.pm_ops[type](pr, event);
+ break;
+ case PROCESSOR_HOTPLUG:
+ {
+ int apic_id;
+
+ apic_id = xen_get_apic_id(pr->handle);
+ if (apic_id < 0)
+ break;
+ if (xen_ops.hotplug)
+ ret = xen_ops.hotplug(pr, type);
+ xen_pcpu_hotplug(type, apic_id);
+ break;
+ }
+ default:
+		printk(KERN_ERR "Unsupported processor event %d.\n", event);
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(processor_cntl_xen_notify);
+
+static inline void xen_convert_pct_reg(struct xen_pct_register *xpct,
+ struct acpi_pct_register *apct)
+{
+ xpct->descriptor = apct->descriptor;
+ xpct->length = apct->length;
+ xpct->space_id = apct->space_id;
+ xpct->bit_width = apct->bit_width;
+ xpct->bit_offset = apct->bit_offset;
+ xpct->reserved = apct->reserved;
+ xpct->address = apct->address;
+}
+
+static inline void xen_convert_pss_states(struct xen_processor_px *xpss,
+ struct acpi_processor_px *apss, int state_count)
+{
+ int i;
+ for (i = 0; i < state_count; i++) {
+ xpss->core_frequency = apss->core_frequency;
+ xpss->power = apss->power;
+ xpss->transition_latency = apss->transition_latency;
+ xpss->bus_master_latency = apss->bus_master_latency;
+ xpss->control = apss->control;
+ xpss->status = apss->status;
+ xpss++;
+ apss++;
+ }
+}
+
+static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd,
+ struct acpi_psd_package *apsd)
+{
+ xpsd->num_entries = apsd->num_entries;
+ xpsd->revision = apsd->revision;
+ xpsd->domain = apsd->domain;
+ xpsd->coord_type = apsd->coord_type;
+ xpsd->num_processors = apsd->num_processors;
+}
+
+static int xen_cx_notifier(struct acpi_processor *pr, int action)
+{
+ int ret, count = 0, i;
+ xen_platform_op_t op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_CX,
+ };
+ struct xen_processor_cx *data, *buf;
+ struct acpi_processor_cx *cx;
+ struct acpi_power_register *reg;
+
+ if (action == PROCESSOR_PM_CHANGE)
+ return -EINVAL;
+
+ if (power_registers[pr->acpi_id] == NULL) {
+ printk(KERN_WARNING "No C state info for acpi processor %d\n",
+ pr->acpi_id);
+ return -EINVAL;
+ }
+
+ /* Convert to Xen defined structure and hypercall */
+ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
+ GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ data = buf;
+ for (i = 1; i <= pr->power.count; i++) {
+ cx = &pr->power.states[i];
+ reg = power_registers[pr->acpi_id]+i-1;
+ /* Skip invalid cstate entry */
+ if (!cx->valid)
+ continue;
+
+ data->type = cx->type;
+ data->latency = cx->latency;
+ data->power = cx->power;
+ data->reg.space_id = reg->space_id;
+ data->reg.bit_width = reg->bit_width;
+ data->reg.bit_offset = reg->bit_offset;
+ data->reg.access_size = reg->access_size;
+ data->reg.address = reg->address;
+
+ /* Get dependency relationships, _CSD is not supported yet */
+ data->dpcnt = 0;
+ set_xen_guest_handle(data->dp, NULL);
+
+ data++;
+ count++;
+ }
+
+ if (!count) {
+ printk(KERN_ERR "No available Cx info for cpu %d\n",
+ pr->acpi_id);
+ kfree(buf);
+ return -EINVAL;
+ }
+
+ op.u.set_pminfo.power.count = count;
+ op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
+ op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
+ op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
+ op.u.set_pminfo.power.flags.power_setup_done =
+ pr->flags.power_setup_done;
+
+ set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
+ ret = HYPERVISOR_dom0_op(&op);
+ kfree(buf);
+ return ret;
+}
+
+static int xen_px_notifier(struct acpi_processor *pr, int action)
+{
+ int ret = -EINVAL;
+ xen_platform_op_t op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_PX,
+ };
+ struct xen_processor_performance *perf;
+ struct xen_processor_px *states = NULL;
+ struct acpi_processor_performance *px;
+ struct acpi_psd_package *pdomain;
+
+ if (!pr)
+ return -EINVAL;
+
+ perf = &op.u.set_pminfo.perf;
+ px = pr->performance;
+
+ switch (action) {
+ case PROCESSOR_PM_CHANGE:
+ /* ppc dynamic handle */
+ perf->flags = XEN_PX_PPC;
+ perf->platform_limit = pr->performance_platform_limit;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ break;
+
+ case PROCESSOR_PM_INIT:
+ /* px normal init */
+ perf->flags = XEN_PX_PPC |
+ XEN_PX_PCT |
+ XEN_PX_PSS |
+ XEN_PX_PSD;
+
+ /* ppc */
+ perf->platform_limit = pr->performance_platform_limit;
+
+ /* pct */
+ xen_convert_pct_reg(&perf->control_register,
+ &px->control_register);
+ xen_convert_pct_reg(&perf->status_register,
+ &px->status_register);
+
+ /* pss */
+ perf->state_count = px->state_count;
+ states = kzalloc(px->state_count*sizeof(xen_processor_px_t),
+ GFP_KERNEL);
+ if (!states)
+ return -ENOMEM;
+ xen_convert_pss_states(states, px->states, px->state_count);
+ set_xen_guest_handle(perf->states, states);
+
+ /* psd */
+ pdomain = &px->domain_info;
+ xen_convert_psd_pack(&perf->domain_info, pdomain);
+ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
+ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
+ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
+ perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
+ else {
+ ret = -ENODEV;
+ kfree(states);
+ break;
+ }
+
+ ret = HYPERVISOR_dom0_op(&op);
+ kfree(states);
+ break;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int xen_tx_notifier(struct acpi_processor *pr, int action)
+{
+ return -EINVAL;
+}
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
+{
+ int ret = -EINVAL;
+	int apic_id;
+ unsigned long long pxm;
+ acpi_status status = 0;
+
+ xen_platform_op_t op = {
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+
+ apic_id = xen_get_apic_id(pr->handle);
+ if (apic_id < 0) {
+ printk(KERN_WARNING "Can't get apic_id for acpi_id %x\n",
+ pr->acpi_id);
+ return -1;
+ }
+
+ status = acpi_evaluate_integer(pr->handle, "_PXM",
+ NULL, &pxm);
+ if (ACPI_FAILURE(status)) {
+		printk(KERN_WARNING "Can't get pxm for acpi_id %x\n",
+ pr->acpi_id);
+ return -1;
+ }
+
+ switch (event) {
+ case HOTPLUG_TYPE_ADD:
+ op.cmd = XENPF_cpu_hotadd;
+ op.u.cpu_add.apic_id = apic_id;
+ op.u.cpu_add.acpi_id = pr->acpi_id;
+ op.u.cpu_add.pxm = pxm;
+ ret = HYPERVISOR_dom0_op(&op);
+ break;
+ case HOTPLUG_TYPE_REMOVE:
+ printk(KERN_WARNING "Xen not support CPU hotremove\n");
+ ret = -ENOSYS;
+ break;
+ }
+
+ return ret;
+}
+#else
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
+{
+ return -ENOSYS;
+}
+#endif
+
+static int __init xen_acpi_processor_extcntl_init(void)
+{
+ unsigned int pmbits;
+
+ /* Only Xen dom0 is allowed to handle ACPI processor info */
+ if (!xen_initial_domain())
+ return 0;
+
+ pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
+
+ if (pmbits & XEN_PROCESSOR_PM_CX)
+ xen_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
+ if (pmbits & XEN_PROCESSOR_PM_PX)
+ xen_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
+ if (pmbits & XEN_PROCESSOR_PM_TX)
+ xen_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
+
+ return 0;
+}
+
+subsys_initcall(xen_acpi_processor_extcntl_init);
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 4204336..a065fda 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -43,22 +43,26 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/sysdev.h>
+#include <linux/swap.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
+#include <asm/e820.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/xenbus.h>
#include <xen/features.h>
#include <xen/page.h>
-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10))
#define BALLOON_CLASS_NAME "xen_memory"
@@ -82,15 +86,16 @@
static int register_balloon(struct sys_device *sysdev);
-/*
- * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
- */
-static DEFINE_SPINLOCK(balloon_lock);
-
static struct balloon_stats balloon_stats;
+/*
+ * Work in pages of this order. Can be either 0 for normal pages
+ * or 9 for hugepages.
+ */
+static int balloon_order;
+static unsigned long balloon_npages;
+static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
@@ -118,12 +123,43 @@
static void scrub_page(struct page *page)
{
#ifdef CONFIG_XEN_SCRUB_PAGES
- clear_highpage(page);
+ int i;
+
+ for (i = 0; i < balloon_npages; i++)
+ clear_highpage(page++);
#endif
}
+static void free_discontig_frame(void)
+{
+ int rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .domid = DOMID_SELF,
+ .nr_extents = balloon_npages,
+ .extent_order = 0
+ };
+
+ set_xen_guest_handle(reservation.extent_start, discontig_frame_list);
+ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(rc != balloon_npages);
+}
+
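+/*
+ * Compact frame_list in place: decrease_reservation() zeroes the slots
+ * of extents it has already returned frame-by-frame, and this drops
+ * those slots so the bulk XENMEM_decrease_reservation below only sees
+ * valid machine frames. Returns the number of entries that remain.
+ */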
+static unsigned long shrink_frame(unsigned long nr_pages)
+{
+ unsigned long i, j;
+
+ for (i = 0, j = 0; j < nr_pages; j++) {
+ if (frame_list[j] != 0)
+ frame_list[i++] = frame_list[j];
+ }
+ return i;
+}
+
/* balloon_append: add the given page to the balloon. */
-static void balloon_append(struct page *page)
+static void __balloon_append(struct page *page)
{
/* Lowmem is re-populated first, so highmem pages go at list tail. */
if (PageHighMem(page)) {
@@ -134,7 +170,11 @@
list_add(&page->lru, &ballooned_pages);
balloon_stats.balloon_low++;
}
+}
+static void balloon_append(struct page *page)
+{
+ __balloon_append(page);
totalram_pages--;
}
@@ -195,20 +235,17 @@
static int increase_reservation(unsigned long nr_pages)
{
- unsigned long pfn, i, flags;
+ unsigned long pfn, mfn, i, j;
struct page *page;
long rc;
struct xen_memory_reservation reservation = {
.address_bits = 0,
- .extent_order = 0,
.domid = DOMID_SELF
};
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
- spin_lock_irqsave(&balloon_lock, flags);
-
page = balloon_first_page();
for (i = 0; i < nr_pages; i++) {
BUG_ON(page == NULL);
@@ -218,6 +255,8 @@
set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
+ reservation.extent_order = balloon_order;
+
rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
if (rc < 0)
goto out;
@@ -227,19 +266,22 @@
BUG_ON(page == NULL);
pfn = page_to_pfn(page);
+ mfn = frame_list[i];
BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
phys_to_machine_mapping_valid(pfn));
- set_phys_to_machine(pfn, frame_list[i]);
+ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) {
+ set_phys_to_machine(pfn, mfn);
- /* Link back into the page tables if not highmem. */
- if (pfn < max_low_pfn) {
- int ret;
- ret = HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- mfn_pte(frame_list[i], PAGE_KERNEL),
- 0);
- BUG_ON(ret);
+ /* Link back into the page tables if not highmem. */
+ if (!xen_hvm_domain() && pfn < max_low_pfn) {
+ int ret;
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(mfn, PAGE_KERNEL),
+ 0);
+ BUG_ON(ret);
+ }
}
/* Relinquish the page back to the allocator. */
@@ -251,20 +293,18 @@
balloon_stats.current_pages += rc;
out:
- spin_unlock_irqrestore(&balloon_lock, flags);
-
return rc < 0 ? rc : rc != nr_pages;
}
static int decrease_reservation(unsigned long nr_pages)
{
- unsigned long pfn, i, flags;
- struct page *page;
+ unsigned long pfn, lpfn, mfn, i, j;
+ struct page *page = NULL;
int need_sleep = 0;
- int ret;
+ int discontig = 0, discontig_free;
+ int ret;
struct xen_memory_reservation reservation = {
.address_bits = 0,
- .extent_order = 0,
.domid = DOMID_SELF
};
@@ -272,7 +312,7 @@
nr_pages = ARRAY_SIZE(frame_list);
for (i = 0; i < nr_pages; i++) {
- if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) {
nr_pages = i;
need_sleep = 1;
break;
@@ -282,38 +322,49 @@
frame_list[i] = pfn_to_mfn(pfn);
scrub_page(page);
-
- if (!PageHighMem(page)) {
- ret = HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- __pte_ma(0), 0);
- BUG_ON(ret);
- }
-
}
/* Ensure that ballooned highmem pages don't have kmaps. */
kmap_flush_unused();
flush_tlb_all();
- spin_lock_irqsave(&balloon_lock, flags);
-
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
- pfn = mfn_to_pfn(frame_list[i]);
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ mfn = frame_list[i];
+ lpfn = pfn = mfn_to_pfn(mfn);
balloon_append(pfn_to_page(pfn));
+ discontig_free = 0;
+ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) {
+ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn)
+ discontig_free = 1;
+
+ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY);
+ page = pfn_to_page(lpfn);
+
+ if (!xen_hvm_domain() && !PageHighMem(page)) {
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(lpfn << PAGE_SHIFT),
+ __pte_ma(0), 0);
+ BUG_ON(ret);
+ }
+ }
+ if (discontig_free) {
+ free_discontig_frame();
+ frame_list[i] = 0;
+ discontig = 1;
+ }
}
+ balloon_stats.current_pages -= nr_pages;
+
+ if (discontig)
+ nr_pages = shrink_frame(nr_pages);
set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
+ reservation.extent_order = balloon_order;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
BUG_ON(ret != nr_pages);
- balloon_stats.current_pages -= nr_pages;
-
- spin_unlock_irqrestore(&balloon_lock, flags);
-
return need_sleep;
}
@@ -379,7 +430,7 @@
/* The given memory/target value is in KiB, so it needs converting to
* pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
*/
- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order));
}
static int balloon_init_watcher(struct notifier_block *notifier,
@@ -399,15 +450,22 @@
static int __init balloon_init(void)
{
- unsigned long pfn;
+ unsigned long pfn, nr_pages, extra_pfn_end;
struct page *page;
- if (!xen_pv_domain())
+ if (!xen_domain())
return -ENODEV;
- pr_info("xen_balloon: Initialising balloon driver.\n");
+ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n",
+ balloon_order);
- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ balloon_npages = 1 << balloon_order;
+
+ if (xen_pv_domain())
+ nr_pages = xen_start_info->nr_pages;
+ else
+ nr_pages = max_pfn;
+ balloon_stats.current_pages = (min(nr_pages, max_pfn)) >> balloon_order;
balloon_stats.target_pages = balloon_stats.current_pages;
balloon_stats.balloon_low = 0;
balloon_stats.balloon_high = 0;
@@ -419,11 +477,24 @@
register_balloon(&balloon_sysdev);
- /* Initialise the balloon with excess memory space. */
- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ /*
+ * Initialise the balloon with excess memory space. We need
+ * to make sure we don't add memory which doesn't exist or
+ * logically exist. The E820 map can be trimmed to be smaller
+ * than the amount of physical memory due to the mem= command
+ * line parameter. And if this is a 32-bit non-HIGHMEM kernel
+ * on a system with memory which requires highmem to access,
+ * don't try to use it.
+ */
+ extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
+ (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
+ for (pfn = PFN_UP(xen_extra_mem_start);
+ pfn < extra_pfn_end;
+ pfn += balloon_npages) {
page = pfn_to_page(pfn);
- if (!PageReserved(page))
- balloon_append(page);
+ /* totalram_pages doesn't include the boot-time
+ balloon extension, so don't subtract from it. */
+ __balloon_append(page);
}
target_watch.callback = watch_target;
@@ -444,6 +515,121 @@
module_exit(balloon_exit);
+static int __init balloon_parse_huge(char *s)
+{
+ balloon_order = 9;
+ return 1;
+}
+
+__setup("balloon_hugepages", balloon_parse_huge);
+
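+/*
+ * Invoked via apply_to_page_range() for each PTE backing a page we are
+ * emptying: clear the PTE, invalidate the P2M entry, and hand the
+ * machine frame back to the hypervisor one extent at a time.
+ */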
+static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ unsigned long mfn = pte_mfn(*pte);
+ int ret;
+ struct xen_memory_reservation reservation = {
+ .nr_extents = 1,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
+ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != 1);
+
+ return 0;
+}
+
+struct page **alloc_empty_pages_and_pagevec(int nr_pages)
+{
+ struct page *page, **pagevec;
+ int npages;
+ int i, j, ret;
+
+ /* Round up to next number of balloon_order pages */
+ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
+
+ pagevec = kmalloc(sizeof(page) * (npages << balloon_order), GFP_KERNEL);
+ if (pagevec == NULL)
+ return NULL;
+
+ for (i = 0; i < npages; i++) {
+ void *v;
+
+ page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order);
+ if (page == NULL)
+ goto err;
+
+ scrub_page(page);
+
+ mutex_lock(&balloon_mutex);
+
+ v = page_address(page);
+
+ ret = apply_to_page_range(&init_mm, (unsigned long)v,
+ PAGE_SIZE << balloon_order,
+ dealloc_pte_fn, NULL);
+
+ if (ret != 0) {
+ mutex_unlock(&balloon_mutex);
+ /* balloon_free_page() would use free_cold_page */
+ __free_pages(page, balloon_order);
+ goto err;
+ }
+ for (j = 0; j < balloon_npages; j++)
+ pagevec[(i<<balloon_order)+j] = page++;
+
+ totalram_pages = balloon_stats.current_pages -= balloon_npages;
+
+ mutex_unlock(&balloon_mutex);
+ }
+
+ out:
+ schedule_work(&balloon_worker);
+ flush_tlb_all();
+ return pagevec;
+
+ err:
+ mutex_lock(&balloon_mutex);
+ while (--i >= 0)
+ balloon_append(pagevec[i << balloon_order]);
+ mutex_unlock(&balloon_mutex);
+ kfree(pagevec);
+ pagevec = NULL;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
+
+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
+{
+ struct page *page;
+ int i;
+ int npages;
+
+ if (pagevec == NULL)
+ return;
+
+ /* Round up to next number of balloon_order pages */
+ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
+
+ mutex_lock(&balloon_mutex);
+ for (i = 0; i < npages; i++) {
+ page = pagevec[i << balloon_order];
+ BUG_ON(page_count(page) != 1);
+ balloon_append(page);
+ }
+ mutex_unlock(&balloon_mutex);
+
+ kfree(pagevec);
+
+ schedule_work(&balloon_worker);
+}
+EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
+
#define BALLOON_SHOW(name, format, args...) \
static ssize_t show_##name(struct sys_device *dev, \
struct sysdev_attribute *attr, \
@@ -477,7 +663,7 @@
target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
return count;
}
@@ -491,7 +677,7 @@
{
return sprintf(buf, "%llu\n",
(unsigned long long)balloon_stats.target_pages
- << PAGE_SHIFT);
+ << (PAGE_SHIFT + balloon_order));
}
static ssize_t store_target(struct sys_device *dev,
@@ -507,7 +693,7 @@
target_bytes = memparse(buf, &endchar);
- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
return count;
}
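
Every sysfs and xenstore target value above is converted with the same shift,
now widened by balloon_order so targets are counted in whole extents. A worked
example of the arithmetic in plain C (x86 values assumed):

    #include <stdio.h>

    int main(void)
    {
            int page_shift = 12;                          /* 4 KiB base pages */
            int balloon_order = 9;                        /* 2 MiB extents */
            unsigned long long target_bytes = 1ULL << 30; /* 1 GiB target */

            /* Same shift as store_target(): bytes -> balloon-order extents. */
            unsigned long long extents =
                    target_bytes >> (page_shift + balloon_order);

            printf("%llu bytes -> %llu extents\n", target_bytes, extents);
            return 0;                                     /* prints 512 */
    }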
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
new file mode 100644
index 0000000..d40f534
--- /dev/null
+++ b/drivers/xen/biomerge.c
@@ -0,0 +1,14 @@
+#include <linux/bio.h>
+#include <asm/io.h>
+#include <xen/page.h>
+
+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2)
+{
+ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
+ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+
+ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
+ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+}
+
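
Two bio_vecs may only merge when their machine frames are identical or
consecutive: under Xen, pseudo-physically adjacent pages need not be
machine-adjacent, which is the whole reason this hook exists. A toy model of
the mfn adjacency test (the P2M table here is invented; the real check also
requires __BIOVEC_PHYS_MERGEABLE to pass):

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented, deliberately non-linear pfn -> mfn mapping. */
    static const unsigned long fake_p2m[] = { 1000, 1001, 5000 };

    static bool mfn_mergeable(unsigned long pfn1, unsigned long pfn2)
    {
            unsigned long mfn1 = fake_p2m[pfn1];
            unsigned long mfn2 = fake_p2m[pfn2];

            return (mfn1 == mfn2) || ((mfn1 + 1) == mfn2);
    }

    int main(void)
    {
            printf("pfn 0,1 -> %d\n", mfn_mergeable(0, 1)); /* 1: 1000,1001 */
            printf("pfn 1,2 -> %d\n", mfn_mergeable(1, 2)); /* 0: 1001,5000 */
            return 0;
    }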
diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
new file mode 100644
index 0000000..dee55ba
--- /dev/null
+++ b/drivers/xen/blkback/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
+
+xen-blkback-y := blkback.o xenbus.o interface.o vbd.o
diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c
new file mode 100644
index 0000000..45f6eb2
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.c
@@ -0,0 +1,109 @@
+#include <linux/module.h>
+#include "blkback-pagemap.h"
+
+static int blkback_pagemap_size;
+static struct blkback_pagemap *blkback_pagemap;
+
+static inline int
+blkback_pagemap_entry_clear(struct blkback_pagemap *map)
+{
+ static struct blkback_pagemap zero;
+ return !memcmp(map, &zero, sizeof(zero));
+}
+
+int
+blkback_pagemap_init(int pages)
+{
+ blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
+ GFP_KERNEL);
+ if (!blkback_pagemap)
+ return -ENOMEM;
+
+ blkback_pagemap_size = pages;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_init);
+
+void
+blkback_pagemap_set(int idx, struct page *page,
+ domid_t domid, busid_t busid, grant_ref_t gref)
+{
+ struct blkback_pagemap *entry;
+
+ BUG_ON(!blkback_pagemap);
+ BUG_ON(idx >= blkback_pagemap_size);
+
+ set_page_private(page, idx);
+
+ entry = blkback_pagemap + idx;
+ if (!blkback_pagemap_entry_clear(entry)) {
+ printk("overwriting pagemap %d: d %u b %u g %u\n",
+ idx, entry->domid, entry->busid, entry->gref);
+ BUG();
+ }
+
+ entry->page = page;
+ entry->domid = domid;
+ entry->busid = busid;
+ entry->gref = gref;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_set);
+
+void
+blkback_pagemap_clear(struct page *page)
+{
+ int idx;
+ struct blkback_pagemap *entry;
+
+ idx = (int)page_private(page);
+
+ BUG_ON(!blkback_pagemap);
+ BUG_ON(idx >= blkback_pagemap_size);
+
+ entry = blkback_pagemap + idx;
+ if (blkback_pagemap_entry_clear(entry)) {
+ printk("clearing empty pagemap %d\n", idx);
+ BUG();
+ }
+
+ memset(entry, 0, sizeof(*entry));
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
+
+struct blkback_pagemap
+blkback_pagemap_read(struct page *page)
+{
+ int idx;
+ struct blkback_pagemap *entry;
+
+ idx = (int)page_private(page);
+
+ BUG_ON(!blkback_pagemap);
+ BUG_ON(idx >= blkback_pagemap_size);
+
+ entry = blkback_pagemap + idx;
+ if (blkback_pagemap_entry_clear(entry)) {
+ printk("reading empty pagemap %d\n", idx);
+ BUG();
+ }
+
+ return *entry;
+}
+EXPORT_SYMBOL(blkback_pagemap_read);
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+int
+blkback_pagemap_contains_page(struct page *page)
+{
+ struct blkback_pagemap *entry;
+ int idx = (int)page_private(page);
+
+ if (idx < 0 || idx >= blkback_pagemap_size)
+ return 0;
+
+ entry = blkback_pagemap + idx;
+
+ return (entry->page == page);
+}
+EXPORT_SYMBOL(blkback_pagemap_contains_page);
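
The intended lifecycle of the map is: initialise once with the total number of
trackable pages, set an entry when a grant is mapped into a page, read it
while the request is in flight, and clear it at unmap time. A hedged usage
sketch, assuming blkback-pagemap.h is included and with error handling elided
(blkback.c below contains the real callers):

    static void pagemap_lifecycle_sketch(struct page *page, int idx,
                                         domid_t domid, busid_t busid,
                                         grant_ref_t gref)
    {
            struct blkback_pagemap entry;

            blkback_pagemap_set(idx, page, domid, busid, gref);

            /* Look the mapping up while the request is outstanding. */
            entry = blkback_pagemap_read(page);
            (void)entry;

            /* Must be cleared before the grant is unmapped and reused. */
            blkback_pagemap_clear(page);
    }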
diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h
new file mode 100644
index 0000000..7f97d15
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.h
@@ -0,0 +1,36 @@
+#ifndef _BLKBACK_PAGEMAP_H_
+#define _BLKBACK_PAGEMAP_H_
+
+#include <linux/mm.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/grant_table.h>
+
+typedef unsigned int busid_t;
+
+struct blkback_pagemap {
+ struct page *page;
+ domid_t domid;
+ busid_t busid;
+ grant_ref_t gref;
+};
+
+#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
+
+int blkback_pagemap_init(int);
+void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
+void blkback_pagemap_clear(struct page *);
+struct blkback_pagemap blkback_pagemap_read(struct page *);
+int blkback_pagemap_contains_page(struct page *page);
+
+#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
+
+static inline int blkback_pagemap_init(int pages) { return 0; }
+static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
+ busid_t bus, grant_ref_t gnt) {}
+static inline void blkback_pagemap_clear(struct page *page) {}
+#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; })
+static inline int blkback_pagemap_contains_page(struct page *page) { return 0; }
+
+#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
+
+#endif
diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
new file mode 100644
index 0000000..0bef445
--- /dev/null
+++ b/drivers/xen/blkback/blkback.c
@@ -0,0 +1,675 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/blkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ *
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static int log_stats;
+static int debug_lvl;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ u64 id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+} pending_req_t;
+
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static struct page **pending_pages;
+static grant_handle_t *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+ unsigned long pfn = page_to_pfn(pending_page(req, seg));
+ return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+#define pending_handle(_req, _seg) \
+ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ struct blkif_request *req,
+ pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+ pending_req_t *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ if (!list_empty(&pending_free)) {
+ req = list_entry(pending_free.next, pending_req_t, free_list);
+ list_del(&req->free_list);
+ }
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ was_empty = list_empty(&pending_free);
+ list_add(&req->free_list, &pending_free);
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ if (was_empty)
+ wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+ if (blkif->plug == NULL)
+ return;
+ if (blkif->plug->unplug_fn)
+ blkif->plug->unplug_fn(blkif->plug);
+ blk_put_queue(blkif->plug);
+ blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q == blkif->plug)
+ return;
+ unplug_queue(blkif);
+ blk_get_queue(q);
+ blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int i, invcount = 0;
+ grant_handle_t handle;
+ int ret;
+
+ for (i = 0; i < req->nr_pages; i++) {
+ handle = pending_handle(req, i);
+ if (handle == BLKBACK_INVALID_HANDLE)
+ continue;
+ blkback_pagemap_clear(pending_page(req, i));
+ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+ GNTMAP_host_map, handle);
+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+ invcount++;
+ }
+
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, invcount);
+ BUG_ON(ret);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+ blkif_t *blkif = arg;
+ struct vbd *vbd = &blkif->vbd;
+
+ blkif_get(blkif);
+
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: started\n", current->comm);
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+ if (unlikely(vbd->size != vbd_size(vbd)))
+ vbd_resize(blkif);
+
+ wait_event_interruptible(
+ blkif->wq,
+ blkif->waiting_reqs || kthread_should_stop());
+ wait_event_interruptible(
+ pending_free_wq,
+ !list_empty(&pending_free) || kthread_should_stop());
+
+ blkif->waiting_reqs = 0;
+ smp_mb(); /* clear flag *before* checking for work */
+
+ if (do_block_io_op(blkif))
+ blkif->waiting_reqs = 1;
+ unplug_queue(blkif);
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ if (log_stats)
+ print_stats(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+ blkif->xenblkd = NULL;
+ blkif_put(blkif);
+
+ return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int error)
+{
+ /* An error fails the entire request. */
+ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+ (error == -EOPNOTSUPP)) {
+ DPRINTK("blkback: write barrier op failed, not supported\n");
+ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+ } else if (error) {
+ DPRINTK("Buffer not up-to-date at end of operation, "
+ "error=%d\n", error);
+ pending_req->status = BLKIF_RSP_ERROR;
+ }
+
+ if (atomic_dec_and_test(&pending_req->pendcnt)) {
+ fast_flush_area(pending_req);
+ make_response(pending_req->blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ blkif_put(pending_req->blkif);
+ free_req(pending_req);
+ }
+}
+
+static void end_block_io_op(struct bio *bio, int error)
+{
+ __end_block_io_op(bio->bi_private, error);
+ bio_put(bio);
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+ blkif->waiting_reqs = 1;
+ wake_up(&blkif->wq);
+}
+
+irqreturn_t blkif_be_int(int irq, void *dev_id)
+{
+ blkif_notify_work(dev_id);
+ return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif)
+{
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ struct blkif_request req;
+ pending_req_t *pending_req;
+ RING_IDX rc, rp;
+ int more_to_do = 0;
+
+ rc = blk_rings->common.req_cons;
+ rp = blk_rings->common.sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ while (rc != rp) {
+
+ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+ break;
+
+ if (kthread_should_stop()) {
+ more_to_do = 1;
+ break;
+ }
+
+ pending_req = alloc_req();
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
+ more_to_do = 1;
+ break;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+ /* Apply all sanity checks to /private copy/ of request. */
+ barrier();
+
+ switch (req.operation) {
+ case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ blkif->st_br_req++;
+ /* fall through */
+ case BLKIF_OP_WRITE:
+ blkif->st_wr_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+ default:
+ /* A good sign something is wrong: sleep for a while to
+ * avoid excessive CPU consumption by a bad guest. */
+ msleep(1);
+ DPRINTK("error: unknown block io operation [%d]\n",
+ req.operation);
+ make_response(blkif, req.id, req.operation,
+ BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ break;
+ }
+
+ /* Yield point for this unbounded loop. */
+ cond_resched();
+ }
+
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+ struct blkif_request *req,
+ pending_req_t *pending_req)
+{
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct phys_req preq;
+ struct {
+ unsigned long buf; unsigned int nsec;
+ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int nseg;
+ struct bio *bio = NULL;
+ int ret, i;
+ int operation;
+
+ switch (req->operation) {
+ case BLKIF_OP_READ:
+ operation = READ;
+ break;
+ case BLKIF_OP_WRITE:
+ operation = WRITE;
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ operation = WRITE_BARRIER;
+ break;
+ default:
+ operation = 0; /* make gcc happy */
+ BUG();
+ }
+
+ /* Check that number of segments is sane. */
+ nseg = req->nr_segments;
+ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ DPRINTK("Bad number of segments in request (%d)\n", nseg);
+ goto fail_response;
+ }
+
+ preq.dev = req->handle;
+ preq.sector_number = req->sector_number;
+ preq.nr_sects = 0;
+
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = req->operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+
+ for (i = 0; i < nseg; i++) {
+ uint32_t flags;
+
+ seg[i].nsec = req->seg[i].last_sect -
+ req->seg[i].first_sect + 1;
+
+ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (req->seg[i].last_sect < req->seg[i].first_sect))
+ goto fail_response;
+ preq.nr_sects += seg[i].nsec;
+
+ flags = GNTMAP_host_map;
+ if (operation != READ)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+ req->seg[i].gref, blkif->domid);
+ }
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+ BUG_ON(ret);
+
+ for (i = 0; i < nseg; i++) {
+ if (unlikely(map[i].status != 0)) {
+ DPRINTK("invalid buffer -- could not remap it\n");
+ map[i].handle = BLKBACK_INVALID_HANDLE;
+ ret |= 1;
+ continue;
+ }
+
+ set_phys_to_machine(
+ page_to_pfn(pending_page(pending_req, i)),
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+ seg[i].buf = map[i].dev_bus_addr |
+ (req->seg[i].first_sect << 9);
+ blkback_pagemap_set(vaddr_pagenr(pending_req, i),
+ pending_page(pending_req, i),
+ blkif->domid, req->handle,
+ req->seg[i].gref);
+ pending_handle(pending_req, i) = map[i].handle;
+ }
+
+ if (ret)
+ goto fail_flush;
+
+ if (vbd_translate(&preq, blkif, operation) != 0) {
+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+ operation == READ ? "read" : "write",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects, preq.dev);
+ goto fail_flush;
+ }
+
+ plug_queue(blkif, preq.bdev);
+ atomic_set(&pending_req->pendcnt, 1);
+ blkif_get(blkif);
+
+ for (i = 0; i < nseg; i++) {
+ if (((int)preq.sector_number|(int)seg[i].nsec) &
+ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+ DPRINTK("Misaligned I/O request from domain %d",
+ blkif->domid);
+ goto fail_put_bio;
+ }
+
+ while ((bio == NULL) ||
+ (bio_add_page(bio,
+ pending_page(pending_req, i),
+ seg[i].nsec << 9,
+ seg[i].buf & ~PAGE_MASK) == 0)) {
+ if (bio) {
+ atomic_inc(&pending_req->pendcnt);
+ submit_bio(operation, bio);
+ }
+
+ bio = bio_alloc(GFP_KERNEL, nseg-i);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_sector = preq.sector_number;
+ }
+
+ preq.sector_number += seg[i].nsec;
+ }
+
+ if (!bio) {
+ BUG_ON(operation != WRITE_BARRIER);
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_sector = -1;
+ }
+
+ submit_bio(operation, bio);
+
+ if (operation == READ)
+ blkif->st_rd_sect += preq.nr_sects;
+ else if (operation == WRITE || operation == WRITE_BARRIER)
+ blkif->st_wr_sect += preq.nr_sects;
+
+ return;
+
+ fail_flush:
+ fast_flush_area(pending_req);
+ fail_response:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ msleep(1); /* back off a bit */
+ return;
+
+ fail_put_bio:
+ __end_block_io_op(pending_req, -EINVAL);
+ if (bio)
+ bio_put(bio);
+ unplug_queue(blkif);
+ msleep(1); /* back off a bit */
+ return;
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st)
+{
+ struct blkif_response resp;
+ unsigned long flags;
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ int more_to_do = 0;
+ int notify;
+
+ resp.id = id;
+ resp.operation = op;
+ resp.status = st;
+
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ /* Place on the response ring for the relevant domain. */
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.rsp_prod_pvt++;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+ /*
+ * Tail check for pending requests. Allows frontend to avoid
+ * notifications if requests are already in flight (lower
+ * overheads and promotes batching).
+ */
+ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+ more_to_do = 1;
+ }
+
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+ if (more_to_do)
+ blkif_notify_work(blkif);
+ if (notify)
+ notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+ int i, mmap_pages;
+ int rc = 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
+ blkif_reqs, GFP_KERNEL);
+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+ mmap_pages, GFP_KERNEL);
+ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
+
+ if (blkback_pagemap_init(mmap_pages)) {
+ rc = -ENOMEM;
+ goto out_of_memory;
+ }
+
+ if (!pending_reqs || !pending_grant_handles || !pending_pages) {
+ rc = -ENOMEM;
+ goto out_of_memory;
+ }
+
+ for (i = 0; i < mmap_pages; i++)
+ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+
+ rc = blkif_interface_init();
+ if (rc)
+ goto failed_init;
+
+ memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
+ INIT_LIST_HEAD(&pending_free);
+
+ for (i = 0; i < blkif_reqs; i++)
+ list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+ rc = blkif_xenbus_init();
+ if (rc)
+ goto failed_init;
+
+ return 0;
+
+ out_of_memory:
+ printk(KERN_ERR "%s: out of memory\n", __func__);
+ failed_init:
+ kfree(pending_reqs);
+ kfree(pending_grant_handles);
+ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+ return rc;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
new file mode 100644
index 0000000..531ba81
--- /dev/null
+++ b/drivers/xen/blkback/common.h
@@ -0,0 +1,143 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include "blkback-pagemap.h"
+
+
+#define DPRINTK(_f, _a...) \
+ pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__, __LINE__, ##_a)
+
+struct vbd {
+ blkif_vdev_t handle; /* what the domain refers to this vbd as */
+ unsigned char readonly; /* Non-zero -> read-only */
+ unsigned char type; /* VDISK_xxx */
+ u32 pdevice; /* phys device that this vbd maps to */
+ struct block_device *bdev;
+ sector_t size; /* Cached size parameter */
+};
+
+struct backend_info;
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned int irq;
+ /* Comms information. */
+ enum blkif_protocol blk_protocol;
+ union blkif_back_rings blk_rings;
+ struct vm_struct *blk_ring_area;
+ /* The VBD attached to this interface. */
+ struct vbd vbd;
+ /* Back pointer to the backend_info. */
+ struct backend_info *be;
+ /* Private fields. */
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+
+ wait_queue_head_t wq;
+ struct task_struct *xenblkd;
+ unsigned int waiting_reqs;
+ struct request_queue *plug;
+
+ /* statistics */
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req;
+ int st_br_req;
+ int st_rd_sect;
+ int st_wr_sect;
+
+ wait_queue_head_t waiting_to_free;
+
+ grant_handle_t shmem_handle;
+ grant_ref_t shmem_ref;
+} blkif_t;
+
+blkif_t *blkif_alloc(domid_t domid);
+void blkif_disconnect(blkif_t *blkif);
+void blkif_free(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+void vbd_resize(blkif_t *blkif);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) \
+ wake_up(&(_b)->waiting_to_free);\
+ } while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
+ unsigned minor, int readonly, int cdrom);
+void vbd_free(struct vbd *vbd);
+
+unsigned long long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+ unsigned short dev;
+ unsigned short nr_sects;
+ struct block_device *bdev;
+ blkif_sector_t sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+int blkif_interface_init(void);
+
+int blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id);
+int blkif_schedule(void *arg);
+
+int blkback_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
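
blkif_get()/blkif_put() pair around every in-flight request; the final put
wakes blkif_disconnect(), which waits on waiting_to_free before tearing the
interface down. A deliberately simplified, single-threaded model of the
pattern (the kernel code uses atomic_t and a waitqueue):

    #include <stdio.h>

    static int refcnt = 1;  /* creator's reference, as in blkif_alloc() */

    static void get(void) { refcnt++; }

    static void put(void)
    {
            if (--refcnt == 0)
                    printf("last reference gone: wake waiting_to_free\n");
    }

    int main(void)
    {
            get();  /* request dispatched: blkif_get() */
            put();  /* request completed: blkif_put() */
            put();  /* disconnect drops the creator's reference */
            return 0;
    }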
diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
new file mode 100644
index 0000000..e397a41
--- /dev/null
+++ b/drivers/xen/blkback/interface.c
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <linux/kthread.h>
+
+static struct kmem_cache *blkif_cachep;
+
+blkif_t *blkif_alloc(domid_t domid)
+{
+ blkif_t *blkif;
+
+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+ if (!blkif)
+ return ERR_PTR(-ENOMEM);
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 1);
+ init_waitqueue_head(&blkif->wq);
+ blkif->st_print = jiffies;
+ init_waitqueue_head(&blkif->waiting_to_free);
+
+ return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, shared_page, blkif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return op.status;
+ }
+
+ blkif->shmem_ref = shared_page;
+ blkif->shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, blkif->shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+ int err;
+
+ /* Already connected through? */
+ if (blkif->irq)
+ return 0;
+
+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
+ return -ENOMEM;
+
+ err = map_frontend_page(blkif, shared_page);
+ if (err) {
+ free_vm_area(blkif->blk_ring_area);
+ return err;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ struct blkif_sring *sring;
+ sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ struct blkif_x86_32_sring *sring_x86_32;
+ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ struct blkif_x86_64_sring *sring_x86_64;
+ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+ if (err < 0)
+ {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ return err;
+ }
+ blkif->irq = err;
+
+ return 0;
+}
+
+void blkif_disconnect(blkif_t *blkif)
+{
+ if (blkif->xenblkd) {
+ kthread_stop(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ }
+
+ atomic_dec(&blkif->refcnt);
+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+ atomic_inc(&blkif->refcnt);
+
+ if (blkif->irq) {
+ unbind_from_irqhandler(blkif->irq, blkif);
+ blkif->irq = 0;
+ }
+
+ if (blkif->blk_rings.common.sring) {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ }
+}
+
+void blkif_free(blkif_t *blkif)
+{
+ if (!atomic_dec_and_test(&blkif->refcnt))
+ BUG();
+ kmem_cache_free(blkif_cachep, blkif);
+}
+
+int __init blkif_interface_init(void)
+{
+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+ 0, 0, NULL);
+ if (!blkif_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
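
blkif_map() assumes blk_protocol has already been negotiated from the
frontend's xenstore "protocol" key (see connect_ring() in xenbus.c below). A
hedged sketch of the connect-time ordering, with error handling trimmed:

    /* Sketch only: ring_ref and evtchn come from the frontend's entries. */
    static int connect_sketch(blkif_t *blkif, unsigned long ring_ref,
                              unsigned int evtchn)
    {
            /* 1. blkif->blk_protocol set from xenstore ("protocol" key). */
            /* 2. Map the shared ring and bind the event channel. */
            int err = blkif_map(blkif, ring_ref, evtchn);

            if (err)
                    return err;

            /* 3. From here, blkif_be_int() wakes the blkif_schedule() thread. */
            return 0;
    }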
diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
new file mode 100644
index 0000000..6e0abcc
--- /dev/null
+++ b/drivers/xen/blkback/vbd.c
@@ -0,0 +1,167 @@
+/******************************************************************************
+ * blkback/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
+ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+ return vbd_sz(vbd);
+}
+
+unsigned int vbd_info(struct vbd *vbd)
+{
+ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+ return bdev_logical_block_size(vbd->bdev);
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+ unsigned minor, int readonly, int cdrom)
+{
+ struct vbd *vbd;
+ struct block_device *bdev;
+
+ vbd = &blkif->vbd;
+ vbd->handle = handle;
+ vbd->readonly = readonly;
+ vbd->type = 0;
+
+ vbd->pdevice = MKDEV(major, minor);
+
+ bdev = open_by_devnum(vbd->pdevice,
+ vbd->readonly ? FMODE_READ : FMODE_WRITE);
+
+ if (IS_ERR(bdev)) {
+ DPRINTK("vbd_creat: device %08x could not be opened.\n",
+ vbd->pdevice);
+ return -ENOENT;
+ }
+
+ vbd->bdev = bdev;
+ vbd->size = vbd_size(vbd);
+
+ if (vbd->bdev->bd_disk == NULL) {
+ DPRINTK("vbd_creat: device %08x doesn't exist.\n",
+ vbd->pdevice);
+ vbd_free(vbd);
+ return -ENOENT;
+ }
+
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+ vbd->type |= VDISK_CDROM;
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+ vbd->type |= VDISK_REMOVABLE;
+
+ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+ handle, blkif->domid);
+ return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+ if (vbd->bdev)
+ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
+ vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+ struct vbd *vbd = &blkif->vbd;
+ int rc = -EACCES;
+
+ if ((operation != READ) && vbd->readonly)
+ goto out;
+
+ if (likely(req->nr_sects)) {
+ blkif_sector_t end = req->sector_number + req->nr_sects;
+
+ if (unlikely(end < req->sector_number))
+ goto out;
+ if (unlikely(end > vbd_sz(vbd)))
+ goto out;
+ }
+
+ req->dev = vbd->pdevice;
+ req->bdev = vbd->bdev;
+ rc = 0;
+
+ out:
+ return rc;
+}
+
+void vbd_resize(blkif_t *blkif)
+{
+ struct vbd *vbd = &blkif->vbd;
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = blkback_xenbus(blkif->be);
+ unsigned long long new_size = vbd_size(vbd);
+
+ printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
+ vbd->size = new_size;
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ printk(KERN_WARNING "Error starting transaction");
+ return;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
+ vbd_size(vbd));
+ if (err) {
+ printk(KERN_WARNING "Error writing new size");
+ goto abort;
+ }
+ /*
+ * Write the current state; we will use this to synchronize
+ * the front-end. If the current state is "connected" the
+ * front-end will get the new size information online.
+ */
+ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+ if (err) {
+ printk(KERN_WARNING "Error writing the state");
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ printk(KERN_WARNING "Error ending transaction\n");
+ return;
+abort:
+ xenbus_transaction_end(xbt, 1);
+}
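
vbd_translate() refuses writes to read-only VBDs and any sector range that
wraps or runs past the device end. A worked example of the bounds test with
illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long vbd_sectors = 2097152; /* 1 GiB of 512 B sectors */
            unsigned long long sector = 2097000, nr_sects = 200;
            unsigned long long end = sector + nr_sects;

            /* Same order of checks as vbd_translate(): wrap, then overrun. */
            if (end < sector || end > vbd_sectors)
                    printf("rejected: [%llu,%llu) exceeds %llu\n",
                           sector, end, vbd_sectors);
            else
                    printf("ok\n");
            return 0;
    }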
diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
new file mode 100644
index 0000000..a0534fc
--- /dev/null
+++ b/drivers/xen/blkback/xenbus.c
@@ -0,0 +1,553 @@
+/* Xenbus code for blkif backend
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ Copyright (C) 2005 XenSource Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include "common.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
+
+struct backend_info {
+ struct xenbus_device *dev;
+ blkif_t *blkif;
+ struct xenbus_watch backend_watch;
+ unsigned major;
+ unsigned minor;
+ char *mode;
+};
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+ unsigned int);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be)
+{
+ return be->dev;
+}
+
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+ char *devpath, *devname;
+ struct xenbus_device *dev = blkif->be->dev;
+
+ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+ if (IS_ERR(devpath))
+ return PTR_ERR(devpath);
+
+ if ((devname = strstr(devpath, "/dev/")) != NULL)
+ devname += strlen("/dev/");
+ else
+ devname = devpath;
+
+ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+ kfree(devpath);
+
+ return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{
+ int err;
+ char name[TASK_COMM_LEN];
+
+ /* Not ready to connect? */
+ if (!blkif->irq || !blkif->vbd.bdev)
+ return;
+
+ /* Already connected? */
+ if (blkif->be->dev->state == XenbusStateConnected)
+ return;
+
+ /* Attempt to connect: exit if we fail to. */
+ connect(blkif->be);
+ if (blkif->be->dev->state != XenbusStateConnected)
+ return;
+
+ err = blkback_name(blkif, name);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+ return;
+ }
+
+ err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "block flush");
+ return;
+ }
+ invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+ if (IS_ERR(blkif->xenblkd)) {
+ err = PTR_ERR(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+ }
+}
+
+
+/****************************************************************
+ * sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *_dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct xenbus_device *dev = to_xenbus_device(_dev); \
+ struct backend_info *be = dev_get_drvdata(&dev->dev); \
+ \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *vbdstat_attrs[] = {
+ &dev_attr_oo_req.attr,
+ &dev_attr_rd_req.attr,
+ &dev_attr_wr_req.attr,
+ &dev_attr_br_req.attr,
+ &dev_attr_rd_sect.attr,
+ &dev_attr_wr_sect.attr,
+ NULL
+};
+
+static struct attribute_group vbdstat_group = {
+ .name = "statistics",
+ .attrs = vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+ int error;
+
+ error = device_create_file(&dev->dev, &dev_attr_physical_device);
+ if (error)
+ goto fail1;
+
+ error = device_create_file(&dev->dev, &dev_attr_mode);
+ if (error)
+ goto fail2;
+
+ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
+ if (error)
+ goto fail3;
+
+ return 0;
+
+fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+fail2: device_remove_file(&dev->dev, &dev_attr_mode);
+fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
+ return error;
+}
+
+void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+ device_remove_file(&dev->dev, &dev_attr_mode);
+ device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ DPRINTK("");
+
+ if (be->major || be->minor)
+ xenvbd_sysfs_delif(dev);
+
+ if (be->backend_watch.node) {
+ unregister_xenbus_watch(&be->backend_watch);
+ kfree(be->backend_watch.node);
+ be->backend_watch.node = NULL;
+ }
+
+ if (be->blkif) {
+ blkif_disconnect(be->blkif);
+ vbd_free(&be->blkif->vbd);
+ blkif_free(be->blkif);
+ be->blkif = NULL;
+ }
+
+ kfree(be);
+ dev_set_drvdata(&dev->dev, NULL);
+ return 0;
+}
+
+int blkback_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+ "%d", state);
+ if (err)
+ xenbus_dev_fatal(dev, err, "writing feature-barrier");
+
+ return err;
+}
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers. Switch to InitWait.
+ */
+static int blkback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+ be->dev = dev;
+ dev_set_drvdata(&dev->dev, be);
+
+ be->blkif = blkif_alloc(dev->otherend_id);
+ if (IS_ERR(be->blkif)) {
+ err = PTR_ERR(be->blkif);
+ be->blkif = NULL;
+ xenbus_dev_fatal(dev, err, "creating block interface");
+ goto fail;
+ }
+
+ /* setup back pointer */
+ be->blkif->be = be;
+
+ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
+ "%s/%s", dev->nodename, "physical-device");
+ if (err)
+ goto fail;
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ DPRINTK("failed");
+ blkback_remove(dev);
+ return err;
+}
+
+
+/**
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node. Read it and the mode node, and create a vbd. If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ unsigned major;
+ unsigned minor;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+ int cdrom = 0;
+ char *device_type;
+
+ DPRINTK("");
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+ &major, &minor);
+ if (XENBUS_EXIST_ERR(err)) {
+ /* Since this watch will fire once immediately after it is
+ registered, we expect this. Ignore it, and wait for the
+ hotplug scripts. */
+ return;
+ }
+ if (err != 2) {
+ xenbus_dev_fatal(dev, err, "reading physical-device");
+ return;
+ }
+
+ if ((be->major || be->minor) &&
+ ((be->major != major) || (be->minor != minor))) {
+ printk(KERN_WARNING
+ "blkback: changing physical device (from %x:%x to "
+ "%x:%x) not supported.\n", be->major, be->minor,
+ major, minor);
+ return;
+ }
+
+ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+ if (IS_ERR(be->mode)) {
+ err = PTR_ERR(be->mode);
+ be->mode = NULL;
+ xenbus_dev_fatal(dev, err, "reading mode");
+ return;
+ }
+
+ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+ if (!IS_ERR(device_type)) {
+ cdrom = strcmp(device_type, "cdrom") == 0;
+ kfree(device_type);
+ }
+
+ if (be->major == 0 && be->minor == 0) {
+ /* Front end dir is a number, which is used as the handle. */
+
+ char *p = strrchr(dev->otherend, '/') + 1;
+ long handle = simple_strtoul(p, NULL, 0);
+
+ be->major = major;
+ be->minor = minor;
+
+ err = vbd_create(be->blkif, handle, major, minor,
+ (NULL == strchr(be->mode, 'w')), cdrom);
+ if (err) {
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating vbd structure");
+ return;
+ }
+
+ err = xenvbd_sysfs_addif(dev);
+ if (err) {
+ vbd_free(&be->blkif->vbd);
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating sysfs entries");
+ return;
+ }
+
+ /* We're potentially connected now */
+ update_blkif_status(be->blkif);
+ }
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+ int err;
+
+ DPRINTK("%s", xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __func__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ /* Ensure we connect even when two watches fire in
+		   close succession and we miss the intermediate value
+ of frontend_state. */
+ if (dev->state == XenbusStateConnected)
+ break;
+
+ err = connect_ring(be);
+ if (err)
+ break;
+ update_blkif_status(be->blkif);
+ break;
+
+ case XenbusStateClosing:
+ blkif_disconnect(be->blkif);
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ DPRINTK("%s", dev->otherend);
+
+ /* Supply the information about the device the frontend needs */
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ return;
+ }
+
+ err = blkback_barrier(xbt, be, 1);
+ if (err)
+ goto abort;
+
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+ vbd_size(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sectors",
+ dev->nodename);
+ goto abort;
+ }
+
+ /* FIXME: use a typename instead */
+ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+ vbd_info(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/info",
+ dev->nodename);
+ goto abort;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+ vbd_secsize(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+ dev->nodename);
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
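+	/* A conflicting xenstore update ends the transaction with -EAGAIN;
+	 * retry it from the top. */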
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ xenbus_dev_fatal(dev, err, "ending transaction");
+
+ err = xenbus_switch_state(dev, XenbusStateConnected);
+ if (err)
+		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+				 dev->nodename);
+
+ return;
+ abort:
+ xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ char protocol[64] = "";
+ int err;
+
+ DPRINTK("%s", dev->otherend);
+
+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
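+	/* Negotiate the ring ABI: a 32- or 64-bit frontend may lay out
+	 * requests differently from the backend's native format. */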
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+ "%63s", protocol, NULL);
+ if (err)
+ strcpy(protocol, "unspecified, assuming native");
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+ else {
+ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+ return -1;
+ }
+ printk(KERN_INFO
+ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
+ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+ /* Map the shared frame, irq etc. */
+ err = blkif_map(be->blkif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+ ring_ref, evtchn);
+ return err;
+ }
+
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkback_ids[] = {
+ { "vbd" },
+ { "" }
+};
+
+
+static struct xenbus_driver blkback = {
+ .name = "vbd",
+ .owner = THIS_MODULE,
+ .ids = blkback_ids,
+ .probe = blkback_probe,
+ .remove = blkback_remove,
+ .otherend_changed = frontend_changed
+};
+
+
+int blkif_xenbus_init(void)
+{
+ return xenbus_register_backend(&blkback);
+}
diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
new file mode 100644
index 0000000..822b4e4
--- /dev/null
+++ b/drivers/xen/blktap/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+
+blktap-objs := control.o ring.o device.o request.o sysfs.o
diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
new file mode 100644
index 0000000..fe63fc9
--- /dev/null
+++ b/drivers/xen/blktap/blktap.h
@@ -0,0 +1,209 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+
+extern int blktap_debug_level;
+extern int blktap_ring_major;
+extern int blktap_device_major;
+
+#define BTPRINTK(level, tag, force, _f, _a...) \
+ do { \
+ if (blktap_debug_level > level && \
+ (force || printk_ratelimit())) \
+ printk(tag "%s: " _f, __func__, ##_a); \
+ } while (0)
+
+#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define MAX_BLKTAP_DEVICE 1024
+
+#define BLKTAP_DEVICE 4
+#define BLKTAP_DEVICE_CLOSED 5
+#define BLKTAP_SHUTDOWN_REQUESTED 8
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE 1
+#define BLKTAP2_IOCTL_ALLOC_TAP 200
+#define BLKTAP2_IOCTL_FREE_TAP 201
+#define BLKTAP2_IOCTL_CREATE_DEVICE 202
+#define BLKTAP2_IOCTL_REMOVE_DEVICE 207
+
+#define BLKTAP2_MAX_MESSAGE_LEN 256
+
+#define BLKTAP2_RING_MESSAGE_CLOSE 3
+
+#define BLKTAP_REQUEST_FREE 0
+#define BLKTAP_REQUEST_PENDING 1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM BLK_RING_SIZE
+#define MAX_PENDING_REQS BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
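+/* Userspace address of segment _seg of request _req within the tapdisk
+ * mmap area starting at _start. */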
+#define MMAP_VADDR(_start, _req, _seg) \
+ (_start + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
+
+struct grant_handle_pair {
+ grant_handle_t kernel;
+ grant_handle_t user;
+};
+#define INVALID_GRANT_HANDLE 0xFFFF
+
+struct blktap_handle {
+ unsigned int ring;
+ unsigned int device;
+ unsigned int minor;
+};
+
+struct blktap_params {
+ char name[BLKTAP2_MAX_MESSAGE_LEN];
+ unsigned long long capacity;
+ unsigned long sector_size;
+};
+
+struct blktap_device {
+ spinlock_t lock;
+ struct gendisk *gd;
+};
+
+struct blktap_ring {
+ struct task_struct *task;
+
+ struct vm_area_struct *vma;
+ struct blkif_front_ring ring;
+ unsigned long ring_vstart;
+ unsigned long user_vstart;
+
+ int n_pending;
+ struct blktap_request *pending[MAX_PENDING_REQS];
+
+ wait_queue_head_t poll_wait;
+
+ dev_t devno;
+ struct device *dev;
+};
+
+struct blktap_statistics {
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req;
+ int st_rd_sect;
+ int st_wr_sect;
+ s64 st_rd_cnt;
+ s64 st_rd_sum_usecs;
+ s64 st_rd_max_usecs;
+ s64 st_wr_cnt;
+ s64 st_wr_sum_usecs;
+ s64 st_wr_max_usecs;
+};
+
+struct blktap_request {
+ struct blktap *tap;
+ struct request *rq;
+ int usr_idx;
+
+ int operation;
+ struct timeval time;
+
+ struct scatterlist sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ int nr_pages;
+};
+
+#define blktap_for_each_sg(_sg, _req, _i) \
+ for (_sg = (_req)->sg_table, _i = 0; \
+ _i < (_req)->nr_pages; \
+ (_sg)++, (_i)++)
+
+struct blktap {
+ int minor;
+ unsigned long dev_inuse;
+
+ struct blktap_ring ring;
+ struct blktap_device device;
+ struct blktap_page_pool *pool;
+
+ wait_queue_head_t remove_wait;
+ struct work_struct remove_work;
+ char name[BLKTAP2_MAX_MESSAGE_LEN];
+
+ struct blktap_statistics stats;
+};
+
+struct blktap_page_pool {
+ struct mempool_s *bufs;
+ spinlock_t lock;
+ struct kobject kobj;
+ wait_queue_head_t wait;
+};
+
+extern struct mutex blktap_lock;
+extern struct blktap **blktaps;
+extern int blktap_max_minor;
+
+int blktap_control_destroy_tap(struct blktap *);
+size_t blktap_control_debug(struct blktap *, char *, size_t);
+
+int blktap_ring_init(void);
+void blktap_ring_exit(void);
+size_t blktap_ring_debug(struct blktap *, char *, size_t);
+int blktap_ring_create(struct blktap *);
+int blktap_ring_destroy(struct blktap *);
+struct blktap_request *blktap_ring_make_request(struct blktap *);
+void blktap_ring_free_request(struct blktap *, struct blktap_request *);
+void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
+int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int);
+int blktap_ring_map_request(struct blktap *, struct blktap_request *);
+void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
+void blktap_ring_set_message(struct blktap *, int);
+void blktap_ring_kick_user(struct blktap *);
+
+int blktap_sysfs_init(void);
+void blktap_sysfs_exit(void);
+int blktap_sysfs_create(struct blktap *);
+void blktap_sysfs_destroy(struct blktap *);
+
+int blktap_device_init(void);
+void blktap_device_exit(void);
+size_t blktap_device_debug(struct blktap *, char *, size_t);
+int blktap_device_create(struct blktap *, struct blktap_params *);
+int blktap_device_destroy(struct blktap *);
+void blktap_device_destroy_sync(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
+void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
+
+int blktap_page_pool_init(struct kobject *);
+void blktap_page_pool_exit(void);
+struct blktap_page_pool *blktap_page_pool_get(const char *);
+
+size_t blktap_request_debug(struct blktap *, char *, size_t);
+struct blktap_request *blktap_request_alloc(struct blktap *);
+int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
+
+
+#endif
diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
new file mode 100644
index 0000000..f339bba
--- /dev/null
+++ b/drivers/xen/blktap/control.c
@@ -0,0 +1,315 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+DEFINE_MUTEX(blktap_lock);
+
+struct blktap **blktaps;
+int blktap_max_minor;
+static struct blktap_page_pool *default_pool;
+
+static struct blktap *
+blktap_control_get_minor(void)
+{
+ int minor;
+ struct blktap *tap;
+
+ tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+ if (unlikely(!tap))
+ return NULL;
+
+ mutex_lock(&blktap_lock);
+
+ for (minor = 0; minor < blktap_max_minor; minor++)
+ if (!blktaps[minor])
+ break;
+
+ if (minor == MAX_BLKTAP_DEVICE)
+ goto fail;
+
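+	/* No free slot: double the minor table, capped at MAX_BLKTAP_DEVICE,
+	 * and zero the newly added tail. */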
+ if (minor == blktap_max_minor) {
+ void *p;
+ int n;
+
+ n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE);
+ p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
+ if (!p)
+ goto fail;
+
+ blktaps = p;
+ minor = blktap_max_minor;
+ blktap_max_minor = n;
+
+ memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
+ }
+
+ tap->minor = minor;
+ blktaps[minor] = tap;
+
+ __module_get(THIS_MODULE);
+out:
+ mutex_unlock(&blktap_lock);
+ return tap;
+
+fail:
+ mutex_unlock(&blktap_lock);
+ kfree(tap);
+ tap = NULL;
+ goto out;
+}
+
+static void
+blktap_control_put_minor(struct blktap* tap)
+{
+ blktaps[tap->minor] = NULL;
+ kfree(tap);
+
+ module_put(THIS_MODULE);
+}
+
+static struct blktap*
+blktap_control_create_tap(void)
+{
+ struct blktap *tap;
+ int err;
+
+ tap = blktap_control_get_minor();
+ if (!tap)
+ return NULL;
+
+ kobject_get(&default_pool->kobj);
+ tap->pool = default_pool;
+
+ err = blktap_ring_create(tap);
+ if (err)
+ goto fail_tap;
+
+ err = blktap_sysfs_create(tap);
+ if (err)
+ goto fail_ring;
+
+ return tap;
+
+fail_ring:
+ blktap_ring_destroy(tap);
+fail_tap:
+ blktap_control_put_minor(tap);
+
+ return NULL;
+}
+
+int
+blktap_control_destroy_tap(struct blktap *tap)
+{
+ int err;
+
+ err = blktap_ring_destroy(tap);
+ if (err)
+ return err;
+
+ kobject_put(&tap->pool->kobj);
+
+ blktap_sysfs_destroy(tap);
+
+ blktap_control_put_minor(tap);
+
+ return 0;
+}
+
+static int
+blktap_control_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct blktap *tap;
+
+ switch (cmd) {
+ case BLKTAP2_IOCTL_ALLOC_TAP: {
+ struct blktap_handle h;
+ void __user *ptr = (void __user*)arg;
+
+ tap = blktap_control_create_tap();
+ if (!tap)
+ return -ENOMEM;
+
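+		/* Hand userspace the ring/device majors plus the shared minor
+		 * so it can locate both device nodes. */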
+ h.ring = blktap_ring_major;
+ h.device = blktap_device_major;
+ h.minor = tap->minor;
+
+ if (copy_to_user(ptr, &h, sizeof(h))) {
+ blktap_control_destroy_tap(tap);
+ return -EFAULT;
+ }
+
+ return 0;
+ }
+
+ case BLKTAP2_IOCTL_FREE_TAP: {
+		int minor = arg;
+
+		if (minor < 0 || minor >= blktap_max_minor)
+			return -EINVAL;
+
+ tap = blktaps[minor];
+ if (!tap)
+ return -ENODEV;
+
+ return blktap_control_destroy_tap(tap);
+ }
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static struct file_operations blktap_control_file_operations = {
+ .owner = THIS_MODULE,
+ .ioctl = blktap_control_ioctl,
+};
+
+static struct miscdevice blktap_control = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "blktap-control",
+ .fops = &blktap_control_file_operations,
+};
+
+static struct device *control_device;
+
+static ssize_t
+blktap_control_show_default_pool(struct device *device,
+ struct device_attribute *attr,
+ char *buf)
+{
+	return sprintf(buf, "%s\n", kobject_name(&default_pool->kobj));
+}
+
+static ssize_t
+blktap_control_store_default_pool(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct blktap_page_pool *pool, *tmp = default_pool;
+
+ pool = blktap_page_pool_get(buf);
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+
+ default_pool = pool;
+ kobject_put(&tmp->kobj);
+
+ return size;
+}
+
+static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+ blktap_control_show_default_pool,
+ blktap_control_store_default_pool);
+
+size_t
+blktap_control_debug(struct blktap *tap, char *buf, size_t size)
+{
+ char *s = buf, *end = buf + size;
+
+ s += snprintf(s, end - s,
+ "tap %u:%u name:'%s' flags:%#08lx\n",
+ MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
+ tap->name, tap->dev_inuse);
+
+ return s - buf;
+}
+
+static int __init
+blktap_control_init(void)
+{
+ int err;
+
+ err = misc_register(&blktap_control);
+ if (err)
+ return err;
+
+ control_device = blktap_control.this_device;
+
+ blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
+ blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
+ if (!blktaps) {
+ BTERR("failed to allocate blktap minor map");
+ return -ENOMEM;
+ }
+
+ err = blktap_page_pool_init(&control_device->kobj);
+ if (err)
+ return err;
+
+	default_pool = blktap_page_pool_get("default");
+	if (IS_ERR(default_pool)) {
+		err = PTR_ERR(default_pool);
+		default_pool = NULL;
+		return err;
+	}
+
+ err = device_create_file(control_device, &dev_attr_default_pool);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static void
+blktap_control_exit(void)
+{
+ if (default_pool) {
+ kobject_put(&default_pool->kobj);
+ default_pool = NULL;
+ }
+
+ blktap_page_pool_exit();
+
+ if (blktaps) {
+ kfree(blktaps);
+ blktaps = NULL;
+ }
+
+ if (control_device) {
+ misc_deregister(&blktap_control);
+ control_device = NULL;
+ }
+}
+
+static void
+blktap_exit(void)
+{
+ blktap_control_exit();
+ blktap_ring_exit();
+ blktap_sysfs_exit();
+ blktap_device_exit();
+}
+
+static int __init
+blktap_init(void)
+{
+ int err;
+
+ err = blktap_device_init();
+ if (err)
+ goto fail;
+
+ err = blktap_ring_init();
+ if (err)
+ goto fail;
+
+ err = blktap_sysfs_init();
+ if (err)
+ goto fail;
+
+ err = blktap_control_init();
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ blktap_exit();
+ return err;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
new file mode 100644
index 0000000..8ff276a
--- /dev/null
+++ b/drivers/xen/blktap/device.c
@@ -0,0 +1,566 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include "blktap.h"
+
+int blktap_device_major;
+
+#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
+
+static int
+blktap_device_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct blktap_device *tapdev = disk->private_data;
+
+ if (!tapdev)
+ return -ENXIO;
+
+	/* NB. we may have bounced a bd_mutex trylock taken by tapdisk.
+	 * When failing for any reason other than !tapdev, make sure to
+	 * kick tapdisk out of its destroy-wait state again. */
+
+ return 0;
+}
+
+static int
+blktap_device_release(struct gendisk *disk, fmode_t mode)
+{
+ struct blktap_device *tapdev = disk->private_data;
+ struct block_device *bdev = bdget_disk(disk, 0);
+ struct blktap *tap = dev_to_blktap(tapdev);
+
+	/* Check openers before dropping the bdget_disk() reference. */
+	if (!bdev->bd_openers) {
+		set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
+		blktap_ring_kick_user(tap);
+	}
+
+	bdput(bdev);
+
+ return 0;
+}
+
+static int
+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+ /* We don't have real geometry info, but let's at least return
+ values consistent with the size of the device */
+ sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t cylinders = nsect;
+
+ hg->heads = 0xff;
+ hg->sectors = 0x3f;
+ sector_div(cylinders, hg->heads * hg->sectors);
+ hg->cylinders = cylinders;
+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+ hg->cylinders = 0xffff;
+ return 0;
+}
+
+static int
+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
+ unsigned command, unsigned long argument)
+{
+ int i;
+
+ switch (command) {
+ case CDROMMULTISESSION:
+ BTDBG("FIXME: support multisession CDs later\n");
+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+ if (put_user(0, (char __user *)(argument + i)))
+ return -EFAULT;
+ return 0;
+
+ case SCSI_IOCTL_GET_IDLUN:
+ if (!access_ok(VERIFY_WRITE, argument,
+ sizeof(struct scsi_idlun)))
+ return -EFAULT;
+
+ /* return 0 for now. */
+ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
+ __put_user(0,
+ &((struct scsi_idlun __user *)argument)->host_unique_id);
+ return 0;
+
+ default:
+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+ command);*/
+ return -EINVAL; /* same return as native Linux */
+ }
+
+ return 0;
+}
+
+static struct block_device_operations blktap_device_file_operations = {
+ .owner = THIS_MODULE,
+ .open = blktap_device_open,
+ .release = blktap_device_release,
+ .ioctl = blktap_device_ioctl,
+ .getgeo = blktap_device_getgeo
+};
+
+/* NB. __blktap_* helpers are called with the queue lock held; the
+ * blktap_* variants are called unlocked. */
+
+static inline struct request*
+__blktap_next_queued_rq(struct request_queue *q)
+{
+ return blk_peek_request(q);
+}
+
+static inline void
+__blktap_dequeue_rq(struct request *rq)
+{
+ blk_start_request(rq);
+}
+
+/* NB. err == 0 indicates success, failures < 0 */
+
+static inline void
+__blktap_end_queued_rq(struct request *rq, int err)
+{
+ blk_start_request(rq);
+ __blk_end_request(rq, err, blk_rq_bytes(rq));
+}
+
+static inline void
+__blktap_end_rq(struct request *rq, int err)
+{
+ __blk_end_request(rq, err, blk_rq_bytes(rq));
+}
+
+static inline void
+blktap_end_rq(struct request *rq, int err)
+{
+ struct request_queue *q = rq->q;
+
+ spin_lock_irq(q->queue_lock);
+ __blktap_end_rq(rq, err);
+ spin_unlock_irq(q->queue_lock);
+}
+
+void
+blktap_device_end_request(struct blktap *tap,
+ struct blktap_request *request,
+ int error)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct request *rq = request->rq;
+
+ blktap_ring_unmap_request(tap, request);
+
+ blktap_ring_free_request(tap, request);
+
+ dev_dbg(disk_to_dev(tapdev->gd),
+ "end_request: op=%d error=%d bytes=%d\n",
+ rq_data_dir(rq), error, blk_rq_bytes(rq));
+
+ blktap_end_rq(rq, error);
+}
+
+int
+blktap_device_make_request(struct blktap *tap, struct request *rq)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct blktap_request *request;
+ int write, nsegs;
+ int err;
+
+ request = blktap_ring_make_request(tap);
+ if (IS_ERR(request)) {
+ err = PTR_ERR(request);
+ request = NULL;
+
+ if (err == -ENOSPC || err == -ENOMEM)
+ goto stop;
+
+ goto fail;
+ }
+
+ write = rq_data_dir(rq) == WRITE;
+ nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);
+
+ dev_dbg(disk_to_dev(tapdev->gd),
+ "make_request: op=%c bytes=%d nsegs=%d\n",
+ write ? 'w' : 'r', blk_rq_bytes(rq), nsegs);
+
+ request->rq = rq;
+ request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
+
+ err = blktap_request_get_pages(tap, request, nsegs);
+ if (err)
+ goto stop;
+
+ err = blktap_ring_map_request(tap, request);
+ if (err)
+ goto fail;
+
+ blktap_ring_submit_request(tap, request);
+
+ return 0;
+
+stop:
+ tap->stats.st_oo_req++;
+ err = -EBUSY;
+
+_out:
+ if (request)
+ blktap_ring_free_request(tap, request);
+
+ return err;
+fail:
+ if (printk_ratelimit())
+ dev_warn(disk_to_dev(tapdev->gd),
+ "make request: %d, failing\n", err);
+ goto _out;
+}
+
+/*
+ * called from tapdisk context
+ */
+void
+blktap_device_run_queue(struct blktap *tap)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct request_queue *q;
+ struct request *rq;
+ int err;
+
+ if (!tapdev->gd)
+ return;
+
+ q = tapdev->gd->queue;
+
+ spin_lock_irq(&tapdev->lock);
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+ do {
+ rq = __blktap_next_queued_rq(q);
+ if (!rq)
+ break;
+
+ if (!blk_fs_request(rq)) {
+ __blktap_end_queued_rq(rq, -EOPNOTSUPP);
+ continue;
+ }
+
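+		/* Drop the queue lock across the ring mapping; the request is
+		 * dequeued below only once it has been accepted. */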
+ spin_unlock_irq(&tapdev->lock);
+
+ err = blktap_device_make_request(tap, rq);
+
+ spin_lock_irq(&tapdev->lock);
+
+ if (err == -EBUSY) {
+ blk_stop_queue(q);
+ break;
+ }
+
+ __blktap_dequeue_rq(rq);
+
+ if (unlikely(err))
+ __blktap_end_rq(rq, err);
+ } while (1);
+
+ spin_unlock_irq(&tapdev->lock);
+}
+
+static void
+blktap_device_do_request(struct request_queue *rq)
+{
+ struct blktap_device *tapdev = rq->queuedata;
+ struct blktap *tap = dev_to_blktap(tapdev);
+
+ blktap_ring_kick_user(tap);
+}
+
+static void
+blktap_device_configure(struct blktap *tap,
+ struct blktap_params *params)
+{
+	struct request_queue *rq;
+	struct blktap_device *dev = &tap->device;
+
+	rq = dev->gd->queue;
+
+ spin_lock_irq(&dev->lock);
+
+ set_capacity(dev->gd, params->capacity);
+
+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
+ blk_queue_logical_block_size(rq, params->sector_size);
+ blk_queue_max_sectors(rq, 512);
+
+ /* Each segment in a request is up to an aligned page in size. */
+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+ blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+ /* Ensure a merged request will fit in a single I/O ring slot. */
+ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+ /* Make sure buffer addresses are sector-aligned. */
+ blk_queue_dma_alignment(rq, 511);
+
+ /* We are reordering, but cacheless. */
+ blk_queue_ordered(rq, QUEUE_ORDERED_DRAIN, NULL);
+
+ spin_unlock_irq(&dev->lock);
+}
+
+static int
+blktap_device_validate_params(struct blktap *tap,
+ struct blktap_params *params)
+{
+ struct device *dev = tap->ring.dev;
+	int sector_order;
+	int name_sz = min(sizeof(params->name), sizeof(tap->name));
+
+ sector_order = ffs(params->sector_size) - 1;
+
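+	/* Only power-of-two sector sizes between 512 and 4096 bytes pass. */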
+ if (sector_order < 9 ||
+ sector_order > 12 ||
+ params->sector_size != 1U<<sector_order)
+ goto fail;
+
+ if (!params->capacity ||
+ (params->capacity > ULLONG_MAX >> sector_order))
+ goto fail;
+
+	if (strnlen(params->name, name_sz) >= name_sz)
+ goto fail;
+
+ return 0;
+
+fail:
+ params->name[name_sz-1] = 0;
+ dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
+ params->capacity, params->sector_size, params->name);
+ return -EINVAL;
+}
+
+int
+blktap_device_destroy(struct blktap *tap)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct block_device *bdev;
+ struct gendisk *gd;
+ int err;
+
+ gd = tapdev->gd;
+ if (!gd)
+ return 0;
+
+ bdev = bdget_disk(gd, 0);
+
+ err = !mutex_trylock(&bdev->bd_mutex);
+ if (err) {
+ /* NB. avoid a deadlock. the last opener syncs the
+ * bdev holding bd_mutex. */
+ err = -EBUSY;
+ goto out_nolock;
+ }
+
+ if (bdev->bd_openers) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ del_gendisk(gd);
+ gd->private_data = NULL;
+
+ blk_cleanup_queue(gd->queue);
+
+ put_disk(gd);
+ tapdev->gd = NULL;
+
+ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+ err = 0;
+out:
+ mutex_unlock(&bdev->bd_mutex);
+out_nolock:
+ bdput(bdev);
+
+ return err;
+}
+
+static void
+blktap_device_fail_queue(struct blktap *tap)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct request_queue *q = tapdev->gd->queue;
+
+ spin_lock_irq(&tapdev->lock);
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+ do {
+ struct request *rq = __blktap_next_queued_rq(q);
+ if (!rq)
+ break;
+
+ __blktap_end_queued_rq(rq, -EIO);
+ } while (1);
+
+ spin_unlock_irq(&tapdev->lock);
+}
+
+static int
+blktap_device_try_destroy(struct blktap *tap)
+{
+ int err;
+
+ err = blktap_device_destroy(tap);
+ if (err)
+ blktap_device_fail_queue(tap);
+
+ return err;
+}
+
+void
+blktap_device_destroy_sync(struct blktap *tap)
+{
+ wait_event(tap->ring.poll_wait,
+ !blktap_device_try_destroy(tap));
+}
+
+int
+blktap_device_create(struct blktap *tap, struct blktap_params *params)
+{
+ int minor, err;
+ struct gendisk *gd;
+ struct request_queue *rq;
+ struct blktap_device *tapdev;
+
+ gd = NULL;
+ rq = NULL;
+ tapdev = &tap->device;
+ minor = tap->minor;
+
+ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+ return -EEXIST;
+
+ if (blktap_device_validate_params(tap, params))
+ return -EINVAL;
+
+ gd = alloc_disk(1);
+ if (!gd) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
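+	/* Disk names follow the sdX-style base-26 scheme: tda..tdz, then
+	 * tdaa.., up to three letters. */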
+ if (minor < 26) {
+ sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
+ } else if (minor < (26 + 1) * 26) {
+ sprintf(gd->disk_name, "td%c%c",
+ 'a' + minor / 26 - 1,'a' + minor % 26);
+ } else {
+ const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
+ const unsigned int m2 = (minor / 26 - 1) % 26;
+ const unsigned int m3 = minor % 26;
+ sprintf(gd->disk_name, "td%c%c%c",
+ 'a' + m1, 'a' + m2, 'a' + m3);
+ }
+
+ gd->major = blktap_device_major;
+ gd->first_minor = minor;
+ gd->fops = &blktap_device_file_operations;
+ gd->private_data = tapdev;
+
+ spin_lock_init(&tapdev->lock);
+ rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
+ if (!rq) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ elevator_init(rq, "noop");
+
+ gd->queue = rq;
+ rq->queuedata = tapdev;
+ tapdev->gd = gd;
+
+ blktap_device_configure(tap, params);
+ add_disk(gd);
+
+ if (params->name[0])
+ strncpy(tap->name, params->name, sizeof(tap->name)-1);
+
+ set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+
+ dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
+ queue_logical_block_size(rq),
+ (unsigned long long)get_capacity(gd));
+
+ return 0;
+
+fail:
+	if (gd)
+		put_disk(gd);	/* disk was never added via add_disk() */
+ if (rq)
+ blk_cleanup_queue(rq);
+
+ return err;
+}
+
+size_t
+blktap_device_debug(struct blktap *tap, char *buf, size_t size)
+{
+ struct gendisk *disk = tap->device.gd;
+ struct request_queue *q;
+ struct block_device *bdev;
+ char *s = buf, *end = buf + size;
+
+ if (!disk)
+ return 0;
+
+ q = disk->queue;
+
+ s += snprintf(s, end - s,
+ "disk capacity:%llu sector size:%u\n",
+ (unsigned long long)get_capacity(disk),
+ queue_logical_block_size(q));
+
+ s += snprintf(s, end - s,
+ "queue flags:%#lx plugged:%d stopped:%d empty:%d\n",
+ q->queue_flags,
+ blk_queue_plugged(q), blk_queue_stopped(q),
+ elv_queue_empty(q));
+
+ bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ s += snprintf(s, end - s,
+ "bdev openers:%d closed:%d\n",
+ bdev->bd_openers,
+ test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
+ bdput(bdev);
+ }
+
+ return s - buf;
+}
+
+int __init
+blktap_device_init(void)
+{
+ int major;
+
+ /* Dynamically allocate a major for this device */
+ major = register_blkdev(0, "tapdev");
+ if (major < 0) {
+ BTERR("Couldn't register blktap device\n");
+ return -ENOMEM;
+ }
+
+ blktap_device_major = major;
+ BTINFO("blktap device major %d\n", major);
+
+ return 0;
+}
+
+void
+blktap_device_exit(void)
+{
+ if (blktap_device_major)
+ unregister_blkdev(blktap_device_major, "tapdev");
+}
diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
new file mode 100644
index 0000000..9bef48c
--- /dev/null
+++ b/drivers/xen/blktap/request.c
@@ -0,0 +1,418 @@
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+
+#include "blktap.h"
+
+/* max pages per shared pool. just to prevent accidental dos. */
+#define POOL_MAX_PAGES (256*BLKIF_MAX_SEGMENTS_PER_REQUEST)
+
+/* default page pool size. when considering shrinking a shared pool,
+ * note that paused tapdisks may hold a large number of pages for a
+ * long time. */
+#define POOL_DEFAULT_PAGES (2 * MMAP_PAGES)
+
+/* max number of pages allocatable per request. */
+#define POOL_MAX_REQUEST_PAGES BLKIF_MAX_SEGMENTS_PER_REQUEST
+
+/* min request structs per pool. These grow dynamically. */
+#define POOL_MIN_REQS BLK_RING_SIZE
+
+static struct kset *pool_set;
+
+#define kobj_to_pool(_kobj) \
+ container_of(_kobj, struct blktap_page_pool, kobj)
+
+static struct kmem_cache *request_cache;
+static mempool_t *request_pool;
+
+static void
+__page_pool_wake(struct blktap_page_pool *pool)
+{
+ mempool_t *mem = pool->bufs;
+
+	/*
+	 * NB. It is slightly wasteful to always wait for a full segment
+	 * set, but this ensures the next disk makes progress. Presently,
+	 * the repeated request-struct alloc/release cycles would
+	 * otherwise keep everyone spinning.
+	 */
+
+ if (mem->curr_nr >= POOL_MAX_REQUEST_PAGES)
+ wake_up(&pool->wait);
+}
+
+int
+blktap_request_get_pages(struct blktap *tap,
+ struct blktap_request *request, int nr_pages)
+{
+ struct blktap_page_pool *pool = tap->pool;
+ mempool_t *mem = pool->bufs;
+ struct page *page;
+
+ BUG_ON(request->nr_pages != 0);
+ BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES);
+
+ if (mem->curr_nr < nr_pages)
+ return -ENOMEM;
+
+ /* NB. avoid thundering herds of tapdisks colliding. */
+ spin_lock(&pool->lock);
+
+ if (mem->curr_nr < nr_pages) {
+ spin_unlock(&pool->lock);
+ return -ENOMEM;
+ }
+
+ while (request->nr_pages < nr_pages) {
+ page = mempool_alloc(mem, GFP_NOWAIT);
+ BUG_ON(!page);
+ request->pages[request->nr_pages++] = page;
+ }
+
+ spin_unlock(&pool->lock);
+
+ return 0;
+}
+
+static void
+blktap_request_put_pages(struct blktap *tap,
+ struct blktap_request *request)
+{
+ struct blktap_page_pool *pool = tap->pool;
+ struct page *page;
+
+ while (request->nr_pages) {
+ page = request->pages[--request->nr_pages];
+ mempool_free(page, pool->bufs);
+ }
+}
+
+size_t
+blktap_request_debug(struct blktap *tap, char *buf, size_t size)
+{
+ struct blktap_page_pool *pool = tap->pool;
+ mempool_t *mem = pool->bufs;
+ char *s = buf, *end = buf + size;
+
+	s += snprintf(s, end - s,
+ "pool:%s pages:%d free:%d\n",
+ kobject_name(&pool->kobj),
+ mem->min_nr, mem->curr_nr);
+
+ return s - buf;
+}
+
+struct blktap_request*
+blktap_request_alloc(struct blktap *tap)
+{
+ struct blktap_request *request;
+
+ request = mempool_alloc(request_pool, GFP_NOWAIT);
+ if (request)
+ request->tap = tap;
+
+ return request;
+}
+
+void
+blktap_request_free(struct blktap *tap,
+ struct blktap_request *request)
+{
+ blktap_request_put_pages(tap, request);
+
+ mempool_free(request, request_pool);
+
+ __page_pool_wake(tap->pool);
+}
+
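+/*
+ * Copy one segment between the request's kernel sg buffer and the pool
+ * page mapped into tapdisk; 'write' copies towards the pool page.
+ */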
+void
+blktap_request_bounce(struct blktap *tap,
+ struct blktap_request *request,
+ int seg, int write)
+{
+ struct scatterlist *sg = &request->sg_table[seg];
+ void *s, *p;
+
+ BUG_ON(seg >= request->nr_pages);
+
+ s = sg_virt(sg);
+ p = page_address(request->pages[seg]) + sg->offset;
+
+ if (write)
+ memcpy(p, s, sg->length);
+ else
+ memcpy(s, p, sg->length);
+}
+
+static void
+blktap_request_ctor(void *obj)
+{
+ struct blktap_request *request = obj;
+
+ memset(request, 0, sizeof(*request));
+ sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table));
+}
+
+static int
+blktap_page_pool_resize(struct blktap_page_pool *pool, int target)
+{
+ mempool_t *bufs = pool->bufs;
+ int err;
+
+ /* NB. mempool asserts min_nr >= 1 */
+ target = max(1, target);
+
+ err = mempool_resize(bufs, target, GFP_KERNEL);
+ if (err)
+ return err;
+
+ __page_pool_wake(pool);
+
+ return 0;
+}
+
+struct pool_attribute {
+ struct attribute attr;
+
+ ssize_t (*show)(struct blktap_page_pool *pool,
+ char *buf);
+
+ ssize_t (*store)(struct blktap_page_pool *pool,
+ const char *buf, size_t count);
+};
+
+#define kattr_to_pool_attr(_kattr) \
+ container_of(_kattr, struct pool_attribute, attr)
+
+static ssize_t
+blktap_page_pool_show_size(struct blktap_page_pool *pool,
+ char *buf)
+{
+ mempool_t *mem = pool->bufs;
+ return sprintf(buf, "%d", mem->min_nr);
+}
+
+static ssize_t
+blktap_page_pool_store_size(struct blktap_page_pool *pool,
+ const char *buf, size_t size)
+{
+ int target;
+
+	/*
+	 * NB. Fix up the target to avoid undesired results: less than a
+	 * full segment set can wedge the disk, and much more than a
+	 * couple of times the physical queue depth is rarely useful.
+	 */
+
+ target = simple_strtoul(buf, NULL, 0);
+ target = max(POOL_MAX_REQUEST_PAGES, target);
+ target = min(target, POOL_MAX_PAGES);
+
+ return blktap_page_pool_resize(pool, target) ? : size;
+}
+
+static struct pool_attribute blktap_page_pool_attr_size =
+ __ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+ blktap_page_pool_show_size,
+ blktap_page_pool_store_size);
+
+static ssize_t
+blktap_page_pool_show_free(struct blktap_page_pool *pool,
+ char *buf)
+{
+ mempool_t *mem = pool->bufs;
+ return sprintf(buf, "%d", mem->curr_nr);
+}
+
+static struct pool_attribute blktap_page_pool_attr_free =
+ __ATTR(free, S_IRUSR|S_IRGRP|S_IROTH,
+ blktap_page_pool_show_free,
+ NULL);
+
+static struct attribute *blktap_page_pool_attrs[] = {
+ &blktap_page_pool_attr_size.attr,
+ &blktap_page_pool_attr_free.attr,
+ NULL,
+};
+
+static inline struct kobject*
+__blktap_kset_find_obj(struct kset *kset, const char *name)
+{
+ struct kobject *k;
+ struct kobject *ret = NULL;
+
+ spin_lock(&kset->list_lock);
+ list_for_each_entry(k, &kset->list, entry) {
+ if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
+ ret = kobject_get(k);
+ break;
+ }
+ }
+ spin_unlock(&kset->list_lock);
+ return ret;
+}
+
+static ssize_t
+blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
+ char *buf)
+{
+ struct blktap_page_pool *pool = kobj_to_pool(kobj);
+ struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+ if (attr->show)
+ return attr->show(pool, buf);
+
+ return -EIO;
+}
+
+static ssize_t
+blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
+ const char *buf, size_t size)
+{
+ struct blktap_page_pool *pool = kobj_to_pool(kobj);
+ struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+	if (attr->store)
+ return attr->store(pool, buf, size);
+
+ return -EIO;
+}
+
+static struct sysfs_ops blktap_page_pool_sysfs_ops = {
+ .show = blktap_page_pool_show_attr,
+ .store = blktap_page_pool_store_attr,
+};
+
+static void
+blktap_page_pool_release(struct kobject *kobj)
+{
+ struct blktap_page_pool *pool = kobj_to_pool(kobj);
+ mempool_destroy(pool->bufs);
+ kfree(pool);
+}
+
+struct kobj_type blktap_page_pool_ktype = {
+ .release = blktap_page_pool_release,
+ .sysfs_ops = &blktap_page_pool_sysfs_ops,
+ .default_attrs = blktap_page_pool_attrs,
+};
+
+static void*
+__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ struct page *page;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return NULL;
+
+ page = alloc_page(gfp_mask);
+ if (page)
+ SetPageReserved(page);
+
+ return page;
+}
+
+static void
+__mempool_page_free(void *element, void *pool_data)
+{
+ struct page *page = element;
+
+ ClearPageReserved(page);
+ put_page(page);
+}
+
+static struct kobject*
+blktap_page_pool_create(const char *name, int nr_pages)
+{
+ struct blktap_page_pool *pool;
+ int err;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ goto fail;
+
+ spin_lock_init(&pool->lock);
+ init_waitqueue_head(&pool->wait);
+
+ pool->bufs = mempool_create(nr_pages,
+ __mempool_page_alloc, __mempool_page_free,
+ pool);
+ if (!pool->bufs)
+ goto fail_pool;
+
+ kobject_init(&pool->kobj, &blktap_page_pool_ktype);
+ pool->kobj.kset = pool_set;
+ err = kobject_add(&pool->kobj, &pool_set->kobj, "%s", name);
+ if (err)
+ goto fail_bufs;
+
+ return &pool->kobj;
+
+fail_bufs:
+ mempool_destroy(pool->bufs);
+fail_pool:
+ kfree(pool);
+fail:
+ return NULL;
+}
+
+struct blktap_page_pool*
+blktap_page_pool_get(const char *name)
+{
+ struct kobject *kobj;
+
+ kobj = __blktap_kset_find_obj(pool_set, name);
+ if (!kobj)
+ kobj = blktap_page_pool_create(name,
+ POOL_DEFAULT_PAGES);
+ if (!kobj)
+ return ERR_PTR(-ENOMEM);
+
+ return kobj_to_pool(kobj);
+}
+
+int __init
+blktap_page_pool_init(struct kobject *parent)
+{
+ request_cache =
+ kmem_cache_create("blktap-request",
+ sizeof(struct blktap_request), 0,
+ 0, blktap_request_ctor);
+ if (!request_cache)
+ return -ENOMEM;
+
+ request_pool =
+ mempool_create_slab_pool(POOL_MIN_REQS, request_cache);
+ if (!request_pool)
+ return -ENOMEM;
+
+ pool_set = kset_create_and_add("pools", NULL, parent);
+ if (!pool_set)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void
+blktap_page_pool_exit(void)
+{
+ if (pool_set) {
+ BUG_ON(!list_empty(&pool_set->list));
+ kset_unregister(pool_set);
+ pool_set = NULL;
+ }
+
+ if (request_pool) {
+ mempool_destroy(request_pool);
+ request_pool = NULL;
+ }
+
+ if (request_cache) {
+ kmem_cache_destroy(request_cache);
+ request_cache = NULL;
+ }
+}
diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
new file mode 100644
index 0000000..6b86be5
--- /dev/null
+++ b/drivers/xen/blktap/ring.c
@@ -0,0 +1,550 @@
+
+#include <linux/device.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/poll.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_ring_major;
+static struct cdev blktap_ring_cdev;
+
+/*
+ * BLKTAP - immediately before the mmap area, RING_PAGES pages are
+ * reserved for the shared memory ring.
+ */
+#define RING_PAGES 1
+
+static void
+blktap_ring_read_response(struct blktap *tap,
+ const struct blkif_response *rsp)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blktap_request *request;
+ int usr_idx, err;
+
+ request = NULL;
+
+ usr_idx = rsp->id;
+ if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
+ err = -ERANGE;
+ goto invalid;
+ }
+
+ request = ring->pending[usr_idx];
+
+ if (!request) {
+ err = -ESRCH;
+ goto invalid;
+ }
+
+ if (rsp->operation != request->operation) {
+ err = -EINVAL;
+ goto invalid;
+ }
+
+ dev_dbg(ring->dev,
+ "request %d [%p] response: %d\n",
+ request->usr_idx, request, rsp->status);
+
+ err = rsp->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+end_request:
+ blktap_device_end_request(tap, request, err);
+ return;
+
+invalid:
+	dev_warn(ring->dev,
+		 "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
+		 usr_idx, rsp->status, rsp->operation,
+		 request ? request->operation : -1, err);
+ if (request)
+ goto end_request;
+}
+
+static void
+blktap_read_ring(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blkif_response rsp;
+ RING_IDX rc, rp;
+
+	down_read(&current->mm->mmap_sem);
+	if (!ring->vma) {
+		up_read(&current->mm->mmap_sem);
+		return;
+	}
+
+ /* for each outstanding message on the ring */
+ rp = ring->ring.sring->rsp_prod;
+ rmb();
+
+ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
+ memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
+ blktap_ring_read_response(tap, &rsp);
+ }
+
+ ring->ring.rsp_cons = rc;
+
+	up_read(&current->mm->mmap_sem);
+}
+
+static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static void
+blktap_ring_fail_pending(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blktap_request *request;
+ int usr_idx;
+
+ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+ request = ring->pending[usr_idx];
+ if (!request)
+ continue;
+
+ blktap_device_end_request(tap, request, -EIO);
+ }
+}
+
+static void
+blktap_ring_vm_close(struct vm_area_struct *vma)
+{
+ struct blktap *tap = vma->vm_private_data;
+ struct blktap_ring *ring = &tap->ring;
+ struct page *page = virt_to_page(ring->ring.sring);
+
+ blktap_ring_fail_pending(tap);
+
+ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+ ClearPageReserved(page);
+ __free_page(page);
+
+ ring->vma = NULL;
+
+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+ blktap_control_destroy_tap(tap);
+}
+
+static struct vm_operations_struct blktap_ring_vm_operations = {
+ .close = blktap_ring_vm_close,
+ .fault = blktap_ring_fault,
+};
+
+int
+blktap_ring_map_segment(struct blktap *tap,
+ struct blktap_request *request,
+ int seg)
+{
+ struct blktap_ring *ring = &tap->ring;
+ unsigned long uaddr;
+
+ uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
+ return vm_insert_page(ring->vma, uaddr, request->pages[seg]);
+}
+
+int
+blktap_ring_map_request(struct blktap *tap,
+ struct blktap_request *request)
+{
+ int seg, err = 0;
+ int write;
+
+ write = request->operation == BLKIF_OP_WRITE;
+
+ for (seg = 0; seg < request->nr_pages; seg++) {
+ if (write)
+ blktap_request_bounce(tap, request, seg, write);
+
+ err = blktap_ring_map_segment(tap, request, seg);
+ if (err)
+ break;
+ }
+
+ if (err)
+ blktap_ring_unmap_request(tap, request);
+
+ return err;
+}
+
+void
+blktap_ring_unmap_request(struct blktap *tap,
+ struct blktap_request *request)
+{
+ struct blktap_ring *ring = &tap->ring;
+ unsigned long uaddr;
+ unsigned size;
+ int seg, read;
+
+ uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
+ size = request->nr_pages << PAGE_SHIFT;
+ read = request->operation == BLKIF_OP_READ;
+
+ if (read)
+ for (seg = 0; seg < request->nr_pages; seg++)
+ blktap_request_bounce(tap, request, seg, !read);
+
+ zap_page_range(ring->vma, uaddr, size, NULL);
+}
+
+void
+blktap_ring_free_request(struct blktap *tap,
+ struct blktap_request *request)
+{
+ struct blktap_ring *ring = &tap->ring;
+
+ ring->pending[request->usr_idx] = NULL;
+ ring->n_pending--;
+
+ blktap_request_free(tap, request);
+}
+
+struct blktap_request*
+blktap_ring_make_request(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blktap_request *request;
+ int usr_idx;
+
+ if (RING_FULL(&ring->ring))
+ return ERR_PTR(-ENOSPC);
+
+ request = blktap_request_alloc(tap);
+ if (!request)
+ return ERR_PTR(-ENOMEM);
+
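+	/* Find a free slot in the pending table; ring space was checked
+	 * above, so a slot is expected to exist (BUG below otherwise). */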
+ for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++)
+ if (!ring->pending[usr_idx])
+ break;
+
+ BUG_ON(usr_idx >= BLK_RING_SIZE);
+
+ request->tap = tap;
+ request->usr_idx = usr_idx;
+
+ ring->pending[usr_idx] = request;
+ ring->n_pending++;
+
+ return request;
+}
+
+void
+blktap_ring_submit_request(struct blktap *tap,
+ struct blktap_request *request)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blkif_request *breq;
+ struct scatterlist *sg;
+ int i, nsecs = 0;
+
+ dev_dbg(ring->dev,
+ "request %d [%p] submit\n", request->usr_idx, request);
+
+ breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
+
+ breq->id = request->usr_idx;
+ breq->sector_number = blk_rq_pos(request->rq);
+ breq->handle = 0;
+ breq->operation = request->operation;
+ breq->nr_segments = request->nr_pages;
+
+ blktap_for_each_sg(sg, request, i) {
+ struct blkif_request_segment *seg = &breq->seg[i];
+ int first, count;
+
+ count = sg->length >> 9;
+ first = sg->offset >> 9;
+
+ seg->first_sect = first;
+ seg->last_sect = first + count - 1;
+
+ nsecs += count;
+ }
+
+ ring->ring.req_prod_pvt++;
+
+ do_gettimeofday(&request->time);
+
+ if (request->operation == BLKIF_OP_WRITE) {
+ tap->stats.st_wr_sect += nsecs;
+ tap->stats.st_wr_req++;
+ }
+
+ if (request->operation == BLKIF_OP_READ) {
+ tap->stats.st_rd_sect += nsecs;
+ tap->stats.st_rd_req++;
+ }
+}
+
+static int
+blktap_ring_open(struct inode *inode, struct file *filp)
+{
+ struct blktap *tap = NULL;
+ int minor;
+
+ minor = iminor(inode);
+
+ if (minor < blktap_max_minor)
+ tap = blktaps[minor];
+
+ if (!tap)
+ return -ENXIO;
+
+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+ return -ENXIO;
+
+ if (tap->ring.task)
+ return -EBUSY;
+
+ filp->private_data = tap;
+ tap->ring.task = current;
+
+ return 0;
+}
+
+static int
+blktap_ring_release(struct inode *inode, struct file *filp)
+{
+ struct blktap *tap = filp->private_data;
+
+ blktap_device_destroy_sync(tap);
+
+ tap->ring.task = NULL;
+
+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+ blktap_control_destroy_tap(tap);
+
+ return 0;
+}
+
+static int
+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct blktap *tap = filp->private_data;
+ struct blktap_ring *ring = &tap->ring;
+ struct blkif_sring *sring;
+ struct page *page = NULL;
+ int err;
+
+ if (ring->vma)
+ return -EBUSY;
+
+ page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ SetPageReserved(page);
+
+ err = vm_insert_page(vma, vma->vm_start, page);
+ if (err)
+ goto fail;
+
+ sring = page_address(page);
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
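+	/* VMA layout: page 0 carries the shared ring; data pages for
+	 * in-flight segments are inserted above user_vstart. */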
+ ring->ring_vstart = vma->vm_start;
+ ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
+
+ vma->vm_private_data = tap;
+
+ vma->vm_flags |= VM_DONTCOPY;
+ vma->vm_flags |= VM_RESERVED;
+
+ vma->vm_ops = &blktap_ring_vm_operations;
+
+ ring->vma = vma;
+ return 0;
+
+fail:
+ if (page) {
+ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+ ClearPageReserved(page);
+ __free_page(page);
+ }
+
+ return err;
+}
+
+static int
+blktap_ring_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct blktap *tap = filp->private_data;
+ struct blktap_ring *ring = &tap->ring;
+
+ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
+
+ if (!ring->vma || ring->vma->vm_mm != current->mm)
+ return -EACCES;
+
+ switch(cmd) {
+ case BLKTAP2_IOCTL_KICK_FE:
+
+ blktap_read_ring(tap);
+ return 0;
+
+ case BLKTAP2_IOCTL_CREATE_DEVICE: {
+ struct blktap_params params;
+		void __user *ptr = (void __user *)arg;
+
+ if (!arg)
+ return -EINVAL;
+
+		if (copy_from_user(&params, ptr, sizeof(params)))
+ return -EFAULT;
+
+		return blktap_device_create(tap, &params);
+ }
+
+ case BLKTAP2_IOCTL_REMOVE_DEVICE:
+
+ return blktap_device_destroy(tap);
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
+{
+ struct blktap *tap = filp->private_data;
+ struct blktap_ring *ring = &tap->ring;
+ int work;
+
+ poll_wait(filp, &tap->pool->wait, wait);
+ poll_wait(filp, &ring->poll_wait, wait);
+
+	down_read(&current->mm->mmap_sem);
+	if (ring->vma && tap->device.gd)
+		blktap_device_run_queue(tap);
+	up_read(&current->mm->mmap_sem);
+
+ work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
+ RING_PUSH_REQUESTS(&ring->ring);
+
+ if (work ||
+ ring->ring.sring->private.tapif_user.msg ||
+ test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static struct file_operations blktap_ring_file_operations = {
+ .owner = THIS_MODULE,
+ .open = blktap_ring_open,
+ .release = blktap_ring_release,
+ .ioctl = blktap_ring_ioctl,
+ .mmap = blktap_ring_mmap,
+ .poll = blktap_ring_poll,
+};
+
+void
+blktap_ring_kick_user(struct blktap *tap)
+{
+ wake_up(&tap->ring.poll_wait);
+}
+
+int
+blktap_ring_destroy(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+
+ if (ring->task || ring->vma)
+ return -EBUSY;
+
+ return 0;
+}
+
+int
+blktap_ring_create(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+
+ init_waitqueue_head(&ring->poll_wait);
+ ring->devno = MKDEV(blktap_ring_major, tap->minor);
+
+ return 0;
+}
+
+size_t
+blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
+{
+ struct blktap_ring *ring = &tap->ring;
+ char *s = buf, *end = buf + size;
+ int usr_idx;
+
+ s += snprintf(s, end - s,
+ "begin pending:%d\n", ring->n_pending);
+
+ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+ struct blktap_request *request;
+ struct timeval *time;
+ int write;
+
+ request = ring->pending[usr_idx];
+ if (!request)
+ continue;
+
+ write = request->operation == BLKIF_OP_WRITE;
+ time = &request->time;
+
+ s += snprintf(s, end - s,
+ "%02d: usr_idx:%02d "
+			      "op:%c nr_pages:%02d time:%lu.%06lu\n",
+ usr_idx, request->usr_idx,
+ write ? 'W' : 'R', request->nr_pages,
+ time->tv_sec, time->tv_usec);
+ }
+
+ s += snprintf(s, end - s, "end pending\n");
+
+ return s - buf;
+}
+
+
+int __init
+blktap_ring_init(void)
+{
+ dev_t dev = 0;
+ int err;
+
+ cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations);
+ blktap_ring_cdev.owner = THIS_MODULE;
+
+ err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2");
+ if (err < 0) {
+ BTERR("error registering ring devices: %d\n", err);
+ return err;
+ }
+
+ err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE);
+ if (err) {
+ BTERR("error adding ring device: %d\n", err);
+ unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE);
+ return err;
+ }
+
+ blktap_ring_major = MAJOR(dev);
+ BTINFO("blktap ring major: %d\n", blktap_ring_major);
+
+ return 0;
+}
+
+void
+blktap_ring_exit(void)
+{
+ if (!blktap_ring_major)
+ return;
+
+ cdev_del(&blktap_ring_cdev);
+ unregister_chrdev_region(MKDEV(blktap_ring_major, 0),
+ MAX_BLKTAP_DEVICE);
+
+ blktap_ring_major = 0;
+}
diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
new file mode 100644
index 0000000..3c424af
--- /dev/null
+++ b/drivers/xen/blktap/sysfs.c
@@ -0,0 +1,288 @@
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_debug_level = 1;
+
+static struct class *class;
+
+static ssize_t
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct blktap *tap;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap)
+ return 0;
+
+ if (size >= BLKTAP2_MAX_MESSAGE_LEN)
+ return -ENAMETOOLONG;
+
+ if (strnlen(buf, size) != size)
+ return -EINVAL;
+
+ strcpy(tap->name, buf);
+
+ return size;
+}
+
+static ssize_t
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct blktap *tap;
+ ssize_t size;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap)
+ return 0;
+
+ if (tap->name[0])
+ size = sprintf(buf, "%s\n", tap->name);
+ else
+ size = sprintf(buf, "%d\n", tap->minor);
+
+ return size;
+}
+static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
+ blktap_sysfs_get_name, blktap_sysfs_set_name);
+
+static void
+blktap_sysfs_remove_work(struct work_struct *work)
+{
+ struct blktap *tap
+ = container_of(work, struct blktap, remove_work);
+ blktap_control_destroy_tap(tap);
+}
+
+static ssize_t
+blktap_sysfs_remove_device(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct blktap *tap;
+ int err;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap)
+ return size;
+
+ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+ goto wait;
+
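+	/* If tapdisk has the ring mapped, ask it to exit via a ring message;
+	 * otherwise defer teardown to a workqueue. */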
+ if (tap->ring.vma) {
+ struct blkif_sring *sring = tap->ring.ring.sring;
+ sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
+ blktap_ring_kick_user(tap);
+ } else {
+ INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
+ schedule_work(&tap->remove_work);
+ }
+wait:
+ err = wait_event_interruptible(tap->remove_wait,
+ !dev_get_drvdata(dev));
+ if (err)
+ return err;
+
+ return size;
+}
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
+static ssize_t
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct blktap *tap;
+ char *s = buf, *end = buf + PAGE_SIZE;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap)
+ return 0;
+
+ s += blktap_control_debug(tap, s, end - s);
+
+ s += blktap_request_debug(tap, s, end - s);
+
+ s += blktap_device_debug(tap, s, end - s);
+
+ s += blktap_ring_debug(tap, s, end - s);
+
+ return s - buf;
+}
+static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
+
+static ssize_t
+blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct blktap *tap;
+ ssize_t rv = 0;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap)
+ return 0;
+
+ if (tap->ring.task)
+ rv = sprintf(buf, "%d\n", tap->ring.task->pid);
+
+ return rv;
+}
+static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL);
+
+static ssize_t
+blktap_sysfs_show_pool(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct blktap *tap = dev_get_drvdata(dev);
+ return sprintf(buf, "%s", kobject_name(&tap->pool->kobj));
+}
+
+static ssize_t
+blktap_sysfs_store_pool(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct blktap *tap = dev_get_drvdata(dev);
+ struct blktap_page_pool *pool, *tmp = tap->pool;
+
+ if (tap->device.gd)
+ return -EBUSY;
+
+ pool = blktap_page_pool_get(buf);
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+
+ tap->pool = pool;
+ kobject_put(&tmp->kobj);
+
+ return size;
+}
+static DEVICE_ATTR(pool, S_IRUSR|S_IWUSR,
+ blktap_sysfs_show_pool, blktap_sysfs_store_pool);
+
+int
+blktap_sysfs_create(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct device *dev;
+ int err = 0;
+
+ init_waitqueue_head(&tap->remove_wait);
+
+ dev = device_create(class, NULL, ring->devno,
+ tap, "blktap%d", tap->minor);
+ if (IS_ERR(dev))
+ err = PTR_ERR(dev);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_name);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_remove);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_debug);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_task);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_pool);
+ if (!err)
+ ring->dev = dev;
+ else
+ device_unregister(dev);
+
+ return err;
+}
+
+void
+blktap_sysfs_destroy(struct blktap *tap)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct device *dev;
+
+ dev = ring->dev;
+
+ if (!dev)
+ return;
+
+ dev_set_drvdata(dev, NULL);
+ wake_up(&tap->remove_wait);
+
+ device_unregister(dev);
+ ring->dev = NULL;
+}
+
+static ssize_t
+blktap_sysfs_show_verbosity(struct class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", blktap_debug_level);
+}
+
+static ssize_t
+blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
+{
+ int level;
+
+ if (sscanf(buf, "%d", &level) == 1) {
+ blktap_debug_level = level;
+ return size;
+ }
+
+ return -EINVAL;
+}
+static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR,
+ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
+static ssize_t
+blktap_sysfs_show_devices(struct class *class, char *buf)
+{
+ int i, ret;
+ struct blktap *tap;
+
+ mutex_lock(&blktap_lock);
+
+ ret = 0;
+ for (i = 0; i < blktap_max_minor; i++) {
+ tap = blktaps[i];
+ if (!tap)
+ continue;
+
+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+ continue;
+
+ ret += sprintf(buf + ret, "%d %s\n", tap->minor, tap->name);
+ }
+
+ mutex_unlock(&blktap_lock);
+
+ return ret;
+}
+static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
+
+void
+blktap_sysfs_exit(void)
+{
+ if (class)
+ class_destroy(class);
+}
+
+int __init
+blktap_sysfs_init(void)
+{
+ struct class *cls;
+ int err = 0;
+
+ cls = class_create(THIS_MODULE, "blktap2");
+ if (IS_ERR(cls))
+ err = PTR_ERR(cls);
+ if (!err)
+ err = class_create_file(cls, &class_attr_verbosity);
+ if (!err)
+ err = class_create_file(cls, &class_attr_devices);
+ if (!err)
+ class = cls;
+ else
+ class_destroy(cls);
+
+ return err;
+}
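
Each DEVICE_ATTR(x, mode, show, store) in this file expands to a struct device_attribute named dev_attr_x, which device_create_file() then materializes as a sysfs file under the blktap device; likewise CLASS_ATTR yields class_attr_x for class_create_file(). A minimal sketch of that wiring, assuming the 2.6.32-era callback signatures used above (hypothetical attribute):

    static ssize_t demo_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
    {
            return sprintf(buf, "%d\n", 42); /* buf is one PAGE_SIZE page */
    }

    static ssize_t demo_store(struct device *dev,
                              struct device_attribute *attr,
                              const char *buf, size_t size)
    {
            return size; /* consume the input; -errno on parse failure */
    }
    static DEVICE_ATTR(demo, S_IRUGO|S_IWUSR, demo_show, demo_store);

    /* after device_create(): err = device_create_file(dev, &dev_attr_demo); */
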
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index bdfd584..6625ffe 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -1,5 +1,6 @@
#include <linux/notifier.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <asm/xen/hypervisor.h>
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 15ed43e..fc28a70 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -16,7 +16,7 @@
* (typically dom0).
* 2. VIRQs, typically used for timers. These are per-cpu events.
* 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
+ * 4. PIRQs - Hardware interrupts.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
@@ -27,18 +27,32 @@
#include <linux/module.h>
#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/irqnr.h>
+#include <linux/pci_regs.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <asm/desc.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/idle.h>
+#include <asm/io_apic.h>
#include <asm/sync_bitops.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <asm/xen/pci.h>
+#include <xen/xen.h>
+#include <xen/hvm.h>
#include <xen/xen-ops.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
+#include <xen/page.h>
+
+#include "../pci/msi.h"
/*
* This lock protects updates to the following mapping and reference-count
@@ -67,7 +81,7 @@
* event channel - irq->event channel mapping
* cpu - cpu this event channel is bound to
* index - type-specific information:
- * PIRQ - vector, with MSB being "needs EIO"
+ * PIRQ - with MSB being "needs EOI"
* VIRQ - virq number
* IPI - IPI vector
* EVTCHN -
@@ -83,20 +97,30 @@
enum ipi_vector ipi;
struct {
unsigned short gsi;
- unsigned short vector;
+ unsigned char vector;
+ unsigned char flags;
+ uint16_t domid;
} pirq;
} u;
};
+#define PIRQ_SHAREABLE (1 << 1)
-static struct irq_info irq_info[NR_IRQS];
+/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
+static bool pirq_eoi_does_unmask;
+static unsigned long *pirq_needs_eoi_bits;
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
- [0 ... NR_EVENT_CHANNELS-1] = -1
-};
+static struct irq_info *irq_info;
+
+static int *evtchn_to_irq;
struct cpu_evtchn_s {
unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
};
-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
+static __initdata struct cpu_evtchn_s init_evtchn_mask = {
+ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
+};
+static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
static inline unsigned long *cpu_evtchn_mask(int cpu)
{
return cpu_evtchn_mask_p[cpu].bits;
@@ -107,6 +131,7 @@
static struct irq_chip xen_dynamic_chip;
static struct irq_chip xen_percpu_chip;
+static struct irq_chip xen_pirq_chip;
/* Constructor for packed IRQ information. */
static struct irq_info mk_unbound_info(void)
@@ -136,7 +161,8 @@
unsigned short gsi, unsigned short vector)
{
return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
+ .cpu = 0, .u.pirq =
+ { .gsi = gsi, .vector = vector, .domid = DOMID_SELF } };
}
/*
@@ -219,6 +245,15 @@
return ret;
}
+static bool pirq_needs_eoi(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return test_bit(info->u.pirq.gsi, pirq_needs_eoi_bits);
+}
+
static inline unsigned long active_evtchns(unsigned int cpu,
struct shared_info *sh,
unsigned int idx)
@@ -237,17 +272,17 @@
cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
#endif
- __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
- __set_bit(chn, cpu_evtchn_mask(cpu));
+ clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
+ set_bit(chn, cpu_evtchn_mask(cpu));
irq_info[irq].cpu = cpu;
}
static void init_evtchn_cpu_bindings(void)
{
+ int i;
#ifdef CONFIG_SMP
struct irq_desc *desc;
- int i;
/* By default all event channels notify CPU#0. */
for_each_irq_desc(i, desc) {
@@ -255,7 +290,9 @@
}
#endif
- memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s));
+ for_each_possible_cpu(i)
+ memset(cpu_evtchn_mask(i),
+ (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s));
}
static inline void clear_evtchn(int port)
@@ -300,6 +337,14 @@
sync_set_bit(port, &s->evtchn_mask[0]);
}
+static void mask_irq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
static void unmask_evtchn(int port)
{
struct shared_info *s = HYPERVISOR_shared_info;
@@ -330,27 +375,371 @@
put_cpu();
}
+static void unmask_irq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static int get_nr_hw_irqs(void)
+{
+ int ret = 1;
+
+#ifdef CONFIG_X86_IO_APIC
+ ret = get_nr_irqs_gsi();
+#endif
+
+ return ret;
+}
+
static int find_unbound_irq(void)
{
int irq;
struct irq_desc *desc;
+ int start = get_nr_hw_irqs();
- for (irq = 0; irq < nr_irqs; irq++)
+ if (start == nr_irqs)
+ goto no_irqs;
+
+ /* nr_irqs is a magic value. Must not use it. */
+ for (irq = nr_irqs-1; irq > start; irq--) {
+ desc = irq_to_desc(irq);
+ /* only irqs 0-15 have an initialized desc; handle irq >= 16 */
+ if (desc == NULL)
+ break;
+ if (desc->chip == &no_irq_chip)
+ break;
+ if (desc->chip != &xen_dynamic_chip)
+ continue;
if (irq_info[irq].type == IRQT_UNBOUND)
break;
+ }
- if (irq == nr_irqs)
- panic("No available IRQ to bind to: increase nr_irqs!\n");
+ if (irq == start)
+ goto no_irqs;
- desc = irq_to_desc_alloc_node(irq, 0);
+ desc = irq_to_desc_alloc_node(irq, -1);
if (WARN_ON(desc == NULL))
return -1;
- dynamic_irq_init(irq);
+ dynamic_irq_init_keep_chip_data(irq);
+
+ return irq;
+
+no_irqs:
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
+}
+
+static bool identity_mapped_irq(unsigned irq)
+{
+ /* identity map all the hardware irqs */
+ return irq < get_nr_hw_irqs();
+}
+
+static void pirq_eoi(unsigned int irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+ struct physdev_eoi eoi = { .irq = info->u.pirq.gsi };
+ bool need_eoi;
+
+ need_eoi = pirq_needs_eoi(irq);
+
+ if (!need_eoi || !pirq_eoi_does_unmask)
+ unmask_evtchn(info->evtchn);
+
+ if (need_eoi) {
+ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ WARN_ON(rc);
+ }
+}
+
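+/*
+ * pirq_eoi() relies on a boot-time negotiation: if the guest registers a
+ * page-sized bitmap with PHYSDEVOP_pirq_eoi_gmfn, Xen keeps the needs-EOI
+ * bits current and unmasks the event channel as a side effect of
+ * PHYSDEVOP_eoi, so the explicit unmask above is only the fallback path.
+ * The registration, done later in xen_init_IRQ(), amounts to (sketch):
+ *
+ *	struct physdev_pirq_eoi_gmfn eoi_gmfn;
+ *
+ *	// one bit per PIRQ, in a page whose MFN is handed to Xen
+ *	eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits);
+ *	if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
+ *		pirq_eoi_does_unmask = true;	// Xen unmasks on EOI
+ */
+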
+static void pirq_query_unmask(int irq)
+{
+ struct physdev_irq_status_query irq_status;
+ struct irq_info *info = info_for_irq(irq);
+
+ if (pirq_eoi_does_unmask)
+ return;
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ irq_status.irq = info->u.pirq.gsi;
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ irq_status.flags = 0;
+
+ clear_bit(info->u.pirq.gsi, pirq_needs_eoi_bits);
+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
+ set_bit(info->u.pirq.gsi, pirq_needs_eoi_bits);
+}
+
+static bool probing_irq(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc && desc->action == NULL;
+}
+
+static unsigned int startup_pirq(unsigned int irq)
+{
+ struct evtchn_bind_pirq bind_pirq;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+ int rc;
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (VALID_EVTCHN(evtchn))
+ goto out;
+
+ bind_pirq.pirq = info->u.pirq.gsi;
+ /* NB. We are happy to share unless we are probing. */
+ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
+ BIND_PIRQ__WILL_SHARE : 0;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+ if (rc != 0) {
+ if (!probing_irq(irq))
+ printk(KERN_INFO "Failed to obtain physical IRQ %d" \
+ " (GSI:%d)\n", irq, info->u.pirq.gsi);
+ return 0;
+ }
+ evtchn = bind_pirq.port;
+
+ pirq_query_unmask(irq);
+
+ evtchn_to_irq[evtchn] = irq;
+ bind_evtchn_to_cpu(evtchn, 0);
+ info->evtchn = evtchn;
+
+ out:
+ pirq_eoi(irq);
+
+ return 0;
+}
+
+static void shutdown_pirq(unsigned int irq)
+{
+ struct evtchn_close close;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ mask_evtchn(evtchn);
+
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ bind_evtchn_to_cpu(evtchn, 0);
+ evtchn_to_irq[evtchn] = -1;
+ info->evtchn = 0;
+}
+
+static void ack_pirq(unsigned int irq)
+{
+ move_masked_irq(irq);
+
+ pirq_eoi(irq);
+}
+
+static void end_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (WARN_ON(!desc))
+ return;
+
+ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
+ (IRQ_DISABLED|IRQ_PENDING)) {
+ shutdown_pirq(irq);
+ } else if (VALID_EVTCHN(evtchn)) {
+ pirq_eoi(irq);
+ }
+}
+
+static int find_irq_by_gsi(unsigned gsi)
+{
+ int irq;
+
+ for (irq = 0; irq < nr_irqs; irq++) {
+ struct irq_info *info = info_for_irq(irq);
+
+ if (info == NULL || info->type != IRQT_PIRQ)
+ continue;
+
+ if (gsi_from_irq(irq) == gsi)
+ return irq;
+ }
+
+ return -1;
+}
+
+/*
+ * Allocate a physical irq, along with a vector. We don't assign an
+ * event channel until the irq actually started up. Return an
+ * existing irq if we've already got one for the gsi.
+ */
+int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
+{
+ int irq;
+ struct physdev_irq irq_op;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = find_irq_by_gsi(gsi);
+ if (irq != -1) {
+ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
+ irq, gsi);
+ goto out; /* XXX need refcount? */
+ }
+
+ /* A PV guest has no GSIs (no ACPI is passed in), so we use
+ * !xen_initial_domain() to take the identity-mapped path here. */
+ if (identity_mapped_irq(gsi) || !xen_initial_domain()) {
+ irq = gsi;
+ irq_to_desc_alloc_node(irq, 0);
+ dynamic_irq_init(irq);
+ } else
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_fasteoi_irq, name);
+
+ irq_op.irq = gsi;
+ irq_op.vector = 0;
+
+ /* Only the privileged domain can do this. For unprivileged
+ * domains, the pcifront driver exposes a PCI bus whose backend
+ * performs this call in the privileged domain. */
+ if (xen_initial_domain() &&
+ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
+ dynamic_irq_cleanup(irq);
+ irq = -ENOSPC;
+ goto out;
+ }
+
+ irq_info[irq] = mk_pirq_info(0, gsi, irq_op.vector);
+ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
return irq;
}
+#ifdef CONFIG_PCI_MSI
+int xen_destroy_irq(int irq)
+{
+ struct irq_desc *desc;
+ struct physdev_unmap_pirq unmap_irq;
+ struct irq_info *info = info_for_irq(irq);
+ int rc = -ENOENT;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ goto out;
+
+ if (xen_initial_domain()) {
+ unmap_irq.pirq = info->u.pirq.gsi;
+ unmap_irq.domid = info->u.pirq.domid;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
+ if (rc) {
+ printk(KERN_WARNING "unmap irq failed %d\n", rc);
+ goto out;
+ }
+ }
+ irq_info[irq] = mk_unbound_info();
+
+ dynamic_irq_cleanup(irq);
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+ return rc;
+}
+
+#ifdef CONFIG_PCI_XEN
+int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+{
+ int irq = 0;
+ struct physdev_map_pirq map_irq;
+ int rc;
+ domid_t domid;
+ int pos;
+ u32 table_offset, bir;
+
+ domid = rc = xen_find_device_domain_owner(dev);
+ if (rc < 0)
+ domid = DOMID_SELF;
+
+ memset(&map_irq, 0, sizeof(map_irq));
+ map_irq.domid = domid;
+ map_irq.type = MAP_PIRQ_TYPE_MSI;
+ map_irq.index = -1;
+ map_irq.pirq = -1;
+ map_irq.bus = dev->bus->number;
+ map_irq.devfn = dev->devfn;
+
+ if (type == PCI_CAP_ID_MSIX) {
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+ pci_read_config_dword(dev, msix_table_offset_reg(pos),
+ &table_offset);
+ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+ map_irq.table_base = pci_resource_start(dev, bir);
+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = find_unbound_irq();
+
+ if (irq == -1)
+ goto out;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+
+ dynamic_irq_cleanup(irq);
+
+ irq = -1;
+ goto out;
+ }
+ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, map_irq.index);
+ if (domid)
+ irq_info[irq].u.pirq.domid = domid;
+
+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_fasteoi_irq,
+ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+#endif
+#endif
+
+int xen_vector_from_irq(unsigned irq)
+{
+ return vector_from_irq(irq);
+}
+
+int xen_gsi_from_irq(unsigned irq)
+{
+ return gsi_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
+
int bind_evtchn_to_irq(unsigned int evtchn)
{
int irq;
@@ -363,7 +752,7 @@
irq = find_unbound_irq();
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_edge_irq, "event");
+ handle_fasteoi_irq, "event");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_evtchn_info(evtchn);
@@ -410,8 +799,23 @@
return irq;
}
+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+ unsigned int remote_port)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int err;
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ bind_interdomain.remote_dom = remote_domain;
+ bind_interdomain.remote_port = remote_port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+
+ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+}
+
+
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
int evtchn, irq;
@@ -421,6 +825,11 @@
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "virq");
+
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
@@ -428,11 +837,6 @@
BUG();
evtchn = bind_virq.port;
- irq = find_unbound_irq();
-
- set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
- handle_percpu_irq, "virq");
-
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_virq_info(evtchn, virq);
@@ -505,6 +909,29 @@
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
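
bind_interdomain_evtchn_to_irqhandler() is the one-stop helper for backend drivers: once the frontend has published its event-channel port via xenstore, the backend binds that remote port into its own evtchn space and installs a handler in a single call. A hedged usage sketch (hypothetical handler and state, not from this patch):

    static irqreturn_t demo_backend_intr(int irq, void *dev_id)
    {
            /* kick the ring-processing path for this frontend */
            return IRQ_HANDLED;
    }

    /* in the backend connect path, domid/port read from xenstore: */
    irq = bind_interdomain_evtchn_to_irqhandler(frontend_domid, remote_port,
                                                demo_backend_intr, 0,
                                                "demo-backend", be);
    if (irq < 0)
            return irq; /* bind hypercall or request_irq failed */
    /* teardown: unbind_from_irqhandler(irq, be); */
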
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
@@ -564,41 +991,75 @@
{
struct shared_info *sh = HYPERVISOR_shared_info;
int cpu = smp_processor_id();
+ unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu);
int i;
unsigned long flags;
static DEFINE_SPINLOCK(debug_lock);
+ struct vcpu_info *v;
spin_lock_irqsave(&debug_lock, flags);
- printk("vcpu %d\n ", cpu);
+ printk("\nvcpu %d\n ", cpu);
for_each_online_cpu(i) {
- struct vcpu_info *v = per_cpu(xen_vcpu, i);
- printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
- (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
- v->evtchn_upcall_pending,
- v->evtchn_pending_sel);
+ int pending;
+ v = per_cpu(xen_vcpu, i);
+ pending = (get_irq_regs() && i == cpu)
+ ? xen_irqs_disabled(get_irq_regs())
+ : v->evtchn_upcall_mask;
+ printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i,
+ pending, v->evtchn_upcall_pending,
+ (int)(sizeof(v->evtchn_pending_sel)*2),
+ v->evtchn_pending_sel);
}
- printk("pending:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_pending[i],
- i % 8 == 0 ? "\n " : " ");
- printk("\nmasks:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
+ v = per_cpu(xen_vcpu, cpu);
- printk("\nunmasked:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
+ printk("\npending:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
+ sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nglobal mask:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*lx%s",
+ (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nglobally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocal cpu%d mask:\n ", cpu);
+ for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+ cpu_evtchn[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
+ unsigned long pending = sh->evtchn_pending[i]
+ & ~sh->evtchn_mask[i]
+ & cpu_evtchn[i];
+ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+ pending, i % 8 == 0 ? "\n " : " ");
+ }
printk("\npending list:\n");
- for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
if (sync_test_bit(i, sh->evtchn_pending)) {
- printk(" %d: event %d -> irq %d\n",
+ int word_idx = i / BITS_PER_LONG;
+ printk(" %d: event %d -> irq %d%s%s%s\n",
cpu_from_evtchn(i), i,
- evtchn_to_irq[i]);
+ evtchn_to_irq[i],
+ sync_test_bit(word_idx, &v->evtchn_pending_sel)
+ ? "" : " l2-clear",
+ !sync_test_bit(i, sh->evtchn_mask)
+ ? "" : " globally-masked",
+ sync_test_bit(i, cpu_evtchn)
+ ? "" : " locally-masked");
}
}
@@ -608,6 +1069,13 @@
}
static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+static DEFINE_PER_CPU(unsigned int, current_word_idx);
+static DEFINE_PER_CPU(unsigned int, current_bit_idx);
+
+/*
+ * Mask out the i least significant bits of w
+ */
+#define MASK_LSBS(w, i) (w & ((~0UL) << i))
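+/*
+ * Worked example: MASK_LSBS() clears the i least significant bits so a
+ * scan can resume part-way through a word. For w = 0b10110 and i = 2 the
+ * low two bits are dropped, leaving 0b10100, whose __ffs() is 2 -- the
+ * next event to process:
+ *
+ *	unsigned long w = 0x16;                      // 0b10110
+ *	unsigned long next = __ffs(MASK_LSBS(w, 2)); // 0x14 -> bit 2
+ */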
/*
* Search the CPUs pending events bitmasks. For each one found, map
@@ -618,17 +1086,16 @@
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
-void xen_evtchn_do_upcall(struct pt_regs *regs)
+static void __xen_evtchn_do_upcall(struct pt_regs *regs)
{
+ int start_word_idx, start_bit_idx;
+ int word_idx, bit_idx;
+ int i;
int cpu = get_cpu();
- struct pt_regs *old_regs = set_irq_regs(regs);
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
unsigned count;
- exit_idle();
- irq_enter();
-
do {
unsigned long pending_words;
@@ -642,34 +1109,113 @@
wmb();
#endif
pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
+
+ start_word_idx = __get_cpu_var(current_word_idx);
+ start_bit_idx = __get_cpu_var(current_bit_idx);
+
+ word_idx = start_word_idx;
+
+ for (i = 0; pending_words != 0; i++) {
unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
+ unsigned long words;
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
+ words = MASK_LSBS(pending_words, word_idx);
- if (irq != -1)
- handle_irq(irq, regs);
+ /*
+ * If we masked out all events, wrap to beginning.
+ */
+ if (words == 0) {
+ word_idx = 0;
+ bit_idx = 0;
+ continue;
}
+ word_idx = __ffs(words);
+
+ pending_bits = active_evtchns(cpu, s, word_idx);
+ bit_idx = 0; /* usually scan entire word from start */
+ if (word_idx == start_word_idx) {
+ /* We scan the starting word in two parts */
+ if (i == 0)
+ /* 1st time: start in the middle */
+ bit_idx = start_bit_idx;
+ else
+ /* 2nd time: mask bits done already */
+ bit_idx &= (1UL << start_bit_idx) - 1;
+ }
+
+ do {
+ unsigned long bits;
+ int port, irq;
+ struct irq_desc *desc;
+
+ bits = MASK_LSBS(pending_bits, bit_idx);
+
+ /* If we masked out all events, move on. */
+ if (bits == 0)
+ break;
+
+ bit_idx = __ffs(bits);
+
+ /* Process port. */
+ port = (word_idx * BITS_PER_LONG) + bit_idx;
+ irq = evtchn_to_irq[port];
+
+ mask_evtchn(port);
+ clear_evtchn(port);
+
+ if (irq != -1) {
+ desc = irq_to_desc(irq);
+ if (desc)
+ generic_handle_irq_desc(irq, desc);
+ }
+
+ bit_idx = (bit_idx + 1) % BITS_PER_LONG;
+
+ /* Next caller starts at last processed + 1 */
+ __get_cpu_var(current_word_idx) =
+ bit_idx ? word_idx :
+ (word_idx+1) % BITS_PER_LONG;
+ __get_cpu_var(current_bit_idx) = bit_idx;
+ } while (bit_idx != 0);
+
+ /* Scan start_word_idx twice; all others once. */
+ if ((word_idx != start_word_idx) || (i != 0))
+ pending_words &= ~(1UL << word_idx);
+
+ word_idx = (word_idx + 1) % BITS_PER_LONG;
}
BUG_ON(!irqs_disabled());
count = __get_cpu_var(xed_nesting_count);
__get_cpu_var(xed_nesting_count) = 0;
- } while(count != 1);
+ } while (count != 1 || vcpu_info->evtchn_upcall_pending);
out:
- irq_exit();
- set_irq_regs(old_regs);
put_cpu();
}
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ exit_idle();
+ irq_enter();
+
+ __xen_evtchn_do_upcall(regs);
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+void xen_hvm_evtchn_do_upcall(void)
+{
+ struct pt_regs *regs = get_irq_regs();
+ __xen_evtchn_do_upcall(regs);
+}
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
+
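
xen_hvm_evtchn_do_upcall() exists for PV-on-HVM guests whose event notifications arrive as an ordinary interrupt (a PCI INTx or the callback vector) rather than the paravirtual upcall; the interrupt handler simply forwards into the common dispatch loop. A sketch of such a handler, modeled on the xen-platform-pci pattern (hypothetical names):

    static irqreturn_t demo_hvm_evtchn_intr(int irq, void *dev_id)
    {
            xen_hvm_evtchn_do_upcall();
            return IRQ_HANDLED;
    }

    /* request_irq(pdev->irq, demo_hvm_evtchn_intr, IRQF_SHARED,
     *             "xen-platform-pci", pdev); */
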
/* Rebind a new event channel to an existing irq. */
void rebind_evtchn_irq(int evtchn, int irq)
{
@@ -705,7 +1251,10 @@
struct evtchn_bind_vcpu bind_vcpu;
int evtchn = evtchn_from_irq(irq);
- if (!VALID_EVTCHN(evtchn))
+ /* events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 */
+ if (!VALID_EVTCHN(evtchn) ||
+ (xen_hvm_domain() && !xen_have_vector_callback))
return -1;
/* Send future instances of this interrupt to other vcpu. */
@@ -746,33 +1295,18 @@
return 1;
}
-static void enable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
-}
-
-static void disable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- mask_evtchn(evtchn);
-}
-
static void ack_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
- move_native_irq(irq);
+ move_masked_irq(irq);
- if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
+ if (VALID_EVTCHN(evtchn) && !(desc->status & IRQ_DISABLED))
+ unmask_evtchn(evtchn);
}
-static int retrigger_dynirq(unsigned int irq)
+static int retrigger_irq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
struct shared_info *sh = HYPERVISOR_shared_info;
@@ -850,7 +1384,7 @@
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
}
-
+EXPORT_SYMBOL(xen_clear_irq_pending);
void xen_set_irq_pending(int irq)
{
int evtchn = evtchn_from_irq(irq);
@@ -870,9 +1404,9 @@
return ret;
}
-/* Poll waiting for an irq to become pending. In the usual case, the
+/* Poll waiting for an irq to become pending, with a timeout. In the usual case, the
irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq)
+void xen_poll_irq_timeout(int irq, u64 timeout)
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
@@ -880,17 +1414,38 @@
struct sched_poll poll;
poll.nr_ports = 1;
- poll.timeout = 0;
+ poll.timeout = timeout;
set_xen_guest_handle(poll.ports, &evtchn);
if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
BUG();
}
}
+EXPORT_SYMBOL(xen_poll_irq_timeout);
+/* Poll waiting for an irq to become pending. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+ xen_poll_irq_timeout(irq, 0 /* no timeout */);
+}
+
+/* Check whether the IRQ line is shared with other guests. */
+int xen_ignore_irq(int irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+ struct physdev_irq_status_query irq_status = { .irq =
+ info->u.pirq.gsi };
+
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ return 0;
+ return !(irq_status.flags & XENIRQSTAT_shared);
+}
+EXPORT_SYMBOL_GPL(xen_ignore_irq);
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;
+ struct irq_desc *desc;
init_evtchn_cpu_bindings();
@@ -909,37 +1464,136 @@
restore_cpu_virqs(cpu);
restore_cpu_ipis(cpu);
}
+
+ /*
+ * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
+ * are not handled by the IRQ core.
+ */
+ for_each_irq_desc(irq, desc) {
+ if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
+ continue;
+ if (desc->status & IRQ_DISABLED)
+ continue;
+
+ evtchn = evtchn_from_irq(irq);
+ if (evtchn == -1)
+ continue;
+
+ unmask_evtchn(evtchn);
+ }
+
+ if (pirq_eoi_does_unmask) {
+ struct physdev_pirq_eoi_gmfn eoi_gmfn;
+
+ eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits);
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) != 0) {
+ /* Could recover by reverting to old method...? */
+ BUG();
+ }
+ }
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
.name = "xen-dyn",
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
+ .disable = mask_irq,
+ .mask = mask_irq,
+ .unmask = unmask_irq,
- .ack = ack_dynirq,
+ .eoi = ack_dynirq,
.set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
+ .retrigger = retrigger_irq,
};
static struct irq_chip xen_percpu_chip __read_mostly = {
.name = "xen-percpu",
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
+ .disable = mask_irq,
+ .mask = mask_irq,
+ .unmask = unmask_irq,
.ack = ack_dynirq,
};
+static struct irq_chip xen_pirq_chip __read_mostly = {
+ .name = "xen-pirq",
+
+ .startup = startup_pirq,
+ .shutdown = shutdown_pirq,
+
+ .enable = pirq_eoi,
+ .unmask = unmask_irq,
+
+ .disable = mask_irq,
+ .mask = mask_irq,
+
+ .eoi = ack_pirq,
+ .end = end_pirq,
+
+ .set_affinity = set_affinity_irq,
+
+ .retrigger = retrigger_irq,
+};
+
+int xen_set_callback_via(uint64_t via)
+{
+ struct xen_hvm_param a;
+ a.domid = DOMID_SELF;
+ a.index = HVM_PARAM_CALLBACK_IRQ;
+ a.value = via;
+ return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+#ifdef CONFIG_XEN_PVHVM
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions. */
+void xen_callback_vector(void)
+{
+ int rc;
+ uint64_t callback_via;
+ if (xen_have_vector_callback) {
+ callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
+ rc = xen_set_callback_via(callback_via);
+ if (rc) {
+ printk(KERN_ERR "Request for Xen HVM callback vector"
+ " failed.\n");
+ xen_have_vector_callback = 0;
+ return;
+ }
+ printk(KERN_INFO "Xen HVM callback vector for event delivery is "
+ "enabled\n");
+ /* in the restore case the vector has already been allocated */
+ if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
+ alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
+ }
+}
+#else
+void xen_callback_vector(void) {}
+#endif
+
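+/*
+ * The HVM_CALLBACK_VECTOR() value passed to xen_set_callback_via() packs a
+ * delivery type into the high byte of the HVM_PARAM_CALLBACK_IRQ parameter,
+ * with the vector number in the low bits; type "vector" tells Xen to inject
+ * the upcall directly as that vector on whichever vcpu has pending events.
+ * Roughly (sketch; the authoritative encoding lives in
+ * xen/interface/hvm/params.h):
+ *
+ *	// via: [63:56] = delivery type, low bits = type-specific data
+ *	#define DEMO_CB_TYPE_VECTOR	2ULL
+ *	#define DEMO_CB_VECTOR(x)	((DEMO_CB_TYPE_VECTOR << 56) | (x))
+ *
+ *	// xen_set_callback_via(DEMO_CB_VECTOR(XEN_HVM_EVTCHN_CALLBACK));
+ */
+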
void __init xen_init_IRQ(void)
{
int i;
+ struct physdev_pirq_eoi_gmfn eoi_gmfn;
+ int nr_pirqs = NR_IRQS;
cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
GFP_KERNEL);
- BUG_ON(cpu_evtchn_mask_p == NULL);
+ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
+
+ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+ GFP_KERNEL);
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ evtchn_to_irq[i] = -1;
+
+ i = get_order(sizeof(unsigned long) * BITS_TO_LONGS(nr_pirqs));
+ pirq_needs_eoi_bits = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, i);
+
+ eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits);
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
+ pirq_eoi_does_unmask = true;
init_evtchn_cpu_bindings();
@@ -947,5 +1601,11 @@
for (i = 0; i < NR_EVENT_CHANNELS; i++)
mask_evtchn(i);
- irq_ctx_init(smp_processor_id());
+ if (xen_hvm_domain()) {
+ xen_callback_vector();
+ native_init_IRQ();
+ } else {
+ irq_ctx_init(smp_processor_id());
+ xen_setup_pirqs();
+ }
}
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 79bedba..b82666a 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -48,6 +48,8 @@
#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
+
+#include <xen/xen.h>
#include <xen/events.h>
#include <xen/evtchn.h>
#include <asm/xen/hypervisor.h>
@@ -68,10 +70,36 @@
const char *name;
};
-/* Who's bound to each port? */
-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+/*
+ * Who's bound to each port? This is logically an array of struct
+ * per_user_data *, but we encode the current enabled-state in bit 0.
+ */
+static unsigned long *port_user;
static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+static inline struct per_user_data *get_port_user(unsigned port)
+{
+ return (struct per_user_data *)(port_user[port] & ~1);
+}
+
+static inline void set_port_user(unsigned port, struct per_user_data *u)
+{
+ port_user[port] = (unsigned long)u;
+}
+
+static inline bool get_port_enabled(unsigned port)
+{
+ return port_user[port] & 1;
+}
+
+static inline void set_port_enabled(unsigned port, bool enabled)
+{
+ if (enabled)
+ port_user[port] |= 1;
+ else
+ port_user[port] &= ~1;
+}
+
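+/*
+ * Stashing the enabled flag in bit 0 of port_user[] works because a
+ * kzalloc()'d struct per_user_data * is always at least word-aligned,
+ * leaving bit 0 free; the cost is that every access must go through the
+ * accessors above, since a raw read would return a mis-tagged pointer.
+ * The trick in isolation (sketch):
+ *
+ *	unsigned long slot = (unsigned long)u;           // aligned pointer
+ *	slot |= 1;                                       // tag: enabled
+ *	struct per_user_data *p = (void *)(slot & ~1UL); // untag
+ *	bool enabled = slot & 1;
+ */
+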
irqreturn_t evtchn_interrupt(int irq, void *data)
{
unsigned int port = (unsigned long)data;
@@ -79,9 +107,14 @@
spin_lock(&port_user_lock);
- u = port_user[port];
+ u = get_port_user(port);
+
+ WARN(!get_port_enabled(port),
+ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+ port, u);
disable_irq_nosync(irq);
+ set_port_enabled(port, false);
if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
@@ -91,9 +124,8 @@
kill_fasync(&u->evtchn_async_queue,
SIGIO, POLL_IN);
}
- } else {
+ } else
u->ring_overflow = 1;
- }
spin_unlock(&port_user_lock);
@@ -197,9 +229,18 @@
goto out;
spin_lock_irq(&port_user_lock);
- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
- enable_irq(irq_from_evtchn(kbuf[i]));
+
+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
+ unsigned port = kbuf[i];
+
+ if (port < NR_EVENT_CHANNELS &&
+ get_port_user(port) == u &&
+ !get_port_enabled(port)) {
+ set_port_enabled(port, true);
+ enable_irq(irq_from_evtchn(port));
+ }
+ }
+
spin_unlock_irq(&port_user_lock);
rc = count;
@@ -221,8 +262,9 @@
* interrupt handler yet, and our caller has already
* serialized bind operations.)
*/
- BUG_ON(port_user[port] != NULL);
- port_user[port] = u;
+ BUG_ON(get_port_user(port) != NULL);
+ set_port_user(port, u);
+ set_port_enabled(port, true); /* start enabled */
rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
u->name, (void *)(unsigned long)port);
@@ -238,10 +280,7 @@
unbind_from_irqhandler(irq, (void *)(unsigned long)port);
- /* make sure we unbind the irq handler before clearing the port */
- barrier();
-
- port_user[port] = NULL;
+ set_port_user(port, NULL);
}
static long evtchn_ioctl(struct file *file,
@@ -332,15 +371,17 @@
spin_lock_irq(&port_user_lock);
rc = -ENOTCONN;
- if (port_user[unbind.port] != u) {
+ if (get_port_user(unbind.port) != u) {
spin_unlock_irq(&port_user_lock);
break;
}
- evtchn_unbind_from_user(u, unbind.port);
+ disable_irq(irq_from_evtchn(unbind.port));
spin_unlock_irq(&port_user_lock);
+ evtchn_unbind_from_user(u, unbind.port);
+
rc = 0;
break;
}
@@ -354,7 +395,7 @@
if (notify.port >= NR_EVENT_CHANNELS) {
rc = -EINVAL;
- } else if (port_user[notify.port] != u) {
+ } else if (get_port_user(notify.port) != u) {
rc = -ENOTCONN;
} else {
notify_remote_via_evtchn(notify.port);
@@ -443,14 +484,21 @@
free_page((unsigned long)u->ring);
for (i = 0; i < NR_EVENT_CHANNELS; i++) {
- if (port_user[i] != u)
+ if (get_port_user(i) != u)
continue;
- evtchn_unbind_from_user(port_user[i], i);
+ disable_irq(irq_from_evtchn(i));
}
spin_unlock_irq(&port_user_lock);
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (get_port_user(i) != u)
+ continue;
+
+ evtchn_unbind_from_user(get_port_user(i), i);
+ }
+
kfree(u->name);
kfree(u);
@@ -470,7 +518,7 @@
static struct miscdevice evtchn_miscdev = {
.minor = MISC_DYNAMIC_MINOR,
- .name = "evtchn",
+ .name = "xen/evtchn",
.fops = &evtchn_fops,
};
static int __init evtchn_init(void)
@@ -480,8 +528,11 @@
if (!xen_domain())
return -ENODEV;
+ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+ if (port_user == NULL)
+ return -ENOMEM;
+
spin_lock_init(&port_user_lock);
- memset(port_user, 0, sizeof(port_user));
/* Create '/dev/misc/evtchn'. */
err = misc_register(&evtchn_miscdev);
@@ -497,6 +548,9 @@
static void __exit evtchn_cleanup(void)
{
+ kfree(port_user);
+ port_user = NULL;
+
misc_deregister(&evtchn_miscdev);
}
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
index 99eda16..9e2b64f 100644
--- a/drivers/xen/features.c
+++ b/drivers/xen/features.c
@@ -18,7 +18,7 @@
u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
EXPORT_SYMBOL_GPL(xen_features);
-void xen_setup_features(void)
+void __init xen_setup_features(void)
{
struct xen_feature_info fi;
int i, j;
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 0000000..a33e443
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,645 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/gntdev.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+ "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int debug = 0;
+module_param(debug, int, 0644);
+static int limit = 1024;
+module_param(limit, int, 0644);
+
+struct gntdev_priv {
+ struct list_head maps;
+ uint32_t used;
+ uint32_t limit;
+ spinlock_t lock;
+ struct mm_struct *mm;
+ struct mmu_notifier mn;
+};
+
+struct grant_map {
+ struct list_head next;
+ struct gntdev_priv *priv;
+ struct vm_area_struct *vma;
+ int index;
+ int count;
+ int flags;
+ int is_mapped;
+ struct ioctl_gntdev_grant_ref *grants;
+ struct gnttab_map_grant_ref *map_ops;
+ struct gnttab_unmap_grant_ref *unmap_ops;
+};
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+ char *text, int text_index)
+{
+ struct grant_map *map;
+
+ printk("%s: maps list (priv %p, usage %d/%d)\n",
+ __FUNCTION__, priv, priv->used, priv->limit);
+ list_for_each_entry(map, &priv->maps, next)
+ printk(" index %2d, count %2d %s\n",
+ map->index, map->count,
+ map->index == text_index && text ? text : "");
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+ struct grant_map *add;
+
+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+ if (NULL == add)
+ return NULL;
+
+ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
+ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
+ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+ if (NULL == add->grants ||
+ NULL == add->map_ops ||
+ NULL == add->unmap_ops)
+ goto err;
+
+ add->index = 0;
+ add->count = count;
+ add->priv = priv;
+
+ if (add->count + priv->used > priv->limit)
+ goto err;
+
+ return add;
+
+err:
+ kfree(add->grants);
+ kfree(add->map_ops);
+ kfree(add->unmap_ops);
+ kfree(add);
+ return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (add->index + add->count < map->index) {
+ list_add_tail(&add->next, &map->next);
+ goto done;
+ }
+ add->index = map->index + map->count;
+ }
+ list_add_tail(&add->next, &priv->maps);
+
+done:
+ priv->used += add->count;
+ if (debug)
+ gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
+ int count)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (map->index != index)
+ continue;
+ if (map->count != count)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
+ unsigned long vaddr)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (vaddr < map->vma->vm_start)
+ continue;
+ if (vaddr >= map->vma->vm_end)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static int gntdev_del_map(struct grant_map *map)
+{
+ int i;
+
+ if (map->vma)
+ return -EBUSY;
+ for (i = 0; i < map->count; i++)
+ if (map->unmap_ops[i].handle)
+ return -EBUSY;
+
+ map->priv->used -= map->count;
+ list_del(&map->next);
+ return 0;
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+ if (!map)
+ return;
+ kfree(map->grants);
+ kfree(map->map_ops);
+ kfree(map->unmap_ops);
+ kfree(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
+{
+ struct grant_map *map = data;
+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ u64 pte_maddr;
+
+ BUG_ON(pgnr >= map->count);
+ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
+ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
+ map->grants[pgnr].ref,
+ map->grants[pgnr].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
+ 0 /* handle */);
+ return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+ int i, err = 0;
+
+ if (debug)
+ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
+ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ map->map_ops, map->count);
+ if (WARN_ON(err))
+ return err;
+
+ for (i = 0; i < map->count; i++) {
+ if (map->map_ops[i].status)
+ err = -EINVAL;
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ }
+ return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int i, err = 0;
+
+ if (debug)
+ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
+ map->index, map->count, offset, pages);
+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ map->unmap_ops + offset, pages);
+ if (WARN_ON(err))
+ return err;
+
+ for (i = 0; i < pages; i++) {
+ if (map->unmap_ops[offset+i].status)
+ err = -EINVAL;
+ map->unmap_ops[offset+i].handle = 0;
+ }
+ return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+ struct grant_map *map = vma->vm_private_data;
+
+ if (debug)
+ printk("%s\n", __FUNCTION__);
+ map->is_mapped = 0;
+ map->vma = NULL;
+ vma->vm_private_data = NULL;
+}
+
+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ if (debug)
+ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
+ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
+ vmf->flags = VM_FAULT_ERROR;
+ return 0;
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+ .close = gntdev_vma_close,
+ .fault = gntdev_vma_fault,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ unsigned long mstart, mend;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (!map->is_mapped)
+ continue;
+ if (map->vma->vm_start >= end)
+ continue;
+ if (map->vma->vm_end <= start)
+ continue;
+ mstart = max(start, map->vma->vm_start);
+ mend = min(end, map->vma->vm_end);
+ if (debug)
+ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+ __FUNCTION__, map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end,
+ start, end, mstart, mend);
+ err = unmap_grant_pages(map,
+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
+ (mend - mstart) >> PAGE_SHIFT);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (debug)
+ printk("%s: map %d+%d (%lx %lx)\n",
+ __FUNCTION__, map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end);
+ err = unmap_grant_pages(map, 0, map->count);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+struct mmu_notifier_ops gntdev_mmu_ops = {
+ .release = mn_release,
+ .invalidate_page = mn_invl_page,
+ .invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&priv->maps);
+ spin_lock_init(&priv->lock);
+ priv->limit = limit;
+
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
+
+ flip->private_data = priv;
+ if (debug)
+ printk("%s: priv %p\n", __FUNCTION__, priv);
+
+ return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ struct grant_map *map;
+ int err;
+
+ if (debug)
+ printk("%s: priv %p\n", __FUNCTION__, priv);
+
+ spin_lock(&priv->lock);
+ while (!list_empty(&priv->maps)) {
+ map = list_entry(priv->maps.next, struct grant_map, next);
+ err = gntdev_del_map(map);
+ if (!WARN_ON(err))
+ gntdev_free_map(map);
+
+ }
+ spin_unlock(&priv->lock);
+
+ mmu_notifier_unregister(&priv->mn, priv->mm);
+ kfree(priv);
+ return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_map_grant_ref __user *u)
+{
+ struct ioctl_gntdev_map_grant_ref op;
+ struct grant_map *map;
+ int err;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
+ op.count);
+ if (unlikely(op.count <= 0))
+ return -EINVAL;
+ if (unlikely(op.count > priv->limit))
+ return -EINVAL;
+
+ err = -ENOMEM;
+ map = gntdev_alloc_map(priv, op.count);
+ if (!map)
+ return err;
+ if (copy_from_user(map->grants, &u->refs,
+ sizeof(map->grants[0]) * op.count) != 0) {
+ gntdev_free_map(map);
+ return err;
+ }
+
+ spin_lock(&priv->lock);
+ gntdev_add_map(priv, map);
+ op.index = map->index << PAGE_SHIFT;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0) {
+ spin_lock(&priv->lock);
+ gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ gntdev_free_map(map);
+ return err;
+ }
+ return 0;
+}
+
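
From user space the flow is: open /dev/xen/gntdev, reserve a slot with IOCTL_GNTDEV_MAP_GRANT_REF, then pass the returned index as the mmap() offset so gntdev_mmap() can locate the reservation. A hedged sketch for a single page (assumes the structs from xen/gntdev.h; remote_domid and gref stand in for whatever the granting domain published; error handling elided):

    int fd = open("/dev/xen/gntdev", O_RDWR);
    struct ioctl_gntdev_map_grant_ref op = { .count = 1 };

    op.refs[0].domid = remote_domid;   /* granting domain */
    op.refs[0].ref   = gref;           /* its grant reference */
    if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) == 0) {
            /* op.index is the byte offset gntdev_mmap() looks up */
            void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_SHARED, fd, op.index);
            /* use p; munmap() + IOCTL_GNTDEV_UNMAP_GRANT_REF to release */
    }
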
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+ struct ioctl_gntdev_unmap_grant_ref op;
+ struct grant_map *map;
+ int err = -EINVAL;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
+ (int)op.index, (int)op.count);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+ if (map)
+ err = gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ if (!err)
+ gntdev_free_map(map);
+ return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+ struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct grant_map *map;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
+ (unsigned long)op.vaddr);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_vaddr(priv, op.vaddr);
+ if (map == NULL ||
+ map->vma->vm_start != op.vaddr) {
+ spin_unlock(&priv->lock);
+ return -EINVAL;
+ }
+ op.offset = map->index << PAGE_SHIFT;
+ op.count = map->count;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
+ struct ioctl_gntdev_set_max_grants __user *u)
+{
+ struct ioctl_gntdev_set_max_grants op;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
+ if (op.count > limit)
+ return -EINVAL;
+
+ spin_lock(&priv->lock);
+ priv->limit = op.count;
+ spin_unlock(&priv->lock);
+ return 0;
+}
+
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ void __user *ptr = (void __user *)arg;
+
+ switch (cmd) {
+ case IOCTL_GNTDEV_MAP_GRANT_REF:
+ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
+ return gntdev_ioctl_set_max_grants(priv, ptr);
+
+ default:
+ if (debug)
+ printk("%s: priv %p, unknown cmd %x\n",
+ __FUNCTION__, priv, cmd);
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ int index = vma->vm_pgoff;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ struct grant_map *map;
+ int err = -EINVAL;
+
+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ if (debug)
+ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
+ index, count, vma->vm_start, vma->vm_pgoff);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, index, count);
+ if (!map)
+ goto unlock_out;
+ if (map->vma)
+ goto unlock_out;
+ if (priv->mm != vma->vm_mm) {
+ printk("%s: Huh? Other mm?\n", __FUNCTION__);
+ goto unlock_out;
+ }
+
+ vma->vm_ops = &gntdev_vmops;
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTCOPY;
+ vma->vm_flags |= VM_DONTEXPAND;
+
+ vma->vm_private_data = map;
+ map->vma = vma;
+
+ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ if (debug)
+ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
+ goto unlock_out;
+ }
+
+ err = map_grant_pages(map);
+ if (err) {
+ if (debug)
+ printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
+ goto unlock_out;
+ }
+ map->is_mapped = 1;
+
+unlock_out:
+ spin_unlock(&priv->lock);
+ return err;
+}
+
+static const struct file_operations gntdev_fops = {
+ .owner = THIS_MODULE,
+ .open = gntdev_open,
+ .release = gntdev_release,
+ .mmap = gntdev_mmap,
+ .unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntdev",
+ .fops = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntdev_miscdev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register gntdev device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+ misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 7d8f531..09bb742 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -36,10 +36,13 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>
#include <asm/pgtable.h>
@@ -57,6 +60,8 @@
static int gnttab_free_count;
static grant_ref_t gnttab_free_head;
static DEFINE_SPINLOCK(gnttab_list_lock);
+unsigned long xen_hvm_resume_frames;
+EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
static struct grant_entry *shared;
@@ -431,7 +436,7 @@
return query.max_nr_frames;
}
-static inline unsigned int max_nr_grant_frames(void)
+unsigned int gnttab_max_grant_frames(void)
{
unsigned int xen_max = __max_nr_grant_frames();
@@ -439,6 +444,7 @@
return boot_max_nr_grant_frames;
return xen_max;
}
+EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
@@ -447,6 +453,30 @@
unsigned int nr_gframes = end_idx + 1;
int rc;
+ if (xen_hvm_domain()) {
+ struct xen_add_to_physmap xatp;
+ unsigned int i = end_idx;
+ rc = 0;
+ /*
+ * Loop backwards, so that the first hypercall has the largest
+ * index, ensuring that the table will grow only once.
+ */
+ do {
+ xatp.domid = DOMID_SELF;
+ xatp.idx = i;
+ xatp.space = XENMAPSPACE_grant_table;
+ xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
+ if (rc != 0) {
+ printk(KERN_WARNING
+ "grant table add_to_physmap failed, err=%d\n", rc);
+ break;
+ }
+ } while (i-- > start_idx);
+
+ return rc;
+ }
+
frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
if (!frames)
return -ENOMEM;
@@ -463,7 +493,7 @@
BUG_ON(rc || setup.status);
- rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+ rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
&shared);
BUG_ON(rc);
@@ -472,11 +502,127 @@
return 0;
}
+static void gnttab_page_free(struct page *page, unsigned int order)
+{
+ BUG_ON(order);
+ ClearPageForeign(page);
+ gnttab_reset_grant_page(page);
+ put_page(page);
+}
+
+/*
+ * Must not be called with IRQs off. This should only be used on the
+ * slow path.
+ *
+ * Copy a foreign granted page to local memory.
+ */
+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
+{
+ struct gnttab_unmap_and_replace unmap;
+ struct mmu_update mmu;
+ struct page *page;
+ struct page *new_page;
+ void *new_addr;
+ void *addr;
+ unsigned long pfn;
+ unsigned long mfn;
+ unsigned long new_mfn;
+ int err;
+
+ page = *pagep;
+ if (!get_page_unless_zero(page))
+ return -ENOENT;
+
+ err = -ENOMEM;
+ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+ if (!new_page)
+ goto out;
+
+ new_addr = page_address(new_page);
+ addr = page_address(page);
+ memcpy(new_addr, addr, PAGE_SIZE);
+
+ pfn = page_to_pfn(page);
+ mfn = pfn_to_mfn(pfn);
+ new_mfn = virt_to_mfn(new_addr);
+
+ /* Make seq visible before checking page_mapped. */
+ smp_mb();
+
+ /* Has the page been DMA-mapped? */
+ if (unlikely(page_mapped(page))) {
+ put_page(new_page);
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ set_phys_to_machine(pfn, new_mfn);
+
+ unmap.host_addr = (unsigned long)addr;
+ unmap.new_addr = (unsigned long)new_addr;
+ unmap.handle = ref;
+
+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
+ &unmap, 1);
+ BUG_ON(err);
+ BUG_ON(unmap.status);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
+
+ mmu.ptr = PFN_PHYS(new_mfn) | MMU_MACHPHYS_UPDATE;
+ mmu.val = pfn;
+ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
+ BUG_ON(err);
+ }
+
+ new_page->mapping = page->mapping;
+ SetPageForeign(new_page, _PageForeignDestructor(page));
+ if (PageReserved(page))
+ SetPageReserved(new_page);
+ *pagep = new_page;
+
+ SetPageForeign(page, gnttab_page_free);
+ ClearPageReserved(page);
+ page->mapping = NULL;
+
+out:
+ put_page(page);
+ return err;
+}
+EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
+
+void gnttab_reset_grant_page(struct page *page)
+{
+ init_page_count(page);
+ reset_page_mapcount(page);
+}
+EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
+
int gnttab_resume(void)
{
- if (max_nr_grant_frames() < nr_grant_frames)
+ unsigned int max_nr_gframes;
+
+ max_nr_gframes = gnttab_max_grant_frames();
+ if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
- return gnttab_map(0, nr_grant_frames - 1);
+
+ if (xen_pv_domain())
+ return gnttab_map(0, nr_grant_frames - 1);
+
+ if (!shared) {
+ shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
+ if (shared == NULL) {
+ printk(KERN_WARNING
+ "Failed to ioremap gnttab share frames!");
+ return -ENOMEM;
+ }
+ }
+
+ gnttab_map(0, nr_grant_frames - 1);
+
+ return 0;
}
int gnttab_suspend(void)
@@ -493,7 +639,7 @@
cur = nr_grant_frames;
extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
GREFS_PER_GRANT_FRAME);
- if (cur + extra > max_nr_grant_frames())
+ if (cur + extra > gnttab_max_grant_frames())
return -ENOSPC;
rc = gnttab_map(cur, cur + extra - 1);
@@ -503,15 +649,12 @@
return rc;
}
-static int __devinit gnttab_init(void)
+int gnttab_init(void)
{
int i;
unsigned int max_nr_glist_frames, nr_glist_frames;
unsigned int nr_init_grefs;
- if (!xen_domain())
- return -ENODEV;
-
nr_grant_frames = 1;
boot_max_nr_grant_frames = __max_nr_grant_frames();
@@ -554,5 +697,18 @@
kfree(gnttab_list);
return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(gnttab_init);
-core_initcall(gnttab_init);
+static int __devinit __gnttab_init(void)
+{
+ /* Delay grant-table initialization in the PV-on-HVM case */
+ if (xen_hvm_domain())
+ return 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ return gnttab_init();
+}
+
+core_initcall(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 5d42d55..9efdfb9 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -8,6 +8,7 @@
#include <linux/stop_machine.h>
#include <linux/freezer.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
@@ -31,37 +32,62 @@
/* Ignore multiple shutdown requests. */
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
-#ifdef CONFIG_PM_SLEEP
+struct suspend_info {
+ int cancelled;
+ unsigned long arg; /* extra hypercall argument */
+ void (*pre)(void);
+ void (*post)(int cancelled);
+};
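+
+/*
+ * do_suspend() below fills this in differently for the two guest types:
+ * a PV-on-HVM guest passes arg == 0 and needs no pre hook, while a PV
+ * guest passes the start_info MFN and pins/unpins its page tables and
+ * grant table around the suspend hypercall.
+ */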
+
+static void xen_hvm_post_suspend(int cancelled)
+{
+ xen_arch_hvm_post_suspend(cancelled);
+ gnttab_resume();
+}
+
+static void xen_pre_suspend(void)
+{
+ xen_mm_pin_all();
+ gnttab_suspend();
+ xen_arch_pre_suspend();
+}
+
+static void xen_post_suspend(int cancelled)
+{
+ xen_arch_post_suspend(cancelled);
+ gnttab_resume();
+ xen_mm_unpin_all();
+}
+
+#ifdef CONFIG_HIBERNATION
static int xen_suspend(void *data)
{
- int *cancelled = data;
+ struct suspend_info *si = data;
int err;
BUG_ON(!irqs_disabled());
- err = sysdev_suspend(PMSG_SUSPEND);
+ err = sysdev_suspend(PMSG_FREEZE);
if (err) {
printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
err);
return err;
}
- xen_mm_pin_all();
- gnttab_suspend();
- xen_pre_suspend();
+ if (si->pre)
+ si->pre();
/*
* This hypercall returns 1 if suspend was cancelled
* or the domain was merely checkpointed, and 0 if it
* is resuming in a new domain.
*/
- *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+ si->cancelled = HYPERVISOR_suspend(si->arg);
- xen_post_suspend(*cancelled);
- gnttab_resume();
- xen_mm_unpin_all();
+ if (si->post)
+ si->post(si->cancelled);
- if (!*cancelled) {
+ if (!si->cancelled) {
xen_irq_resume();
xen_console_resume();
xen_timer_resume();
@@ -75,7 +101,7 @@
static void do_suspend(void)
{
int err;
- int cancelled = 1;
+ struct suspend_info si;
shutting_down = SHUTDOWN_SUSPEND;
@@ -96,7 +122,7 @@
}
#endif
- err = dpm_suspend_start(PMSG_SUSPEND);
+ err = dpm_suspend_start(PMSG_FREEZE);
if (err) {
printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
goto out_thaw;
@@ -105,29 +131,41 @@
printk(KERN_DEBUG "suspending xenstore...\n");
xs_suspend();
- err = dpm_suspend_noirq(PMSG_SUSPEND);
+ err = dpm_suspend_noirq(PMSG_FREEZE);
if (err) {
printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
goto out_resume;
}
- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+ si.cancelled = 1;
- dpm_resume_noirq(PMSG_RESUME);
+ if (xen_hvm_domain()) {
+ si.arg = 0UL;
+ si.pre = NULL;
+ si.post = &xen_hvm_post_suspend;
+ } else {
+ si.arg = virt_to_mfn(xen_start_info);
+ si.pre = &xen_pre_suspend;
+ si.post = &xen_post_suspend;
+ }
+
+ err = stop_machine(xen_suspend, &si, cpumask_of(0));
+
+ dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
if (err) {
printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
- cancelled = 1;
+ si.cancelled = 1;
}
out_resume:
- if (!cancelled) {
+ if (!si.cancelled) {
xen_arch_resume();
xs_resume();
} else
xs_suspend_cancel();
- dpm_resume_end(PMSG_RESUME);
+ dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
/* Make sure timer events get retriggered on all CPUs */
clock_was_set();
@@ -143,7 +181,24 @@
out:
shutting_down = SHUTDOWN_INVALID;
}
-#endif /* CONFIG_PM_SLEEP */
+#endif /* CONFIG_HIBERNATION */
+
+struct shutdown_handler {
+ const char *command;
+ void (*cb)(void);
+};
+
+static void do_poweroff(void)
+{
+ shutting_down = SHUTDOWN_POWEROFF;
+ orderly_poweroff(false);
+}
+
+static void do_reboot(void)
+{
+ shutting_down = SHUTDOWN_POWEROFF; /* ? */
+ ctrl_alt_del();
+}
static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len)
@@ -151,6 +206,16 @@
char *str;
struct xenbus_transaction xbt;
int err;
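+ /*
+ * Commands the toolstack may write to this domain's
+ * control/shutdown xenstore node.
+ */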
+ static struct shutdown_handler handlers[] = {
+ { "poweroff", do_poweroff },
+ { "halt", do_poweroff },
+ { "reboot", do_reboot },
+#ifdef CONFIG_HIBERNATION
+ { "suspend", do_suspend },
+#endif
+ {NULL, NULL},
+ };
+ static struct shutdown_handler *handler;
if (shutting_down != SHUTDOWN_INVALID)
return;
@@ -167,7 +232,14 @@
return;
}
- xenbus_write(xbt, "control", "shutdown", "");
+ for (handler = &handlers[0]; handler->command; handler++) {
+ if (strcmp(str, handler->command) == 0)
+ break;
+ }
+
+ /* Only acknowledge commands which we are prepared to handle. */
+ if (handler->cb)
+ xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
@@ -175,17 +247,8 @@
goto again;
}
- if (strcmp(str, "poweroff") == 0 ||
- strcmp(str, "halt") == 0) {
- shutting_down = SHUTDOWN_POWEROFF;
- orderly_poweroff(false);
- } else if (strcmp(str, "reboot") == 0) {
- shutting_down = SHUTDOWN_POWEROFF; /* ? */
- ctrl_alt_del();
-#ifdef CONFIG_PM_SLEEP
- } else if (strcmp(str, "suspend") == 0) {
- do_suspend();
-#endif
+ if (handler->cb) {
+ handler->cb();
} else {
printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
@@ -260,14 +323,18 @@
return NOTIFY_DONE;
}
-static int __init setup_shutdown_event(void)
+int xen_setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
+
+ if (!xen_domain())
+ return -ENODEV;
register_xenstore_notifier(&xenstore_notifier);
return 0;
}
+EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
-subsys_initcall(setup_shutdown_event);
+subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/mce.c b/drivers/xen/mce.c
new file mode 100644
index 0000000..da566a5
--- /dev/null
+++ b/drivers/xen/mce.c
@@ -0,0 +1,216 @@
+/******************************************************************************
+ * mce.c
+ * Add Machine Check event Logging support in DOM0
+ *
+ * Driver for receiving and logging machine check event
+ *
+ * Copyright (c) 2008, 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <xen/interface/xen.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/events.h>
+#include <xen/interface/vcpu.h>
+#include <asm/xen/hypercall.h>
+#include <asm/mce.h>
+#include <xen/xen.h>
+
+static mc_info_t *g_mi;
+static mcinfo_logical_cpu_t *g_physinfo;
+static uint32_t ncpus;
+
+static int convert_log(struct mc_info *mi)
+{
+ struct mcinfo_common *mic = NULL;
+ struct mcinfo_global *mc_global;
+ struct mcinfo_bank *mc_bank;
+ struct mce m;
+ int i, found = 0;
+
+ x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
+ WARN_ON(!mic);
+
+ mce_setup(&m);
+ mc_global = (struct mcinfo_global *)mic;
+ m.mcgstatus = mc_global->mc_gstatus;
+ m.apicid = mc_global->mc_apicid;
+ for (i = 0; i < ncpus; i++) {
+ if (g_physinfo[i].mc_apicid == m.apicid) {
+ found = 1;
+ break;
+ }
+ }
+ WARN_ON(!found);
+
+ m.socketid = g_physinfo[i].mc_chipid;
+ m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
+ m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
+ m.mcgcap = g_physinfo[i].mc_msrvalues[0].value;
+ x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
+ do {
+ if (mic == NULL || mic->size == 0)
+ break;
+ if (mic->type == MC_TYPE_BANK) {
+ mc_bank = (struct mcinfo_bank *)mic;
+ m.misc = mc_bank->mc_misc;
+ m.status = mc_bank->mc_status;
+ m.addr = mc_bank->mc_addr;
+ m.tsc = mc_bank->mc_tsc;
+ m.bank = mc_bank->mc_bank;
+ m.finished = 1;
+ /* log this record */
+ mce_log(&m);
+ }
+ mic = x86_mcinfo_next(mic);
+ } while (1);
+
+ return 0;
+}
+
+/* pv_ops domain MCE virq handler, logging physical MCE error info */
+static irqreturn_t mce_dom_interrupt(int irq, void *dev_id)
+{
+ xen_mc_t mc_op;
+ int result = 0;
+
+ mc_op.cmd = XEN_MC_fetch;
+ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+ set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi);
+urgent:
+ mc_op.u.mc_fetch.flags = XEN_MC_URGENT;
+ result = HYPERVISOR_mca(&mc_op);
+ if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+ goto nonurgent;
+ else {
+ result = convert_log(g_mi);
+ if (result)
+ goto end;
+ /* After fetching the error event log entry, we need to
+ * decrement its refcnt and release the entry; the entry
+ * was reserved, and its refcnt incremented, when it was
+ * filled in.
+ */
+ mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
+ result = HYPERVISOR_mca(&mc_op);
+ goto urgent;
+ }
+nonurgent:
+ mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT;
+ result = HYPERVISOR_mca(&mc_op);
+ if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+ goto end;
+ else {
+ result = convert_log(g_mi);
+ if (result)
+ goto end;
+ /* After fetching the error event log entry, we need to
+ * decrement its refcnt and release the entry; the entry
+ * was reserved, and its refcnt incremented, when it was
+ * filled in.
+ */
+ mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
+ result = HYPERVISOR_mca(&mc_op);
+ goto nonurgent;
+ }
+end:
+ return IRQ_HANDLED;
+}
+
+static int bind_virq_for_mce(void)
+{
+ int ret;
+ xen_mc_t mc_op;
+
+ g_mi = kmalloc(sizeof(struct mc_info), GFP_KERNEL);
+
+ if (!g_mi)
+ return -ENOMEM;
+
+ /* Fetch the number of physical CPUs */
+ mc_op.cmd = XEN_MC_physcpuinfo;
+ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ printk(KERN_ERR "MCE_DOM0_LOG: Fail to get physical CPU numbers\n");
+ kfree(g_mi);
+ return ret;
+ }
+
+ /* Fetch each physical CPU's info for later reference */
+ ncpus = mc_op.u.mc_physcpuinfo.ncpus;
+ g_physinfo = kmalloc(sizeof(struct mcinfo_logical_cpu)*ncpus,
+ GFP_KERNEL);
+ if (!g_physinfo) {
+ kfree(g_mi);
+ return -ENOMEM;
+ }
+ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ printk(KERN_ERR "MCE_DOM0_LOG: Fail to get physical CPUs info\n");
+ kfree(g_mi);
+ kfree(g_physinfo);
+ return ret;
+ }
+
+ ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
+ mce_dom_interrupt, 0, "mce", NULL);
+
+ if (ret < 0) {
+ printk(KERN_ERR "MCE_DOM0_LOG: bind_virq for DOM0 failed\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __init mcelog_init(void)
+{
+ /* Only DOM0 is responsible for MCE logging */
+ if (xen_initial_domain())
+ return bind_virq_for_mce();
+
+ return 0;
+}
+
+static void __exit mcelog_cleanup(void)
+{
+ kfree(g_mi);
+ kfree(g_physinfo);
+}
+module_init(mcelog_init);
+module_exit(mcelog_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
new file mode 100644
index 0000000..e346e81
--- /dev/null
+++ b/drivers/xen/netback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
+
+xen-netback-y := netback.o xenbus.o interface.o
diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
new file mode 100644
index 0000000..2f3ccb3
--- /dev/null
+++ b/drivers/xen/netback/common.h
@@ -0,0 +1,327 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/common.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __NETIF__BACKEND__COMMON_H__
+#define __NETIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include <xen/interface/io/netif.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <xen/interface/grant_table.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+#define DPRINTK(_f, _a...) \
+ pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__, __LINE__, ##_a)
+#define IPRINTK(fmt, args...) \
+ printk(KERN_INFO "xen_net: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+ printk(KERN_WARNING "xen_net: " fmt, ##args)
+
+struct xen_netif {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ int group;
+ unsigned int handle;
+
+ u8 fe_dev_addr[6];
+
+ /* Physical parameters of the comms window. */
+ grant_handle_t tx_shmem_handle;
+ grant_ref_t tx_shmem_ref;
+ grant_handle_t rx_shmem_handle;
+ grant_ref_t rx_shmem_ref;
+ unsigned int irq;
+
+ /* The shared rings and indexes. */
+ struct xen_netif_tx_back_ring tx;
+ struct xen_netif_rx_back_ring rx;
+ struct vm_struct *tx_comms_area;
+ struct vm_struct *rx_comms_area;
+
+ /* Flags that must not be set in dev->features */
+ int features_disabled;
+
+ /* Frontend feature information. */
+ u8 can_sg:1;
+ u8 gso:1;
+ u8 gso_prefix:1;
+ u8 csum:1;
+ u8 smart_poll:1;
+
+ /* Internal feature information. */
+ u8 can_queue:1; /* can queue packets for receiver? */
+
+ /* Allow netif_be_start_xmit() to peek ahead in the rx request
+ * ring. This is a prediction of what rx_req_cons will be once
+ * all queued skbs are put on the ring. */
+ RING_IDX rx_req_cons_peek;
+
+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+ unsigned long credit_bytes;
+ unsigned long credit_usec;
+ unsigned long remaining_credit;
+ struct timer_list credit_timeout;
+
+ /* Statistics */
+ int nr_copied_skbs;
+ int rx_gso_checksum_fixup;
+
+ /* Miscellaneous private stuff. */
+ struct list_head list; /* scheduling list */
+ atomic_t refcnt;
+ struct net_device *dev;
+ struct net_device_stats stats;
+
+ unsigned int carrier;
+
+ wait_queue_head_t waiting_to_free;
+};
+
+/*
+ * Implement our own carrier flag: the network stack's version causes delays
+ * when the carrier is re-enabled (in particular, dev_activate() may not
+ * immediately be called, which can cause packet loss; also the etherbridge
+ * can be rather lazy in activating its port).
+ */
+#define netback_carrier_on(netif) ((netif)->carrier = 1)
+#define netback_carrier_off(netif) ((netif)->carrier = 0)
+#define netback_carrier_ok(netif) ((netif)->carrier)
+
+enum {
+ NETBK_DONT_COPY_SKB,
+ NETBK_DELAYED_COPY_SKB,
+ NETBK_ALWAYS_COPY_SKB,
+};
+
+extern int netbk_copy_skb_mode;
+
+/* Function pointers into netback accelerator plugin modules */
+struct netback_accel_hooks {
+ struct module *owner;
+ int (*probe)(struct xenbus_device *dev);
+ int (*remove)(struct xenbus_device *dev);
+};
+
+/* Structure to track the state of a netback accelerator plugin */
+struct netback_accelerator {
+ struct list_head link;
+ int id;
+ char *eth_name;
+ atomic_t use_count;
+ struct netback_accel_hooks *hooks;
+};
+
+struct backend_info {
+ struct xenbus_device *dev;
+ struct xen_netif *netif;
+ enum xenbus_state frontend_state;
+ struct xenbus_watch hotplug_status_watch;
+ unsigned int have_hotplug_status_watch:1;
+
+ /* State relating to the netback accelerator */
+ void *netback_accel_priv;
+ /* The accelerator that this backend is currently using */
+ struct netback_accelerator *accelerator;
+};
+
+#define NETBACK_ACCEL_VERSION 0x00010001
+
+/*
+ * Connect an accelerator plugin module to netback. Returns zero on
+ * success, < 0 on error, > 0 (with highest version number supported)
+ * if version mismatch.
+ */
+extern int netback_connect_accelerator(unsigned version,
+ int id, const char *eth_name,
+ struct netback_accel_hooks *hooks);
+/* Disconnect a previously connected accelerator plugin module */
+extern void netback_disconnect_accelerator(int id, const char *eth_name);
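+
+/*
+ * A plugin would typically register along these lines (a sketch only;
+ * my_probe, my_remove and the id/name values are illustrative
+ * placeholders):
+ *
+ *	static struct netback_accel_hooks hooks = {
+ *		.owner	= THIS_MODULE,
+ *		.probe	= my_probe,
+ *		.remove	= my_remove,
+ *	};
+ *	int err = netback_connect_accelerator(NETBACK_ACCEL_VERSION,
+ *					      id, "eth0", &hooks);
+ *
+ * A positive return is the highest version netback supports, and the
+ * plugin should back off rather than proceed.
+ */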
+
+extern
+void netback_probe_accelerators(struct backend_info *be,
+ struct xenbus_device *dev);
+extern
+void netback_remove_accelerators(struct backend_info *be,
+ struct xenbus_device *dev);
+extern
+void netif_accel_init(void);
+
+#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
+#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+
+void netif_disconnect(struct xen_netif *netif);
+
+void netif_set_features(struct xen_netif *netif);
+struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
+int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
+ unsigned long rx_ring_ref, unsigned int evtchn);
+
+static inline void netif_get(struct xen_netif *netif)
+{
+ atomic_inc(&netif->refcnt);
+}
+
+static inline void netif_put(struct xen_netif *netif)
+{
+ if (atomic_dec_and_test(&netif->refcnt))
+ wake_up(&netif->waiting_to_free);
+}
+
+int netif_xenbus_init(void);
+
+#define netif_schedulable(netif) \
+ (netif_running((netif)->dev) && netback_carrier_ok(netif))
+
+void netif_schedule_work(struct xen_netif *netif);
+void netif_deschedule_work(struct xen_netif *netif);
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
+irqreturn_t netif_be_int(int irq, void *dev_id);
+
+static inline int netbk_can_queue(struct net_device *dev)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ return netif->can_queue;
+}
+
+static inline int netbk_can_sg(struct net_device *dev)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ return netif->can_sg;
+}
+
+struct pending_tx_info {
+ struct xen_netif_tx_request req;
+ struct xen_netif *netif;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct netbk_rx_meta {
+ int id;
+ int size;
+ int gso_size;
+};
+
+struct netbk_tx_pending_inuse {
+ struct list_head list;
+ unsigned long alloc_time;
+};
+
+#define MAX_PENDING_REQS 256
+
+#define MAX_BUFFER_OFFSET PAGE_SIZE
+
+/* extra field used in struct page */
+union page_ext {
+ struct {
+#if BITS_PER_LONG < 64
+#define IDX_WIDTH 8
+#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH)
+ unsigned int group:GROUP_WIDTH;
+ unsigned int idx:IDX_WIDTH;
+#else
+ unsigned int group, idx;
+#endif
+ } e;
+ void *mapping;
+};
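+
+/*
+ * A worked example, assuming the common ABI where the first-declared
+ * bitfield occupies the least significant bits: on a 32-bit build
+ * (GROUP_WIDTH == 24, IDX_WIDTH == 8), group 2 and idx 5 are stored by
+ * netif_set_page_ext() as mapping == (5 << 24) | (2 + 1); the group is
+ * biased by one so that a NULL mapping never decodes to a valid group.
+ */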
+
+struct xen_netbk {
+ union {
+ struct {
+ struct tasklet_struct net_tx_tasklet;
+ struct tasklet_struct net_rx_tasklet;
+ } tasklet;
+
+ struct {
+ wait_queue_head_t netbk_action_wq;
+ struct task_struct *task;
+ } kthread;
+ };
+
+ struct sk_buff_head rx_queue;
+ struct sk_buff_head tx_queue;
+
+ struct timer_list net_timer;
+ struct timer_list netbk_tx_pending_timer;
+
+ struct page **mmap_pages;
+
+ pending_ring_idx_t pending_prod;
+ pending_ring_idx_t pending_cons;
+ pending_ring_idx_t dealloc_prod;
+ pending_ring_idx_t dealloc_cons;
+
+ struct list_head pending_inuse_head;
+ struct list_head net_schedule_list;
+
+ /* Protect the net_schedule_list in netif. */
+ spinlock_t net_schedule_list_lock;
+
+ atomic_t netfront_count;
+
+ struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+ struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
+ struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+ struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
+ grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
+ u16 pending_ring[MAX_PENDING_REQS];
+ u16 dealloc_ring[MAX_PENDING_REQS];
+
+ /*
+ * Each head or fragment can be up to 4096 bytes. Given
+ * a MAX_BUFFER_OFFSET of 4096, the worst case is that each
+ * head/fragment uses two copy operations.
+ */
+ struct gnttab_copy grant_copy_op[2*NET_RX_RING_SIZE];
+ unsigned char rx_notify[NR_IRQS];
+ u16 notify_list[NET_RX_RING_SIZE];
+ struct netbk_rx_meta meta[2*NET_RX_RING_SIZE];
+};
+
+extern struct xen_netbk *xen_netbk;
+extern int xen_netbk_group_nr;
+
+#endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
new file mode 100644
index 0000000..d3af68e
--- /dev/null
+++ b/drivers/xen/netback/interface.c
@@ -0,0 +1,471 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/interface.c
+ *
+ * Network-device interface management.
+ *
+ * Copyright (c) 2004-2005, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+
+#include <xen/events.h>
+#include <asm/xen/hypercall.h>
+
+/*
+ * Module parameter 'queue_length':
+ *
+ * Enables queuing in the network stack when a client has run out of receive
+ * descriptors.
+ */
+static unsigned long netbk_queue_length = 32;
+module_param_named(queue_length, netbk_queue_length, ulong, 0644);
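+
+/*
+ * For example, a longer queue can be requested at module load time
+ * ("modprobe xen-netback queue_length=128") or, since the parameter is
+ * mode 0644, changed at runtime through
+ * /sys/module/xen_netback/parameters/queue_length.
+ */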
+
+static void netbk_add_netif(struct xen_netbk *netbk, int group_nr,
+ struct xen_netif *netif)
+{
+ int i;
+ int min_netfront_count;
+ int min_group = 0;
+ min_netfront_count = atomic_read(&netbk[0].netfront_count);
+ for (i = 0; i < group_nr; i++) {
+ int netfront_count = atomic_read(&netbk[i].netfront_count);
+ if (netfront_count < min_netfront_count) {
+ min_group = i;
+ min_netfront_count = netfront_count;
+ }
+ }
+
+ netif->group = min_group;
+ atomic_inc(&netbk[netif->group].netfront_count);
+}
+
+static void netbk_remove_netif(struct xen_netbk *netbk, struct xen_netif *netif)
+{
+ atomic_dec(&netbk[netif->group].netfront_count);
+}
+
+static void __netif_up(struct xen_netif *netif)
+{
+ netbk_add_netif(xen_netbk, xen_netbk_group_nr, netif);
+ enable_irq(netif->irq);
+ netif_schedule_work(netif);
+}
+
+static void __netif_down(struct xen_netif *netif)
+{
+ disable_irq(netif->irq);
+ netif_deschedule_work(netif);
+ netbk_remove_netif(xen_netbk, netif);
+}
+
+static int net_open(struct net_device *dev)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ if (netback_carrier_ok(netif)) {
+ __netif_up(netif);
+ netif_start_queue(dev);
+ }
+ return 0;
+}
+
+static int net_close(struct net_device *dev)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ if (netback_carrier_ok(netif))
+ __netif_down(netif);
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static int netbk_change_mtu(struct net_device *dev, int mtu)
+{
+ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+ if (mtu > max)
+ return -EINVAL;
+ dev->mtu = mtu;
+ return 0;
+}
+
+void netif_set_features(struct xen_netif *netif)
+{
+ struct net_device *dev = netif->dev;
+ int features = dev->features;
+
+ if (netif->can_sg)
+ features |= NETIF_F_SG;
+ if (netif->gso || netif->gso_prefix)
+ features |= NETIF_F_TSO;
+ if (netif->csum)
+ features |= NETIF_F_IP_CSUM;
+
+ features &= ~(netif->features_disabled);
+
+ if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN)
+ dev->mtu = ETH_DATA_LEN;
+
+ dev->features = features;
+}
+
+static int netbk_set_tx_csum(struct net_device *dev, u32 data)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ if (data) {
+ if (!netif->csum)
+ return -ENOSYS;
+ netif->features_disabled &= ~NETIF_F_IP_CSUM;
+ } else {
+ netif->features_disabled |= NETIF_F_IP_CSUM;
+ }
+
+ netif_set_features(netif);
+ return 0;
+}
+
+static int netbk_set_sg(struct net_device *dev, u32 data)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ if (data) {
+ if (!netif->can_sg)
+ return -ENOSYS;
+ netif->features_disabled &= ~NETIF_F_SG;
+ } else {
+ netif->features_disabled |= NETIF_F_SG;
+ }
+
+ netif_set_features(netif);
+ return 0;
+}
+
+static int netbk_set_tso(struct net_device *dev, u32 data)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ if (data) {
+ if (!netif->gso && !netif->gso_prefix)
+ return -ENOSYS;
+ netif->features_disabled &= ~NETIF_F_TSO;
+ } else {
+ netif->features_disabled |= NETIF_F_TSO;
+ }
+
+ netif_set_features(netif);
+ return 0;
+}
+
+static void netbk_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strcpy(info->driver, "netbk");
+ strcpy(info->bus_info, dev_name(dev->dev.parent));
+}
+
+static const struct netif_stat {
+ char name[ETH_GSTRING_LEN];
+ u16 offset;
+} netbk_stats[] = {
+ {
+ "copied_skbs",
+ offsetof(struct xen_netif, nr_copied_skbs)
+ },
+ {
+ "rx_gso_checksum_fixup",
+ offsetof(struct xen_netif, rx_gso_checksum_fixup)
+ },
+};
+
+static int netbk_get_sset_count(struct net_device *dev, int string_set)
+{
+ switch (string_set) {
+ case ETH_SS_STATS:
+ return ARRAY_SIZE(netbk_stats);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void netbk_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, u64 * data)
+{
+ void *netif = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
+ data[i] = *(int *)(netif + netbk_stats[i].offset);
+}
+
+static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
+{
+ int i;
+
+ switch (stringset) {
+ case ETH_SS_STATS:
+ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
+ memcpy(data + i * ETH_GSTRING_LEN,
+ netbk_stats[i].name, ETH_GSTRING_LEN);
+ break;
+ }
+}
+
+static struct ethtool_ops network_ethtool_ops =
+{
+ .get_drvinfo = netbk_get_drvinfo,
+
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = netbk_set_tx_csum,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = netbk_set_sg,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = netbk_set_tso,
+ .get_link = ethtool_op_get_link,
+
+ .get_sset_count = netbk_get_sset_count,
+ .get_ethtool_stats = netbk_get_ethtool_stats,
+ .get_strings = netbk_get_strings,
+};
+
+static struct net_device_ops netback_ops =
+{
+ .ndo_start_xmit = netif_be_start_xmit,
+ .ndo_get_stats = netif_be_get_stats,
+ .ndo_open = net_open,
+ .ndo_stop = net_close,
+ .ndo_change_mtu = netbk_change_mtu,
+};
+
+struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
+{
+ int err = 0;
+ struct net_device *dev;
+ struct xen_netif *netif;
+ char name[IFNAMSIZ] = {};
+
+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
+ if (dev == NULL) {
+ DPRINTK("Could not create netif: out of memory\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ SET_NETDEV_DEV(dev, parent);
+
+ netif = netdev_priv(dev);
+ memset(netif, 0, sizeof(*netif));
+ netif->domid = domid;
+ netif->group = -1;
+ netif->handle = handle;
+ netif->can_sg = 1;
+ netif->csum = 1;
+ atomic_set(&netif->refcnt, 1);
+ init_waitqueue_head(&netif->waiting_to_free);
+ netif->dev = dev;
+ INIT_LIST_HEAD(&netif->list);
+
+ netback_carrier_off(netif);
+
+ netif->credit_bytes = netif->remaining_credit = ~0UL;
+ netif->credit_usec = 0UL;
+ init_timer(&netif->credit_timeout);
+ /* Initialize 'expires' now: it's used to track the credit window. */
+ netif->credit_timeout.expires = jiffies;
+
+ dev->netdev_ops = &netback_ops;
+ netif_set_features(netif);
+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
+ dev->tx_queue_len = netbk_queue_length;
+
+ /*
+ * Initialise a dummy MAC address. We choose the numerically
+ * largest non-broadcast address to prevent the address getting
+ * stolen by an Ethernet bridge for STP purposes.
+ * (FE:FF:FF:FF:FF:FF)
+ */
+ memset(dev->dev_addr, 0xFF, ETH_ALEN);
+ dev->dev_addr[0] &= ~0x01;
+
+ rtnl_lock();
+ err = register_netdevice(dev);
+ rtnl_unlock();
+ if (err) {
+ DPRINTK("Could not register new net device %s: err=%d\n",
+ dev->name, err);
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+
+ DPRINTK("Successfully created netif\n");
+ return netif;
+}
+
+static int map_frontend_pages(
+ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
+ GNTMAP_host_map, tx_ring_ref, netif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
+ return op.status;
+ }
+
+ netif->tx_shmem_ref = tx_ring_ref;
+ netif->tx_shmem_handle = op.handle;
+
+ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
+ GNTMAP_host_map, rx_ring_ref, netif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ struct gnttab_unmap_grant_ref unop;
+
+ gnttab_set_unmap_op(&unop,
+ (unsigned long)netif->tx_comms_area->addr,
+ GNTMAP_host_map, netif->tx_shmem_handle);
+ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
+ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
+ return op.status;
+ }
+
+ netif->rx_shmem_ref = rx_ring_ref;
+ netif->rx_shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_pages(struct xen_netif *netif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
+ GNTMAP_host_map, netif->tx_shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
+ GNTMAP_host_map, netif->rx_shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
+ unsigned long rx_ring_ref, unsigned int evtchn)
+{
+ int err = -ENOMEM;
+ struct xen_netif_tx_sring *txs;
+ struct xen_netif_rx_sring *rxs;
+
+ /* Already connected? */
+ if (netif->irq)
+ return 0;
+
+ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
+ if (netif->tx_comms_area == NULL)
+ return -ENOMEM;
+ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
+ if (netif->rx_comms_area == NULL)
+ goto err_rx;
+
+ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
+ if (err)
+ goto err_map;
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ netif->domid, evtchn, netif_be_int, 0,
+ netif->dev->name, netif);
+ if (err < 0)
+ goto err_hypervisor;
+ netif->irq = err;
+ disable_irq(netif->irq);
+
+ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
+ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
+
+ rxs = (struct xen_netif_rx_sring *)
+ ((char *)netif->rx_comms_area->addr);
+ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
+
+ netif->rx_req_cons_peek = 0;
+
+ netif_get(netif);
+
+ rtnl_lock();
+ netback_carrier_on(netif);
+ if (netif_running(netif->dev))
+ __netif_up(netif);
+ rtnl_unlock();
+
+ return 0;
+err_hypervisor:
+ unmap_frontend_pages(netif);
+err_map:
+ free_vm_area(netif->rx_comms_area);
+err_rx:
+ free_vm_area(netif->tx_comms_area);
+ return err;
+}
+
+void netif_disconnect(struct xen_netif *netif)
+{
+ if (netback_carrier_ok(netif)) {
+ rtnl_lock();
+ netback_carrier_off(netif);
+ netif_carrier_off(netif->dev); /* discard queued packets */
+ if (netif_running(netif->dev))
+ __netif_down(netif);
+ rtnl_unlock();
+ netif_put(netif);
+ }
+
+ atomic_dec(&netif->refcnt);
+ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
+
+ del_timer_sync(&netif->credit_timeout);
+
+ if (netif->irq)
+ unbind_from_irqhandler(netif->irq, netif);
+
+ unregister_netdev(netif->dev);
+
+ if (netif->tx.sring) {
+ unmap_frontend_pages(netif);
+ free_vm_area(netif->tx_comms_area);
+ free_vm_area(netif->rx_comms_area);
+ }
+
+ free_netdev(netif->dev);
+}
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
new file mode 100644
index 0000000..92bc31c
--- /dev/null
+++ b/drivers/xen/netback/netback.c
@@ -0,0 +1,1923 @@
+/******************************************************************************
+ * drivers/xen/netback/netback.c
+ *
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * drivers/xen/netfront/netfront.c
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <linux/kthread.h>
+#include <linux/if_vlan.h>
+#include <linux/udp.h>
+
+#include <net/tcp.h>
+
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/interface/memory.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+/*#define NETBE_DEBUG_INTERRUPT*/
+
+struct xen_netbk *xen_netbk;
+int xen_netbk_group_nr;
+
+static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx);
+static void make_tx_response(struct xen_netif *netif,
+ struct xen_netif_tx_request *txp,
+ s8 st);
+static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
+ u16 id,
+ s8 st,
+ u16 offset,
+ u16 size,
+ u16 flags);
+
+static void net_tx_action(unsigned long data);
+
+static void net_rx_action(unsigned long data);
+
+static inline unsigned long idx_to_pfn(struct xen_netbk *netbk,
+ unsigned int idx)
+{
+ return page_to_pfn(netbk->mmap_pages[idx]);
+}
+
+static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk,
+ unsigned int idx)
+{
+ return (unsigned long)pfn_to_kaddr(idx_to_pfn(netbk, idx));
+}
+
+/* extra field used in struct page */
+static inline void netif_set_page_ext(struct page *pg, unsigned int group,
+ unsigned int idx)
+{
+ union page_ext ext = { .e = { .group = group + 1, .idx = idx } };
+
+ BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping));
+ pg->mapping = ext.mapping;
+}
+
+static inline int netif_get_page_ext(struct page *pg, unsigned int *_group, unsigned int *_idx)
+{
+ union page_ext ext = { .mapping = pg->mapping };
+ struct xen_netbk *netbk;
+ unsigned int group, idx;
+
+ if (!PageForeign(pg))
+ return 0;
+
+ group = ext.e.group - 1;
+
+ if (group < 0 || group >= xen_netbk_group_nr)
+ return 0;
+
+ netbk = &xen_netbk[group];
+
+ if (netbk->mmap_pages == NULL)
+ return 0;
+
+ idx = ext.e.idx;
+
+ if ((idx < 0) || (idx >= MAX_PENDING_REQS))
+ return 0;
+
+ if (netbk->mmap_pages[idx] != pg)
+ return 0;
+
+ *_group = group;
+ *_idx = idx;
+
+ return 1;
+}
+
+/*
+ * This is the amount of each packet we copy rather than map, so that the
+ * guest can't fiddle with the contents of the headers while we do
+ * packet processing on them (netfilter, routing, etc).
+ */
+#define PKT_PROT_LEN (ETH_HLEN + \
+ VLAN_HLEN + \
+ sizeof(struct iphdr) + MAX_IPOPTLEN + \
+ sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
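+/*
+ * With the usual constant values this works out as
+ * 14 + 4 + 20 + 40 + 20 + 40 = 138 bytes copied from each packet.
+ */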
+
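+/*
+ * MAX_PENDING_REQS is 256, a power of two, so masking with
+ * (MAX_PENDING_REQS - 1) below is equivalent to i % MAX_PENDING_REQS.
+ */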
+static inline pending_ring_idx_t pending_index(unsigned i)
+{
+ return i & (MAX_PENDING_REQS-1);
+}
+
+static inline pending_ring_idx_t nr_pending_reqs(struct xen_netbk *netbk)
+{
+ return MAX_PENDING_REQS -
+ netbk->pending_prod + netbk->pending_cons;
+}
+
+/* Setting this allows the safe use of this driver without netloop. */
+static int MODPARM_copy_skb = 1;
+module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
+MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
+
+int netbk_copy_skb_mode;
+
+static int MODPARM_netback_kthread;
+module_param_named(netback_kthread, MODPARM_netback_kthread, bool, 0);
+MODULE_PARM_DESC(netback_kthread, "Use kernel thread to replace tasklet");
+
+/*
+ * Netback bottom half handler.
+ * dir indicates the data direction.
+ * rx: 1, tx: 0.
+ */
+static inline void xen_netbk_bh_handler(struct xen_netbk *netbk, int dir)
+{
+ if (MODPARM_netback_kthread)
+ wake_up(&netbk->kthread.netbk_action_wq);
+ else if (dir)
+ tasklet_schedule(&netbk->tasklet.net_rx_tasklet);
+ else
+ tasklet_schedule(&netbk->tasklet.net_tx_tasklet);
+}
+
+static inline void maybe_schedule_tx_action(struct xen_netbk *netbk)
+{
+ smp_mb();
+ if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&netbk->net_schedule_list))
+ xen_netbk_bh_handler(netbk, 0);
+}
+
+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
+{
+ struct skb_shared_info *ninfo;
+ struct sk_buff *nskb;
+ unsigned long offset;
+ int ret;
+ int len;
+ int headlen;
+
+ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+
+ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!nskb))
+ goto err;
+
+ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN);
+ headlen = skb_end_pointer(nskb) - nskb->data;
+ if (headlen > skb_headlen(skb))
+ headlen = skb_headlen(skb);
+ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
+ BUG_ON(ret);
+
+ ninfo = skb_shinfo(nskb);
+ ninfo->gso_size = skb_shinfo(skb)->gso_size;
+ ninfo->gso_type = skb_shinfo(skb)->gso_type;
+
+ offset = headlen;
+ len = skb->len - headlen;
+
+ nskb->len = skb->len;
+ nskb->data_len = len;
+ nskb->truesize += len;
+
+ while (len) {
+ struct page *page;
+ int copy;
+ int zero;
+
+ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
+ dump_stack();
+ goto err_free;
+ }
+
+ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
+ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
+
+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
+ if (unlikely(!page))
+ goto err_free;
+
+ ret = skb_copy_bits(skb, offset, page_address(page), copy);
+ BUG_ON(ret);
+
+ ninfo->frags[ninfo->nr_frags].page = page;
+ ninfo->frags[ninfo->nr_frags].page_offset = 0;
+ ninfo->frags[ninfo->nr_frags].size = copy;
+ ninfo->nr_frags++;
+
+ offset += copy;
+ len -= copy;
+ }
+
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ offset = 0;
+#else
+ offset = nskb->data - skb->data;
+#endif
+
+ nskb->transport_header = skb->transport_header + offset;
+ nskb->network_header = skb->network_header + offset;
+ nskb->mac_header = skb->mac_header + offset;
+
+ return nskb;
+
+ err_free:
+ kfree_skb(nskb);
+ err:
+ return NULL;
+}
+
+static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
+{
+ if (netif->can_sg || netif->gso || netif->gso_prefix)
+ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
+ return 1; /* all in one */
+}
+
+static inline int netbk_queue_full(struct xen_netif *netif)
+{
+ RING_IDX peek = netif->rx_req_cons_peek;
+ RING_IDX needed = netbk_max_required_rx_slots(netif);
+
+ return ((netif->rx.sring->req_prod - peek) < needed) ||
+ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
+
+/* Figure out how many ring slots we're going to need to send @skb to
+ the guest. */
+static unsigned count_skb_slots(struct sk_buff *skb, struct xen_netif *netif)
+{
+ unsigned count;
+ unsigned copy_off;
+ unsigned i;
+
+ copy_off = 0;
+ count = 1;
+
+ BUG_ON(offset_in_page(skb->data) + skb_headlen(skb) > MAX_BUFFER_OFFSET);
+
+ copy_off = skb_headlen(skb);
+
+ if (skb_shinfo(skb)->gso_size)
+ count++;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ unsigned long size = skb_shinfo(skb)->frags[i].size;
+ unsigned long bytes;
+ while (size > 0) {
+ BUG_ON(copy_off > MAX_BUFFER_OFFSET);
+
+ /* These checks are the same as in netbk_gop_frag_copy */
+ if (copy_off == MAX_BUFFER_OFFSET
+ || ((copy_off + size > MAX_BUFFER_OFFSET) && (size <= MAX_BUFFER_OFFSET) && copy_off)) {
+ count++;
+ copy_off = 0;
+ }
+
+ bytes = size;
+ if (copy_off + bytes > MAX_BUFFER_OFFSET)
+ bytes = MAX_BUFFER_OFFSET - copy_off;
+
+ copy_off += bytes;
+ size -= bytes;
+ }
+ }
+ return count;
+}
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ struct xen_netbk *netbk;
+
+ BUG_ON(skb->dev != dev);
+
+ if (netif->group == -1)
+ goto drop;
+
+ netbk = &xen_netbk[netif->group];
+
+ /* Drop the packet if the target domain has no receive buffers. */
+ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
+ goto drop;
+
+ /*
+ * XXX For now we also copy skbuffs whose head crosses a page
+ * boundary, because netbk_gop_skb can't handle them.
+ */
+ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
+ struct sk_buff *nskb = netbk_copy_skb(skb);
+ if (unlikely(nskb == NULL))
+ goto drop;
+ /* Copy only the header fields we use in this driver. */
+ nskb->dev = skb->dev;
+ nskb->ip_summed = skb->ip_summed;
+ dev_kfree_skb(skb);
+ skb = nskb;
+ }
+
+ /* Reserve ring slots for the worst-case number of
+ * fragments. */
+ netif->rx_req_cons_peek += count_skb_slots(skb, netif);
+ netif_get(netif);
+
+ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
+ netif->rx.sring->req_event = netif->rx_req_cons_peek +
+ netbk_max_required_rx_slots(netif);
+ mb(); /* request notification /then/ check & stop the queue */
+ if (netbk_queue_full(netif))
+ netif_stop_queue(dev);
+ }
+ skb_queue_tail(&netbk->rx_queue, skb);
+
+ xen_netbk_bh_handler(netbk, 1);
+
+ return 0;
+
+ drop:
+ netif->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+struct netrx_pending_operations {
+ unsigned copy_prod, copy_cons;
+ unsigned meta_prod, meta_cons;
+ struct gnttab_copy *copy;
+ struct netbk_rx_meta *meta;
+ int copy_off;
+ grant_ref_t copy_gref;
+};
+
+/* Set up the grant operations for this fragment. If it's a flipping
+ interface, we also set up the unmap request from here. */
+
+static void netbk_gop_frag_copy(struct xen_netif *netif,
+ struct netrx_pending_operations *npo,
+ struct page *page, unsigned long size,
+ unsigned long offset, int head)
+{
+ struct gnttab_copy *copy_gop;
+ struct netbk_rx_meta *meta;
+ /*
+ * These variables are used iff netif_get_page_ext returns true,
+ * in which case they are guaranteed to be initialized.
+ */
+ unsigned int uninitialized_var(group), uninitialized_var(idx);
+ int foreign = netif_get_page_ext(page, &group, &idx);
+ unsigned long bytes;
+
+ /* Data must not cross a page boundary. */
+ BUG_ON(size + offset > PAGE_SIZE);
+
+ meta = npo->meta + npo->meta_prod - 1;
+
+ while (size > 0) {
+ BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
+
+ /*
+ * Move to a new receive buffer if:
+ *
+ * simple case: we have completely filled the current buffer.
+ *
+ * complex case: the current frag would overflow
+ * the current buffer but only if:
+ * (i) this frag would fit completely in the next buffer
+ * and (ii) there is already some data in the current buffer
+ * and (iii) this is not the head buffer.
+ *
+ * Where:
+ * - (i) stops us splitting a frag into two copies
+ * unless the frag is too large for a single buffer.
+ * - (ii) stops us from leaving a buffer pointlessly empty.
+ * - (iii) stops us leaving the first buffer
+ * empty. Strictly speaking this is already covered
+ * by (ii) but is explicitly checked because
+ * netfront relies on the first buffer being
+ * non-empty and can crash otherwise.
+ *
+ * This means we will effectively linearise small
+ * frags but do not needlessly split large buffers
+ * into multiple copies, tending to give large frags
+ * their own buffers as before.
+ */
+ if (npo->copy_off == MAX_BUFFER_OFFSET
+ || ((npo->copy_off + size > MAX_BUFFER_OFFSET) && (size <= MAX_BUFFER_OFFSET) && npo->copy_off && !head)) {
+ struct xen_netif_rx_request *req;
+
+ BUG_ON(head); /* Netfront requires there to be some data in the head buffer. */
+ /* Overflowed this request, go to the next one */
+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++);
+ meta = npo->meta + npo->meta_prod++;
+ meta->gso_size = 0;
+ meta->size = 0;
+ meta->id = req->id;
+ npo->copy_off = 0;
+ npo->copy_gref = req->gref;
+ }
+
+ bytes = size;
+ if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
+ bytes = MAX_BUFFER_OFFSET - npo->copy_off;
+
+ copy_gop = npo->copy + npo->copy_prod++;
+ copy_gop->flags = GNTCOPY_dest_gref;
+ if (foreign) {
+ struct xen_netbk *netbk = &xen_netbk[group];
+ struct pending_tx_info *src_pend;
+
+ src_pend = &netbk->pending_tx_info[idx];
+
+ copy_gop->source.domid = src_pend->netif->domid;
+ copy_gop->source.u.ref = src_pend->req.gref;
+ copy_gop->flags |= GNTCOPY_source_gref;
+ } else {
+ copy_gop->source.domid = DOMID_SELF;
+ copy_gop->source.u.gmfn = virt_to_mfn(page_address(page));
+ }
+ copy_gop->source.offset = offset;
+ copy_gop->dest.domid = netif->domid;
+
+ copy_gop->dest.offset = npo->copy_off;
+ copy_gop->dest.u.ref = npo->copy_gref;
+ copy_gop->len = bytes;
+
+ npo->copy_off += bytes;
+ meta->size += bytes;
+
+ offset += bytes;
+ size -= bytes;
+ head = 0; /* Must be something in this buffer now */
+ }
+}
+
+/* Prepare an SKB to be transmitted to the frontend. This is
+ responsible for allocating grant operations, meta structures, etc.
+ It returns the number of meta structures consumed. The number of
+ ring slots used is always equal to the number of meta slots used
+ plus the number of GSO descriptors used. Currently, we use either
+ zero GSO descriptors (for non-GSO packets) or one descriptor (for
+ frontend-side LRO). */
+static int netbk_gop_skb(struct sk_buff *skb,
+ struct netrx_pending_operations *npo)
+{
+ struct xen_netif *netif = netdev_priv(skb->dev);
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ int i;
+ struct xen_netif_rx_request *req;
+ struct netbk_rx_meta *meta;
+ int old_meta_prod;
+
+ old_meta_prod = npo->meta_prod;
+
+ /* Set up a GSO prefix descriptor, if necessary */
+ if (skb_shinfo(skb)->gso_size && netif->gso_prefix) {
+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++);
+ meta = npo->meta + npo->meta_prod++;
+ meta->gso_size = skb_shinfo(skb)->gso_size;
+ meta->size = 0;
+ meta->id = req->id;
+ }
+
+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++);
+ meta = npo->meta + npo->meta_prod++;
+
+ if (!netif->gso_prefix)
+ meta->gso_size = skb_shinfo(skb)->gso_size;
+ else
+ meta->gso_size = 0;
+
+ meta->size = 0;
+ meta->id = req->id;
+ npo->copy_off = 0;
+ npo->copy_gref = req->gref;
+
+ netbk_gop_frag_copy(netif,
+ npo, virt_to_page(skb->data),
+ skb_headlen(skb),
+ offset_in_page(skb->data), 1);
+
+ /* Leave a gap for the GSO descriptor. */
+ if (skb_shinfo(skb)->gso_size && !netif->gso_prefix)
+ netif->rx.req_cons++;
+
+ for (i = 0; i < nr_frags; i++) {
+ netbk_gop_frag_copy(netif, npo,
+ skb_shinfo(skb)->frags[i].page,
+ skb_shinfo(skb)->frags[i].size,
+ skb_shinfo(skb)->frags[i].page_offset,
+ 0);
+ }
+
+ return npo->meta_prod - old_meta_prod;
+}
+
+/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
+ used to set up the operations on the top of
+ netrx_pending_operations, which have since been done. Check that
+ they didn't give any errors and advance over them. */
+static int netbk_check_gop(int nr_meta_slots, domid_t domid,
+ struct netrx_pending_operations *npo)
+{
+ struct gnttab_copy *copy_op;
+ int status = NETIF_RSP_OKAY;
+ int i;
+
+ for (i = 0; i < nr_meta_slots; i++) {
+ copy_op = npo->copy + npo->copy_cons++;
+ if (copy_op->status != GNTST_okay) {
+ DPRINTK("Bad status %d from copy to DOM%d.\n",
+ copy_op->status, domid);
+ status = NETIF_RSP_ERROR;
+ }
+ }
+
+ return status;
+}
+
+static void netbk_add_frag_responses(struct xen_netif *netif, int status,
+ struct netbk_rx_meta *meta,
+ int nr_meta_slots)
+{
+ int i;
+ unsigned long offset;
+
+ for (i = 0; i < nr_meta_slots; i++) {
+ int flags;
+ if (i == nr_meta_slots - 1)
+ flags = 0;
+ else
+ flags = NETRXF_more_data;
+
+ offset = 0;
+ make_rx_response(netif, meta[i].id, status, offset,
+ meta[i].size, flags);
+ }
+}
+
+struct skb_cb_overlay {
+ int meta_slots_used;
+};
+
+static void net_rx_action(unsigned long data)
+{
+ struct xen_netif *netif = NULL;
+ struct xen_netbk *netbk = (struct xen_netbk *)data;
+ s8 status;
+ u16 irq, flags;
+ struct xen_netif_rx_response *resp;
+ struct sk_buff_head rxq;
+ struct sk_buff *skb;
+ int notify_nr = 0;
+ int ret;
+ int nr_frags;
+ int count;
+ unsigned long offset;
+ struct skb_cb_overlay *sco;
+
+ struct netrx_pending_operations npo = {
+ .copy = netbk->grant_copy_op,
+ .meta = netbk->meta,
+ };
+
+ skb_queue_head_init(&rxq);
+
+ count = 0;
+
+ while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
+ netif = netdev_priv(skb->dev);
+ nr_frags = skb_shinfo(skb)->nr_frags;
+
+ sco = (struct skb_cb_overlay *)skb->cb;
+ sco->meta_slots_used = netbk_gop_skb(skb, &npo);
+
+ count += nr_frags + 1;
+
+ __skb_queue_tail(&rxq, skb);
+
+ /* Filled the batch queue? */
+ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
+ break;
+ }
+
+ BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
+
+ if (!npo.copy_prod)
+ return;
+
+ BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
+ npo.copy_prod);
+ BUG_ON(ret != 0);
+
+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
+ sco = (struct skb_cb_overlay *)skb->cb;
+
+ netif = netdev_priv(skb->dev);
+
+ if (netbk->meta[npo.meta_cons].gso_size && netif->gso_prefix) {
+ resp = RING_GET_RESPONSE(&netif->rx,
+ netif->rx.rsp_prod_pvt++);
+
+ resp->flags = NETRXF_gso_prefix | NETRXF_more_data;
+
+ resp->offset = netbk->meta[npo.meta_cons].gso_size;
+ resp->id = netbk->meta[npo.meta_cons].id;
+ resp->status = sco->meta_slots_used;
+
+ npo.meta_cons++;
+ sco->meta_slots_used--;
+ }
+
+ netif->stats.tx_bytes += skb->len;
+ netif->stats.tx_packets++;
+
+ status = netbk_check_gop(sco->meta_slots_used,
+ netif->domid, &npo);
+
+ if (sco->meta_slots_used == 1)
+ flags = 0;
+ else
+ flags = NETRXF_more_data;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+ flags |= NETRXF_csum_blank | NETRXF_data_validated;
+ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+ /* remote but checksummed. */
+ flags |= NETRXF_data_validated;
+
+ offset = 0;
+ resp = make_rx_response(netif, netbk->meta[npo.meta_cons].id,
+ status, offset,
+ netbk->meta[npo.meta_cons].size,
+ flags);
+
+ if (netbk->meta[npo.meta_cons].gso_size && !netif->gso_prefix) {
+ struct xen_netif_extra_info *gso =
+ (struct xen_netif_extra_info *)
+ RING_GET_RESPONSE(&netif->rx,
+ netif->rx.rsp_prod_pvt++);
+
+ resp->flags |= NETRXF_extra_info;
+
+ gso->u.gso.size = netbk->meta[npo.meta_cons].gso_size;
+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+ gso->u.gso.pad = 0;
+ gso->u.gso.features = 0;
+
+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ gso->flags = 0;
+ }
+
+ if (sco->meta_slots_used > 1) {
+ netbk_add_frag_responses(netif, status,
+ netbk->meta + npo.meta_cons + 1,
+ sco->meta_slots_used - 1);
+ }
+
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
+ irq = netif->irq;
+ if (ret && !netbk->rx_notify[irq] &&
+ (netif->smart_poll != 1)) {
+ netbk->rx_notify[irq] = 1;
+ netbk->notify_list[notify_nr++] = irq;
+ }
+
+ if (netif_queue_stopped(netif->dev) &&
+ netif_schedulable(netif) &&
+ !netbk_queue_full(netif))
+ netif_wake_queue(netif->dev);
+
+ /*
+ * netfront_smartpoll_active indicates whether the
+ * netfront timer is active.
+ */
+ if ((netif->smart_poll == 1) &&
+ !(netif->rx.sring->private.netif.smartpoll_active)) {
+ notify_remote_via_irq(irq);
+ netif->rx.sring->private.netif.smartpoll_active = 1;
+ }
+
+ netif_put(netif);
+ npo.meta_cons += sco->meta_slots_used;
+ dev_kfree_skb(skb);
+ }
+
+ while (notify_nr != 0) {
+ irq = netbk->notify_list[--notify_nr];
+ netbk->rx_notify[irq] = 0;
+ notify_remote_via_irq(irq);
+ }
+
+ /* More work to do? */
+ if (!skb_queue_empty(&netbk->rx_queue) &&
+ !timer_pending(&netbk->net_timer))
+ xen_netbk_bh_handler(netbk, 1);
+}
+
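+/*
+ * net_timer callback: kick the RX bottom half so that skbs still queued
+ * on rx_queue are processed even if no new notification arrives.
+ */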
+static void net_alarm(unsigned long data)
+{
+ struct xen_netbk *netbk = (struct xen_netbk *)data;
+ xen_netbk_bh_handler(netbk, 1);
+}
+
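+/*
+ * netbk_tx_pending_timer callback: kick the TX bottom half to deal with
+ * transmissions that have been pending for too long.
+ */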
+static void netbk_tx_pending_timeout(unsigned long data)
+{
+ struct xen_netbk *netbk = (struct xen_netbk *)data;
+ xen_netbk_bh_handler(netbk, 0);
+}
+
+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
+ struct xen_netif *netif = netdev_priv(dev);
+ return &netif->stats;
+}
+
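+/*
+ * net_schedule_list is a round-robin queue of interfaces with
+ * outstanding TX requests; each queued netif holds a reference.
+ */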
+static int __on_net_schedule_list(struct xen_netif *netif)
+{
+ return !list_empty(&netif->list);
+}
+
+/* Must be called with net_schedule_list_lock held */
+static void remove_from_net_schedule_list(struct xen_netif *netif)
+{
+ if (likely(__on_net_schedule_list(netif))) {
+ list_del_init(&netif->list);
+ netif_put(netif);
+ }
+}
+
+static struct xen_netif *poll_net_schedule_list(struct xen_netbk *netbk)
+{
+ struct xen_netif *netif = NULL;
+
+ spin_lock_irq(&netbk->net_schedule_list_lock);
+ if (list_empty(&netbk->net_schedule_list))
+ goto out;
+
+ netif = list_first_entry(&netbk->net_schedule_list,
+ struct xen_netif, list);
+ if (!netif)
+ goto out;
+
+ netif_get(netif);
+
+ remove_from_net_schedule_list(netif);
+out:
+ spin_unlock_irq(&netbk->net_schedule_list_lock);
+ return netif;
+}
+
+static void add_to_net_schedule_list_tail(struct xen_netif *netif)
+{
+ unsigned long flags;
+
+ struct xen_netbk *netbk = &xen_netbk[netif->group];
+ if (__on_net_schedule_list(netif))
+ return;
+
+ spin_lock_irqsave(&netbk->net_schedule_list_lock, flags);
+ if (!__on_net_schedule_list(netif) &&
+ likely(netif_schedulable(netif))) {
+ list_add_tail(&netif->list, &netbk->net_schedule_list);
+ netif_get(netif);
+ }
+ spin_unlock_irqrestore(&netbk->net_schedule_list_lock, flags);
+}
+
+void netif_schedule_work(struct xen_netif *netif)
+{
+ struct xen_netbk *netbk = &xen_netbk[netif->group];
+ int more_to_do;
+
+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+
+ if (more_to_do) {
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action(netbk);
+ }
+}
+
+void netif_deschedule_work(struct xen_netif *netif)
+{
+ struct xen_netbk *netbk = &xen_netbk[netif->group];
+ spin_lock_irq(&netbk->net_schedule_list_lock);
+ remove_from_net_schedule_list(netif);
+ spin_unlock_irq(&netbk->net_schedule_list_lock);
+}
+
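+/*
+ * Top up the interface's transmit credit by one credit_bytes quantum,
+ * capped so that a single burst can still carry a jumbo packet.
+ */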
+static void tx_add_credit(struct xen_netif *netif)
+{
+ unsigned long max_burst, max_credit;
+
+ /*
+ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+ * Otherwise the interface can seize up due to insufficient credit.
+ */
+ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
+ max_burst = min(max_burst, 131072UL);
+ max_burst = max(max_burst, netif->credit_bytes);
+
+ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
+ max_credit = netif->remaining_credit + netif->credit_bytes;
+ if (max_credit < netif->remaining_credit)
+ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
+
+ netif->remaining_credit = min(max_credit, max_burst);
+}
+
+static void tx_credit_callback(unsigned long data)
+{
+ struct xen_netif *netif = (struct xen_netif *)data;
+ tx_add_credit(netif);
+ netif_schedule_work(netif);
+}
+
+static inline int copy_pending_req(struct xen_netbk *netbk,
+ pending_ring_idx_t pending_idx)
+{
+ return gnttab_copy_grant_page(
+ netbk->grant_tx_handle[pending_idx],
+ &netbk->mmap_pages[pending_idx]);
+}
+
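+/*
+ * Reap completed transmissions: unmap every grant queued on the dealloc
+ * ring, copy out entries stuck in delayed-copy mode for over HZ/2, then
+ * send NETIF_RSP_OKAY responses and recycle the pending slots.
+ */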
+static inline void net_tx_action_dealloc(struct xen_netbk *netbk)
+{
+ struct netbk_tx_pending_inuse *inuse, *n;
+ struct gnttab_unmap_grant_ref *gop;
+ u16 pending_idx;
+ pending_ring_idx_t dc, dp;
+ struct xen_netif *netif;
+ int ret;
+ LIST_HEAD(list);
+
+ dc = netbk->dealloc_cons;
+ gop = netbk->tx_unmap_ops;
+
+ /*
+ * Free up any grants we have finished using
+ */
+ do {
+ dp = netbk->dealloc_prod;
+
+ /* Ensure we see all indices enqueued by netif_idx_release(). */
+ smp_rmb();
+
+ while (dc != dp) {
+ unsigned long pfn;
+ struct netbk_tx_pending_inuse *pending_inuse =
+ netbk->pending_inuse;
+
+ pending_idx = netbk->dealloc_ring[pending_index(dc++)];
+ list_move_tail(&pending_inuse[pending_idx].list, &list);
+
+ pfn = idx_to_pfn(netbk, pending_idx);
+ /* Already unmapped? */
+ if (!phys_to_machine_mapping_valid(pfn))
+ continue;
+
+ gnttab_set_unmap_op(gop,
+ idx_to_kaddr(netbk, pending_idx),
+ GNTMAP_host_map,
+ netbk->grant_tx_handle[pending_idx]);
+ gop++;
+ }
+
+ } while (dp != netbk->dealloc_prod);
+
+ netbk->dealloc_cons = dc;
+
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, netbk->tx_unmap_ops,
+ gop - netbk->tx_unmap_ops);
+ BUG_ON(ret);
+
+ /*
+ * Copy any entries that have been pending for too long
+ */
+ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+ !list_empty(&netbk->pending_inuse_head)) {
+ list_for_each_entry_safe(inuse, n,
+ &netbk->pending_inuse_head, list) {
+ struct pending_tx_info *pending_tx_info;
+ pending_tx_info = netbk->pending_tx_info;
+
+ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
+ break;
+
+ pending_idx = inuse - netbk->pending_inuse;
+
+ pending_tx_info[pending_idx].netif->nr_copied_skbs++;
+
+ switch (copy_pending_req(netbk, pending_idx)) {
+ case 0:
+ list_move_tail(&inuse->list, &list);
+ continue;
+ case -EBUSY:
+ list_del_init(&inuse->list);
+ continue;
+ case -ENOENT:
+ continue;
+ }
+
+ break;
+ }
+ }
+
+ list_for_each_entry_safe(inuse, n, &list, list) {
+ struct pending_tx_info *pending_tx_info;
+ pending_ring_idx_t index;
+
+ pending_tx_info = netbk->pending_tx_info;
+ pending_idx = inuse - netbk->pending_inuse;
+
+ netif = pending_tx_info[pending_idx].netif;
+
+ make_tx_response(netif, &pending_tx_info[pending_idx].req,
+ NETIF_RSP_OKAY);
+
+ /* Ready for next use. */
+ gnttab_reset_grant_page(netbk->mmap_pages[pending_idx]);
+
+ index = pending_index(netbk->pending_prod++);
+ netbk->pending_ring[index] = pending_idx;
+
+ netif_put(netif);
+
+ list_del_init(&inuse->list);
+ }
+}
+
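+/*
+ * Abort a malformed request chain: send NETIF_RSP_ERROR for every slot
+ * from @txp up to @end and advance the request consumer past them.
+ */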
+static void netbk_tx_err(struct xen_netif *netif,
+ struct xen_netif_tx_request *txp, RING_IDX end)
+{
+ RING_IDX cons = netif->tx.req_cons;
+
+ do {
+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
+ if (cons >= end)
+ break;
+ txp = RING_GET_REQUEST(&netif->tx, cons++);
+ } while (1);
+ netif->tx.req_cons = cons;
+ netif_schedule_work(netif);
+ netif_put(netif);
+}
+
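+/*
+ * Walk the NETTXF_more_data chain that follows @first, copying each
+ * extra request into @txp.  Returns the fragment count, or a negative
+ * count if the chain is malformed or exceeds MAX_SKB_FRAGS.
+ */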
+static int netbk_count_requests(struct xen_netif *netif,
+ struct xen_netif_tx_request *first,
+ struct xen_netif_tx_request *txp, int work_to_do)
+{
+ RING_IDX cons = netif->tx.req_cons;
+ int frags = 0;
+
+ if (!(first->flags & NETTXF_more_data))
+ return 0;
+
+ do {
+ if (frags >= work_to_do) {
+ DPRINTK("Need more frags\n");
+ return -frags;
+ }
+
+ if (unlikely(frags >= MAX_SKB_FRAGS)) {
+ DPRINTK("Too many frags\n");
+ return -frags;
+ }
+
+ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
+ sizeof(*txp));
+ if (txp->size > first->size) {
+			DPRINTK("Frag is bigger than frame\n");
+ return -frags;
+ }
+
+ first->size -= txp->size;
+ frags++;
+
+ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+ DPRINTK("txp->offset: %x, size: %u\n",
+ txp->offset, txp->size);
+ return -frags;
+ }
+ } while ((txp++)->flags & NETTXF_more_data);
+
+ return frags;
+}
+
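+/*
+ * Queue one grant-map operation per remaining skb fragment, recording
+ * the pending_idx in frag->page so the mapping can be resolved later.
+ */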
+static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netbk *netbk,
+ struct xen_netif *netif,
+ struct sk_buff *skb,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_map_grant_ref *mop)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ skb_frag_t *frags = shinfo->frags;
+ unsigned long pending_idx = *((u16 *)skb->data);
+ int i, start;
+
+ /* Skip first skb fragment if it is on same page as header fragment. */
+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+ for (i = start; i < shinfo->nr_frags; i++, txp++) {
+ pending_ring_idx_t index;
+ struct pending_tx_info *pending_tx_info =
+ netbk->pending_tx_info;
+
+ index = pending_index(netbk->pending_cons++);
+ pending_idx = netbk->pending_ring[index];
+
+ gnttab_set_map_op(mop++, idx_to_kaddr(netbk, pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txp->gref, netif->domid);
+
+ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+ netif_get(netif);
+ pending_tx_info[pending_idx].netif = netif;
+ frags[i].page = (void *)pending_idx;
+ }
+
+ return mop;
+}
+
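+/*
+ * Check the status of the grant-map operations for a packet's header
+ * and fragments.  Failed grants get an error response, and any slots
+ * already mapped for this packet are released again.
+ */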
+static int netbk_tx_check_mop(struct xen_netbk *netbk,
+ struct sk_buff *skb,
+ struct gnttab_map_grant_ref **mopp)
+{
+ struct gnttab_map_grant_ref *mop = *mopp;
+ int pending_idx = *((u16 *)skb->data);
+ struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
+ struct xen_netif *netif = pending_tx_info[pending_idx].netif;
+ struct xen_netif_tx_request *txp;
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ int i, err, start;
+
+ /* Check status of header. */
+ err = mop->status;
+ if (unlikely(err)) {
+ pending_ring_idx_t index;
+ index = pending_index(netbk->pending_prod++);
+ txp = &pending_tx_info[pending_idx].req;
+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
+ netbk->pending_ring[index] = pending_idx;
+ netif_put(netif);
+ } else {
+ set_phys_to_machine(
+ __pa(idx_to_kaddr(netbk, pending_idx)) >> PAGE_SHIFT,
+ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
+ netbk->grant_tx_handle[pending_idx] = mop->handle;
+ }
+
+ /* Skip first skb fragment if it is on same page as header fragment. */
+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+ for (i = start; i < nr_frags; i++) {
+ int j, newerr;
+ pending_ring_idx_t index;
+
+ pending_idx = (unsigned long)shinfo->frags[i].page;
+
+ /* Check error status: if okay then remember grant handle. */
+ newerr = (++mop)->status;
+ if (likely(!newerr)) {
+ unsigned long addr;
+ addr = idx_to_kaddr(netbk, pending_idx);
+ set_phys_to_machine(
+ __pa(addr)>>PAGE_SHIFT,
+ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
+ netbk->grant_tx_handle[pending_idx] = mop->handle;
+ /* Had a previous error? Invalidate this fragment. */
+ if (unlikely(err))
+ netif_idx_release(netbk, pending_idx);
+ continue;
+ }
+
+ /* Error on this fragment: respond to client with an error. */
+ txp = &netbk->pending_tx_info[pending_idx].req;
+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
+ index = pending_index(netbk->pending_prod++);
+ netbk->pending_ring[index] = pending_idx;
+ netif_put(netif);
+
+ /* Not the first error? Preceding frags already invalidated. */
+ if (err)
+ continue;
+
+ /* First error: invalidate header and preceding fragments. */
+ pending_idx = *((u16 *)skb->data);
+ netif_idx_release(netbk, pending_idx);
+ for (j = start; j < i; j++) {
+			pending_idx = (unsigned long)shinfo->frags[j].page;
+ netif_idx_release(netbk, pending_idx);
+ }
+
+ /* Remember the error: invalidate all subsequent fragments. */
+ err = newerr;
+ }
+
+ *mopp = mop + 1;
+ return err;
+}
+
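+/*
+ * Convert the pending_idx placeholders stored in frag->page back into
+ * real pages, sizes and offsets taken from the original tx requests.
+ */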
+static void netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ int i;
+
+ for (i = 0; i < nr_frags; i++) {
+ skb_frag_t *frag = shinfo->frags + i;
+ struct xen_netif_tx_request *txp;
+ unsigned long pending_idx;
+
+ pending_idx = (unsigned long)frag->page;
+
+ netbk->pending_inuse[pending_idx].alloc_time = jiffies;
+ list_add_tail(&netbk->pending_inuse[pending_idx].list,
+ &netbk->pending_inuse_head);
+
+ txp = &netbk->pending_tx_info[pending_idx].req;
+ frag->page = virt_to_page(idx_to_kaddr(netbk, pending_idx));
+ frag->size = txp->size;
+ frag->page_offset = txp->offset;
+
+ skb->len += txp->size;
+ skb->data_len += txp->size;
+ skb->truesize += txp->size;
+ }
+}
+
+int netbk_get_extras(struct xen_netif *netif,
+		     struct xen_netif_extra_info *extras,
+		     int work_to_do)
+{
+ struct xen_netif_extra_info extra;
+ RING_IDX cons = netif->tx.req_cons;
+
+ do {
+ if (unlikely(work_to_do-- <= 0)) {
+ DPRINTK("Missing extra info\n");
+ return -EBADR;
+ }
+
+ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
+ sizeof(extra));
+ if (unlikely(!extra.type ||
+ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+ netif->tx.req_cons = ++cons;
+ DPRINTK("Invalid extra type: %d\n", extra.type);
+ return -EINVAL;
+ }
+
+ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
+ netif->tx.req_cons = ++cons;
+ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+ return work_to_do;
+}
+
+static int netbk_set_skb_gso(struct sk_buff *skb,
+			     struct xen_netif_extra_info *gso)
+{
+ if (!gso->u.gso.size) {
+ DPRINTK("GSO size must not be zero.\n");
+ return -EINVAL;
+ }
+
+	/* Currently only TCPv4 segmentation offload is supported. */
+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+ return -EINVAL;
+ }
+
+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+
+ return 0;
+}
+
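+/*
+ * Make sure a CHECKSUM_PARTIAL skb has valid csum_start/csum_offset for
+ * its TCP or UDP header before it is injected into the stack.
+ */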
+static int checksum_setup(struct xen_netif *netif, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ unsigned char *th;
+ int err = -EPROTO;
+ int recalculate_partial_csum = 0;
+
+ /*
+ * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
+ * peers can fail to set NETRXF_csum_blank when sending a GSO
+ * frame. In this case force the SKB to CHECKSUM_PARTIAL and
+ * recalculate the partial checksum.
+ */
+ if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
+ netif->rx_gso_checksum_fixup++;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ recalculate_partial_csum = 1;
+ }
+
+ /* A non-CHECKSUM_PARTIAL SKB does not require setup. */
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ return 0;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto out;
+
+ iph = (void *)skb->data;
+ th = skb->data + 4 * iph->ihl;
+ if (th >= skb_tail_pointer(skb))
+ goto out;
+
+ skb->csum_start = th - skb->head;
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ if (recalculate_partial_csum) {
+ struct tcphdr *tcph = (struct tcphdr *)th;
+ tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - iph->ihl*4,
+ IPPROTO_TCP, 0);
+ }
+ break;
+ case IPPROTO_UDP:
+ skb->csum_offset = offsetof(struct udphdr, check);
+
+ if (recalculate_partial_csum) {
+ struct udphdr *udph = (struct udphdr *)th;
+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - iph->ihl*4,
+ IPPROTO_UDP, 0);
+ }
+ break;
+ default:
+ if (net_ratelimit())
+			printk(KERN_ERR "Attempting to checksum a non-"
+			       "TCP/UDP packet, dropping a protocol"
+			       " %d packet\n", iph->protocol);
+ goto out;
+ }
+
+ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
+ goto out;
+
+ err = 0;
+
+out:
+ return err;
+}
+
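+/*
+ * Credit-based rate limiting: returns true if @size bytes cannot be
+ * sent yet, arming the credit timer so the packet is retried once the
+ * credit has been replenished.
+ */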
+static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
+{
+ unsigned long now = jiffies;
+ unsigned long next_credit =
+ netif->credit_timeout.expires +
+ msecs_to_jiffies(netif->credit_usec / 1000);
+
+ /* Timer could already be pending in rare cases. */
+ if (timer_pending(&netif->credit_timeout))
+ return true;
+
+ /* Passed the point where we can replenish credit? */
+ if (time_after_eq(now, next_credit)) {
+ netif->credit_timeout.expires = now;
+ tx_add_credit(netif);
+ }
+
+ /* Still too big to send right now? Set a callback. */
+ if (size > netif->remaining_credit) {
+ netif->credit_timeout.data =
+ (unsigned long)netif;
+ netif->credit_timeout.function =
+ tx_credit_callback;
+ mod_timer(&netif->credit_timeout,
+ next_credit);
+
+ return true;
+ }
+
+ return false;
+}
+
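+/*
+ * Pull requests off the schedule list, validate them, and build the
+ * array of grant-map operations for the subsequent hypercall.  Returns
+ * the number of map operations produced.
+ */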
+static unsigned net_tx_build_mops(struct xen_netbk *netbk)
+{
+ struct gnttab_map_grant_ref *mop;
+ struct sk_buff *skb;
+ int ret;
+
+ mop = netbk->tx_map_ops;
+ while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+ !list_empty(&netbk->net_schedule_list)) {
+ struct xen_netif *netif;
+ struct xen_netif_tx_request txreq;
+ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+ u16 pending_idx;
+ RING_IDX idx;
+ int work_to_do;
+ unsigned int data_len;
+ pending_ring_idx_t index;
+
+ /* Get a netif from the list with work to do. */
+ netif = poll_net_schedule_list(netbk);
+ if (!netif)
+ continue;
+
+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
+ if (!work_to_do) {
+ netif_put(netif);
+ continue;
+ }
+
+ idx = netif->tx.req_cons;
+ rmb(); /* Ensure that we see the request before we copy it. */
+ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
+
+ /* Credit-based scheduling. */
+ if (txreq.size > netif->remaining_credit &&
+ tx_credit_exceeded(netif, txreq.size)) {
+ netif_put(netif);
+ continue;
+ }
+
+ netif->remaining_credit -= txreq.size;
+
+ work_to_do--;
+ netif->tx.req_cons = ++idx;
+
+ memset(extras, 0, sizeof(extras));
+ if (txreq.flags & NETTXF_extra_info) {
+ work_to_do = netbk_get_extras(netif, extras,
+ work_to_do);
+ idx = netif->tx.req_cons;
+ if (unlikely(work_to_do < 0)) {
+ netbk_tx_err(netif, &txreq, idx);
+ continue;
+ }
+ }
+
+ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
+ if (unlikely(ret < 0)) {
+ netbk_tx_err(netif, &txreq, idx - ret);
+ continue;
+ }
+ idx += ret;
+
+ if (unlikely(txreq.size < ETH_HLEN)) {
+ DPRINTK("Bad packet size: %d\n", txreq.size);
+ netbk_tx_err(netif, &txreq, idx);
+ continue;
+ }
+
+		/*
+		 * The payload must not cross a page boundary, since it
+		 * cannot be fragmented.
+		 */
+ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
+ txreq.offset, txreq.size,
+				(txreq.offset & ~PAGE_MASK) + txreq.size);
+ netbk_tx_err(netif, &txreq, idx);
+ continue;
+ }
+
+ index = pending_index(netbk->pending_cons);
+ pending_idx = netbk->pending_ring[index];
+
+ data_len = (txreq.size > PKT_PROT_LEN &&
+ ret < MAX_SKB_FRAGS) ?
+ PKT_PROT_LEN : txreq.size;
+
+ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(skb == NULL)) {
+			DPRINTK("Can't allocate a skb in net_tx_build_mops.\n");
+ netbk_tx_err(netif, &txreq, idx);
+ break;
+ }
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+ struct xen_netif_extra_info *gso;
+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+ if (netbk_set_skb_gso(skb, gso)) {
+ kfree_skb(skb);
+ netbk_tx_err(netif, &txreq, idx);
+ continue;
+ }
+ }
+
+ gnttab_set_map_op(mop, idx_to_kaddr(netbk, pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txreq.gref, netif->domid);
+ mop++;
+
+ memcpy(&netbk->pending_tx_info[pending_idx].req,
+ &txreq, sizeof(txreq));
+ netbk->pending_tx_info[pending_idx].netif = netif;
+ *((u16 *)skb->data) = pending_idx;
+
+ __skb_put(skb, data_len);
+
+ skb_shinfo(skb)->nr_frags = ret;
+ if (data_len < txreq.size) {
+ skb_shinfo(skb)->nr_frags++;
+ skb_shinfo(skb)->frags[0].page =
+ (void *)(unsigned long)pending_idx;
+ } else {
+ /* Discriminate from any valid pending_idx value. */
+ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
+ }
+
+ __skb_queue_tail(&netbk->tx_queue, skb);
+
+ netbk->pending_cons++;
+
+ mop = netbk_get_requests(netbk, netif, skb, txfrags, mop);
+
+ netif->tx.req_cons = idx;
+ netif_schedule_work(netif);
+
+ if ((mop - netbk->tx_map_ops) >= ARRAY_SIZE(netbk->tx_map_ops))
+ break;
+ }
+
+ return mop - netbk->tx_map_ops;
+}
+
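+/*
+ * Complete the packets whose grants were mapped: copy the header into
+ * the linear area, fix up checksum state, and hand each skb to the
+ * network stack with netif_rx_ni().
+ */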
+static void net_tx_submit(struct xen_netbk *netbk)
+{
+ struct gnttab_map_grant_ref *mop;
+ struct sk_buff *skb;
+
+ mop = netbk->tx_map_ops;
+ while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
+ struct xen_netif_tx_request *txp;
+ struct xen_netif *netif;
+ u16 pending_idx;
+ unsigned data_len;
+
+ pending_idx = *((u16 *)skb->data);
+ netif = netbk->pending_tx_info[pending_idx].netif;
+ txp = &netbk->pending_tx_info[pending_idx].req;
+
+ /* Check the remap error code. */
+ if (unlikely(netbk_tx_check_mop(netbk, skb, &mop))) {
+ DPRINTK("netback grant failed.\n");
+ skb_shinfo(skb)->nr_frags = 0;
+ kfree_skb(skb);
+ continue;
+ }
+
+ data_len = skb->len;
+ memcpy(skb->data,
+ (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset),
+ data_len);
+ if (data_len < txp->size) {
+ /* Append the packet payload as a fragment. */
+ txp->offset += data_len;
+ txp->size -= data_len;
+ } else {
+ /* Schedule a response immediately. */
+ netif_idx_release(netbk, pending_idx);
+ }
+
+ if (txp->flags & NETTXF_csum_blank)
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ else if (txp->flags & NETTXF_data_validated)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ netbk_fill_frags(netbk, skb);
+
+ /*
+ * If the initial fragment was < PKT_PROT_LEN then
+ * pull through some bytes from the other fragments to
+ * increase the linear region to PKT_PROT_LEN bytes.
+ */
+ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
+ int target = min_t(int, skb->len, PKT_PROT_LEN);
+ __pskb_pull_tail(skb, target - skb_headlen(skb));
+ }
+
+ skb->dev = netif->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ if (checksum_setup(netif, skb)) {
+ DPRINTK("Can't setup checksum in net_tx_action\n");
+ kfree_skb(skb);
+ continue;
+ }
+
+ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
+ unlikely(skb_linearize(skb))) {
+ DPRINTK("Can't linearize skb in net_tx_action.\n");
+ kfree_skb(skb);
+ continue;
+ }
+
+ netif->stats.rx_bytes += skb->len;
+ netif->stats.rx_packets++;
+
+ netif_rx_ni(skb);
+ netif->dev->last_rx = jiffies;
+ }
+}
+
+/* Called after netfront has transmitted */
+static void net_tx_action(unsigned long data)
+{
+ struct xen_netbk *netbk = (struct xen_netbk *)data;
+ unsigned nr_mops;
+ int ret;
+
+ net_tx_action_dealloc(netbk);
+
+ nr_mops = net_tx_build_mops(netbk);
+
+ if (nr_mops == 0)
+ goto out;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ netbk->tx_map_ops, nr_mops);
+ BUG_ON(ret);
+
+ net_tx_submit(netbk);
+out:
+ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+ !list_empty(&netbk->pending_inuse_head)) {
+ struct netbk_tx_pending_inuse *oldest;
+
+ oldest = list_entry(netbk->pending_inuse_head.next,
+ struct netbk_tx_pending_inuse, list);
+ mod_timer(&netbk->netbk_tx_pending_timer,
+ oldest->alloc_time + HZ);
+ }
+}
+
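+/*
+ * Queue a finished pending_idx on the dealloc ring and kick the TX
+ * bottom half.  Note the single lock is shared by all netbk groups.
+ */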
+static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx)
+{
+ static DEFINE_SPINLOCK(_lock);
+ unsigned long flags;
+ pending_ring_idx_t index;
+
+ spin_lock_irqsave(&_lock, flags);
+ index = pending_index(netbk->dealloc_prod);
+ netbk->dealloc_ring[index] = pending_idx;
+ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
+ smp_wmb();
+ netbk->dealloc_prod++;
+ spin_unlock_irqrestore(&_lock, flags);
+
+ xen_netbk_bh_handler(netbk, 0);
+}
+
+static void netif_page_release(struct page *page, unsigned int order)
+{
+ unsigned int group, idx;
+ int foreign = netif_get_page_ext(page, &group, &idx);
+
+ BUG_ON(!foreign);
+ BUG_ON(order);
+
+ netif_idx_release(&xen_netbk[group], idx);
+}
+
+irqreturn_t netif_be_int(int irq, void *dev_id)
+{
+ struct xen_netif *netif = dev_id;
+ struct xen_netbk *netbk;
+
+ if (netif->group == -1)
+ return IRQ_NONE;
+
+ netbk = &xen_netbk[netif->group];
+
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action(netbk);
+
+ if (netif_schedulable(netif) && !netbk_queue_full(netif))
+ netif_wake_queue(netif->dev);
+
+ return IRQ_HANDLED;
+}
+
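+/*
+ * Push a tx response (and a NETIF_RSP_NULL slot for any extra info)
+ * onto the shared ring and notify the frontend if required.
+ */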
+static void make_tx_response(struct xen_netif *netif,
+ struct xen_netif_tx_request *txp,
+ s8 st)
+{
+ RING_IDX i = netif->tx.rsp_prod_pvt;
+ struct xen_netif_tx_response *resp;
+ int notify;
+
+ resp = RING_GET_RESPONSE(&netif->tx, i);
+ resp->id = txp->id;
+ resp->status = st;
+
+ if (txp->flags & NETTXF_extra_info)
+ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
+
+ netif->tx.rsp_prod_pvt = ++i;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
+
+ /*
+ * netfront_smartpoll_active indicates whether netfront timer
+ * is active.
+ */
+	if (netif->smart_poll == 1) {
+ if (!(netif->rx.sring->private.netif.smartpoll_active)) {
+ notify_remote_via_irq(netif->irq);
+ netif->rx.sring->private.netif.smartpoll_active = 1;
+ }
+ } else if (notify)
+ notify_remote_via_irq(netif->irq);
+}
+
+static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
+ u16 id,
+ s8 st,
+ u16 offset,
+ u16 size,
+ u16 flags)
+{
+ RING_IDX i = netif->rx.rsp_prod_pvt;
+ struct xen_netif_rx_response *resp;
+
+ resp = RING_GET_RESPONSE(&netif->rx, i);
+ resp->offset = offset;
+ resp->flags = flags;
+ resp->id = id;
+ resp->status = (s16)size;
+ if (st < 0)
+ resp->status = (s16)st;
+
+ netif->rx.rsp_prod_pvt = ++i;
+
+ return resp;
+}
+
+#ifdef NETBE_DEBUG_INTERRUPT
+static irqreturn_t netif_be_dbg(int irq, void *dev_id)
+{
+ struct list_head *ent;
+ struct xen_netif *netif;
+ int i = 0;
+ int group = 0;
+
+ printk(KERN_ALERT "netif_schedule_list:\n");
+
+ for (group = 0; group < xen_netbk_group_nr; group++) {
+ struct xen_netbk *netbk = &xen_netbk[group];
+ spin_lock_irq(&netbk->net_schedule_list_lock);
+ printk(KERN_ALERT "xen_netback group number: %d\n", group);
+ list_for_each(ent, &netbk->net_schedule_list) {
+ netif = list_entry(ent, struct xen_netif, list);
+ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
+ "rx_resp_prod=%08x\n",
+ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
+ printk(KERN_ALERT
+ " tx_req_cons=%08x, tx_resp_prod=%08x)\n",
+ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
+ printk(KERN_ALERT
+ " shared(rx_req_prod=%08x "
+ "rx_resp_prod=%08x\n",
+ netif->rx.sring->req_prod,
+ netif->rx.sring->rsp_prod);
+ printk(KERN_ALERT
+ " rx_event=%08x, tx_req_prod=%08x\n",
+ netif->rx.sring->rsp_event,
+ netif->tx.sring->req_prod);
+ printk(KERN_ALERT
+ " tx_resp_prod=%08x, tx_event=%08x)\n",
+ netif->tx.sring->rsp_prod,
+ netif->tx.sring->rsp_event);
+ i++;
+ }
+ spin_unlock_irq(&netbk->net_schedule_list_lock);
+ }
+
+ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
+
+ return IRQ_HANDLED;
+}
+#endif
+
+static inline int rx_work_todo(struct xen_netbk *netbk)
+{
+ return !skb_queue_empty(&netbk->rx_queue);
+}
+
+static inline int tx_work_todo(struct xen_netbk *netbk)
+{
+ if (netbk->dealloc_cons != netbk->dealloc_prod)
+ return 1;
+
+ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+ !list_empty(&netbk->pending_inuse_head))
+ return 1;
+
+ if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+ !list_empty(&netbk->net_schedule_list))
+ return 1;
+
+ return 0;
+}
+
+static int netbk_action_thread(void *data)
+{
+ struct xen_netbk *netbk = (struct xen_netbk *)data;
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(netbk->kthread.netbk_action_wq,
+ rx_work_todo(netbk)
+ || tx_work_todo(netbk)
+ || kthread_should_stop());
+ cond_resched();
+
+ if (kthread_should_stop())
+ break;
+
+ if (rx_work_todo(netbk))
+ net_rx_action((unsigned long)netbk);
+
+ if (tx_work_todo(netbk))
+ net_tx_action((unsigned long)netbk);
+ }
+
+ return 0;
+}
+
+static int __init netback_init(void)
+{
+ int i;
+ struct page *page;
+ int rc = 0;
+ int group;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ xen_netbk_group_nr = num_online_cpus();
+ xen_netbk = vmalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr);
+ if (!xen_netbk) {
+ printk(KERN_ALERT "%s: out of memory\n", __func__);
+ return -ENOMEM;
+ }
+ memset(xen_netbk, 0, sizeof(struct xen_netbk) * xen_netbk_group_nr);
+
+	/*
+	 * We can increase reservation by this much in net_rx_action():
+	 * balloon_update_driver_allowance(NET_RX_RING_SIZE);
+	 */
+
+ for (group = 0; group < xen_netbk_group_nr; group++) {
+ struct xen_netbk *netbk = &xen_netbk[group];
+ skb_queue_head_init(&netbk->rx_queue);
+ skb_queue_head_init(&netbk->tx_queue);
+
+ init_timer(&netbk->net_timer);
+ netbk->net_timer.data = (unsigned long)netbk;
+ netbk->net_timer.function = net_alarm;
+
+ init_timer(&netbk->netbk_tx_pending_timer);
+ netbk->netbk_tx_pending_timer.data = (unsigned long)netbk;
+ netbk->netbk_tx_pending_timer.function =
+ netbk_tx_pending_timeout;
+
+ netbk->mmap_pages =
+ alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
+ if (!netbk->mmap_pages) {
+ printk(KERN_ALERT "%s: out of memory\n", __func__);
+ del_timer(&netbk->netbk_tx_pending_timer);
+ del_timer(&netbk->net_timer);
+ rc = -ENOMEM;
+ goto failed_init;
+ }
+
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ page = netbk->mmap_pages[i];
+ SetPageForeign(page, netif_page_release);
+ netif_set_page_ext(page, group, i);
+ INIT_LIST_HEAD(&netbk->pending_inuse[i].list);
+ }
+
+ netbk->pending_cons = 0;
+ netbk->pending_prod = MAX_PENDING_REQS;
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ netbk->pending_ring[i] = i;
+
+ if (MODPARM_netback_kthread) {
+ init_waitqueue_head(&netbk->kthread.netbk_action_wq);
+ netbk->kthread.task =
+ kthread_create(netbk_action_thread,
+ (void *)netbk,
+ "netback/%u", group);
+
+ if (!IS_ERR(netbk->kthread.task)) {
+ kthread_bind(netbk->kthread.task, group);
+ } else {
+ printk(KERN_ALERT
+				       "kthread_create() failed at netback\n");
+ free_empty_pages_and_pagevec(netbk->mmap_pages,
+ MAX_PENDING_REQS);
+ del_timer(&netbk->netbk_tx_pending_timer);
+ del_timer(&netbk->net_timer);
+ rc = PTR_ERR(netbk->kthread.task);
+ goto failed_init;
+ }
+ } else {
+ tasklet_init(&netbk->tasklet.net_tx_tasklet,
+ net_tx_action,
+ (unsigned long)netbk);
+ tasklet_init(&netbk->tasklet.net_rx_tasklet,
+ net_rx_action,
+ (unsigned long)netbk);
+ }
+
+ INIT_LIST_HEAD(&netbk->pending_inuse_head);
+ INIT_LIST_HEAD(&netbk->net_schedule_list);
+
+ spin_lock_init(&netbk->net_schedule_list_lock);
+
+ atomic_set(&netbk->netfront_count, 0);
+
+ if (MODPARM_netback_kthread)
+ wake_up_process(netbk->kthread.task);
+ }
+
+ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
+ if (MODPARM_copy_skb) {
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
+ NULL, 0))
+ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
+ else
+ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
+ }
+
+	/* netif_accel_init(); */
+
+ rc = netif_xenbus_init();
+ if (rc)
+ goto failed_init;
+
+#ifdef NETBE_DEBUG_INTERRUPT
+ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
+ 0,
+ netif_be_dbg,
+ IRQF_SHARED,
+ "net-be-dbg",
+ &netif_be_dbg);
+#endif
+
+ return 0;
+
+failed_init:
+ for (i = 0; i < group; i++) {
+ struct xen_netbk *netbk = &xen_netbk[i];
+ free_empty_pages_and_pagevec(netbk->mmap_pages,
+ MAX_PENDING_REQS);
+ del_timer(&netbk->netbk_tx_pending_timer);
+ del_timer(&netbk->net_timer);
+ if (MODPARM_netback_kthread)
+ kthread_stop(netbk->kthread.task);
+ }
+ vfree(xen_netbk);
+ return rc;
+
+}
+
+module_init(netback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
new file mode 100644
index 0000000..1930f64
--- /dev/null
+++ b/drivers/xen/netback/xenbus.c
@@ -0,0 +1,518 @@
+/* Xenbus code for netif backend
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ Copyright (C) 2005 XenSource Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+#if 0
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+	printk("netback/xenbus (%s:%d) " fmt ".\n", __func__, __LINE__, ##args)
+#endif
+
+
+static int connect_rings(struct backend_info *);
+static void connect(struct backend_info *);
+static void backend_create_netif(struct backend_info *be);
+static void unregister_hotplug_status_watch(struct backend_info *be);
+
+static int netback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	/* netback_remove_accelerators(be, dev); */
+
+ unregister_hotplug_status_watch(be);
+ if (be->netif) {
+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
+ netif_disconnect(be->netif);
+ be->netif = NULL;
+ }
+ kfree(be);
+ dev_set_drvdata(&dev->dev, NULL);
+ return 0;
+}
+
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and switch to InitWait.
+ */
+static int netback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ const char *message;
+ struct xenbus_transaction xbt;
+ int err;
+ int sg;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+
+ be->dev = dev;
+ dev_set_drvdata(&dev->dev, be);
+
+ sg = 1;
+ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
+ sg = 0;
+
+ do {
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto fail;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
+ if (err) {
+ message = "writing feature-sg";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
+ "%d", sg);
+ if (err) {
+ message = "writing feature-gso-tcpv4";
+ goto abort_transaction;
+ }
+
+ /* We support rx-copy path. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-rx-copy", "%d", 1);
+ if (err) {
+ message = "writing feature-rx-copy";
+ goto abort_transaction;
+ }
+
+ /*
+ * We don't support rx-flip path (except old guests who don't
+ * grok this feature flag).
+ */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-rx-flip", "%d", 0);
+ if (err) {
+ message = "writing feature-rx-flip";
+ goto abort_transaction;
+ }
+
+		/* We support the smart poll mechanism. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-smart-poll", "%d", 1);
+ if (err) {
+ message = "writing feature-smart-poll";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ } while (err == -EAGAIN);
+
+ if (err) {
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto fail;
+ }
+
+	/* netback_probe_accelerators(be, dev); */
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ /* This kicks hotplug scripts, so do it immediately. */
+ backend_create_netif(be);
+
+ return 0;
+
+abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, err, "%s", message);
+fail:
+ DPRINTK("failed");
+ netback_remove(dev);
+ return err;
+}
+
+
+/**
+ * Handle the creation of the hotplug script environment. We add the script
+ * and vif variables to the environment, for the benefit of the vif-* hotplug
+ * scripts.
+ */
+static int netback_uevent(struct xenbus_device *xdev,
+			  struct kobj_uevent_env *env)
+{
+ struct backend_info *be = dev_get_drvdata(&xdev->dev);
+ char *val;
+
+ DPRINTK("netback_uevent");
+
+ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
+ if (IS_ERR(val)) {
+ int err = PTR_ERR(val);
+ xenbus_dev_fatal(xdev, err, "reading script");
+ return err;
+	} else {
+ if (add_uevent_var(env, "script=%s", val)) {
+ kfree(val);
+ return -ENOMEM;
+ }
+ kfree(val);
+ }
+
+ if (be && be->netif && add_uevent_var(env, "vif=%s", be->netif->dev->name))
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static void backend_create_netif(struct backend_info *be)
+{
+ int err;
+ long handle;
+ struct xenbus_device *dev = be->dev;
+
+ if (be->netif != NULL)
+ return;
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading handle");
+ return;
+ }
+
+ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
+ if (IS_ERR(be->netif)) {
+ err = PTR_ERR(be->netif);
+ be->netif = NULL;
+ xenbus_dev_fatal(dev, err, "creating interface");
+ return;
+ }
+
+ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
+
+
+static void disconnect_backend(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ if (be->netif) {
+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
+ netif_disconnect(be->netif);
+ be->netif = NULL;
+ }
+}
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ DPRINTK("%s", xenbus_strstate(frontend_state));
+
+ be->frontend_state = frontend_state;
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __func__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ break;
+
+ case XenbusStateConnected:
+ if (dev->state == XenbusStateConnected)
+ break;
+ backend_create_netif(be);
+ if (be->netif)
+ connect(be);
+ break;
+
+ case XenbusStateClosing:
+ if (be->netif)
+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+ disconnect_backend(dev);
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
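+/*
+ * Parse the "rate" xenstore node, formatted as "<bytes>,<usec>"; on any
+ * parse failure the rate limit is left unlimited.
+ */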
+static void xen_net_read_rate(struct xenbus_device *dev,
+ unsigned long *bytes, unsigned long *usec)
+{
+ char *s, *e;
+ unsigned long b, u;
+ char *ratestr;
+
+ /* Default to unlimited bandwidth. */
+ *bytes = ~0UL;
+ *usec = 0;
+
+ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
+ if (IS_ERR(ratestr))
+ return;
+
+ s = ratestr;
+ b = simple_strtoul(s, &e, 10);
+ if ((s == e) || (*e != ','))
+ goto fail;
+
+ s = e + 1;
+ u = simple_strtoul(s, &e, 10);
+ if ((s == e) || (*e != '\0'))
+ goto fail;
+
+ *bytes = b;
+ *usec = u;
+
+ kfree(ratestr);
+ return;
+
+ fail:
+ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
+ kfree(ratestr);
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+ char *s, *e, *macstr;
+ int i;
+
+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+ if (IS_ERR(macstr))
+ return PTR_ERR(macstr);
+
+ for (i = 0; i < ETH_ALEN; i++) {
+ mac[i] = simple_strtoul(s, &e, 16);
+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+ kfree(macstr);
+ return -ENOENT;
+ }
+ s = e+1;
+ }
+
+ kfree(macstr);
+ return 0;
+}
+
+static void unregister_hotplug_status_watch(struct backend_info *be)
+{
+ if (be->have_hotplug_status_watch) {
+ unregister_xenbus_watch(&be->hotplug_status_watch);
+ kfree(be->hotplug_status_watch.node);
+ }
+ be->have_hotplug_status_watch = 0;
+}
+
+static void hotplug_status_changed(struct xenbus_watch *watch,
+ const char **vec,
+ unsigned int vec_size)
+{
+ struct backend_info *be = container_of(watch,
+ struct backend_info,
+ hotplug_status_watch);
+ char *str;
+ unsigned int len;
+
+ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
+ if (IS_ERR(str))
+ return;
+ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
+ xenbus_switch_state(be->dev, XenbusStateConnected);
+ /* Not interested in this watch anymore. */
+ unregister_hotplug_status_watch(be);
+ }
+ kfree(str);
+}
+
+static void connect(struct backend_info *be)
+{
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ err = connect_rings(be);
+ if (err)
+ return;
+
+ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+ return;
+ }
+
+ xen_net_read_rate(dev, &be->netif->credit_bytes,
+ &be->netif->credit_usec);
+ be->netif->remaining_credit = be->netif->credit_bytes;
+
+ unregister_hotplug_status_watch(be);
+ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
+ hotplug_status_changed,
+ "%s/%s", dev->nodename, "hotplug-status");
+ if (err) {
+ /* Switch now, since we can't do a watch. */
+ xenbus_switch_state(dev, XenbusStateConnected);
+ } else {
+ be->have_hotplug_status_watch = 1;
+ }
+
+ netif_wake_queue(be->netif->dev);
+}
+
+
+static int connect_rings(struct backend_info *be)
+{
+ struct xen_netif *netif = be->netif;
+ struct xenbus_device *dev = be->dev;
+ unsigned long tx_ring_ref, rx_ring_ref;
+ unsigned int evtchn, rx_copy;
+ int err;
+ int val;
+
+ DPRINTK("");
+
+ err = xenbus_gather(XBT_NIL, dev->otherend,
+ "tx-ring-ref", "%lu", &tx_ring_ref,
+ "rx-ring-ref", "%lu", &rx_ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+ &rx_copy);
+ if (err == -ENOENT) {
+ err = 0;
+ rx_copy = 0;
+ }
+ if (err < 0) {
+ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+ dev->otherend);
+ return err;
+ }
+ if (!rx_copy)
+ return -EOPNOTSUPP;
+
+ if (netif->dev->tx_queue_len != 0) {
+ if (xenbus_scanf(XBT_NIL, dev->otherend,
+ "feature-rx-notify", "%d", &val) < 0)
+ val = 0;
+ if (val)
+ netif->can_queue = 1;
+ else
+ /* Must be non-zero for pfifo_fast to work. */
+ netif->dev->tx_queue_len = 1;
+ }
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg",
+ "%d", &val) < 0)
+ val = 0;
+ netif->can_sg = !!val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
+ "%d", &val) < 0)
+ val = 0;
+ netif->gso = !!val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
+ "%d", &val) < 0)
+ val = 0;
+ netif->gso_prefix = !!val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
+ "%d", &val) < 0)
+ val = 0;
+ netif->csum = !val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
+ "%d", &val) < 0)
+ val = 0;
+ netif->smart_poll = !!val;
+
+ /* Set dev->features */
+ netif_set_features(netif);
+
+ /* Map the shared frame, irq etc. */
+ err = netif_map(netif, tx_ring_ref, rx_ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "mapping shared-frames %lu/%lu port %u",
+ tx_ring_ref, rx_ring_ref, evtchn);
+ return err;
+ }
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id netback_ids[] = {
+ { "vif" },
+ { "" }
+};
+
+
+static struct xenbus_driver netback = {
+ .name = "vif",
+ .owner = THIS_MODULE,
+ .ids = netback_ids,
+ .probe = netback_probe,
+ .remove = netback_remove,
+ .uevent = netback_uevent,
+ .otherend_changed = frontend_changed,
+};
+
+
+int netif_xenbus_init(void)
+{
+	printk(KERN_INFO "registering netback\n");
+ return xenbus_register_backend(&netback);
+}
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
new file mode 100644
index 0000000..ae693e7
--- /dev/null
+++ b/drivers/xen/pci.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Weidong Han <weidong.han@intel.com>
+ */
+
+#include <linux/pci.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include "../pci/pci.h"
+
+
+#ifdef CONFIG_PCI_IOV
+#define HANDLE_PCI_IOV 1
+#else
+#define HANDLE_PCI_IOV 0
+#endif
+
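+/*
+ * Tell the hypervisor about a newly added PCI device, using the
+ * extended hypercall for virtual functions and ARI-extended devices.
+ */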
+static int xen_add_device(struct device *dev)
+{
+ int r;
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+
+ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) {
+ struct physdev_manage_pci_ext manage_pci_ext = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ .is_virtfn = 1,
+#ifdef CONFIG_PCI_IOV
+ .physfn.bus = pci_dev->physfn->bus->number,
+ .physfn.devfn = pci_dev->physfn->devfn,
+#endif
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+ &manage_pci_ext);
+ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+ struct physdev_manage_pci_ext manage_pci_ext = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ .is_extfn = 1,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+ &manage_pci_ext);
+ } else {
+ struct physdev_manage_pci manage_pci = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
+ &manage_pci);
+ }
+
+ return r;
+}
+
+static int xen_remove_device(struct device *dev)
+{
+ int r;
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+ struct physdev_manage_pci manage_pci;
+
+ manage_pci.bus = pci_dev->bus->number;
+ manage_pci.devfn = pci_dev->devfn;
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+ &manage_pci);
+
+ return r;
+}
+
+static int xen_pci_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+ int r = 0;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ r = xen_add_device(dev);
+ break;
+ case BUS_NOTIFY_DEL_DEVICE:
+ r = xen_remove_device(dev);
+ break;
+ default:
+ break;
+ }
+
+ return r;
+}
+
+static struct notifier_block device_nb = {
+ .notifier_call = xen_pci_notifier,
+};
+
+static int __init register_xen_pci_notifier(void)
+{
+ if (!xen_pv_domain())
+ return 0;
+
+ return bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
+arch_initcall(register_xen_pci_notifier);
diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
new file mode 100644
index 0000000..38bc123
--- /dev/null
+++ b/drivers/xen/pciback/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
+xen-pciback-y += conf_space.o conf_space_header.o \
+ conf_space_capability.o \
+ conf_space_capability_vpd.o \
+ conf_space_capability_pm.o \
+ conf_space_quirks.o
+xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
+
+ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
new file mode 100644
index 0000000..370c18e
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.c
@@ -0,0 +1,435 @@
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ * exported PCI Devices.
+ * It's dangerous to allow PCI Driver Domains to change their
+ * device's resources (memory, i/o ports, interrupts). We need to
+ * restrict changes to certain PCI Configuration registers:
+ * BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+static int permissive;
+module_param(permissive, bool, 0644);
+
+#define DEFINE_PCI_CONFIG(op, size, type) \
+int pciback_##op##_config_##size \
+(struct pci_dev *dev, int offset, type value, void *data) \
+{ \
+ return pci_##op##_config_##size(dev, offset, value); \
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
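+/*
+ * Dispatch a virtualized config-space read to the field's size-specific
+ * handler, if one is registered.
+ */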
+static int conf_space_read(struct pci_dev *dev,
+ const struct config_field_entry *entry,
+ int offset, u32 *value)
+{
+ int ret = 0;
+ const struct config_field *field = entry->field;
+
+ *value = 0;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.read)
+ ret = field->u.b.read(dev, offset, (u8 *) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.read)
+ ret = field->u.w.read(dev, offset, (u16 *) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.read)
+ ret = field->u.dw.read(dev, offset, value, entry->data);
+ break;
+ }
+ return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+ const struct config_field_entry *entry,
+ int offset, u32 value)
+{
+ int ret = 0;
+ const struct config_field *field = entry->field;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.write)
+ ret = field->u.b.write(dev, offset, (u8) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.write)
+ ret = field->u.w.write(dev, offset, (u16) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.write)
+ ret = field->u.dw.write(dev, offset, value,
+ entry->data);
+ break;
+ }
+ return ret;
+}
+
+static inline u32 get_mask(int size)
+{
+ if (size == 1)
+ return 0xff;
+ else if (size == 2)
+ return 0xffff;
+ else
+ return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+ /* Validate request (no un-aligned requests) */
+ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+ return 1;
+ return 0;
+}
+
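+/*
+ * Merge new_val into val at a byte offset: a positive offset shifts the
+ * new bytes up into position, a negative offset (request starting below
+ * the field) shifts them down.
+ */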
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+ int offset)
+{
+ if (offset >= 0) {
+ new_val_mask <<= (offset * 8);
+ new_val <<= (offset * 8);
+ } else {
+ new_val_mask >>= (offset * -8);
+ new_val >>= (offset * -8);
+ }
+ val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
+ return val;
+}
+
+static int pcibios_err_to_errno(int err)
+{
+ switch (err) {
+ case PCIBIOS_SUCCESSFUL:
+ return XEN_PCI_ERR_success;
+ case PCIBIOS_DEVICE_NOT_FOUND:
+ return XEN_PCI_ERR_dev_not_found;
+ case PCIBIOS_BAD_REGISTER_NUMBER:
+ return XEN_PCI_ERR_invalid_offset;
+ case PCIBIOS_FUNC_NOT_SUPPORTED:
+ return XEN_PCI_ERR_not_implemented;
+ case PCIBIOS_SET_FAILED:
+ return XEN_PCI_ERR_access_denied;
+ }
+ return err;
+}
+
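+/*
+ * Read from the device's virtual config space: start from the real
+ * hardware value, then overlay any byte ranges owned by registered
+ * config_field handlers.
+ */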
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+ int req_start, req_end, field_start, field_end;
+ /* if read fails for any reason, return 0
+ * (as if device didn't respond) */
+ u32 value = 0, tmp_val;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
+ pci_name(dev), size, offset);
+
+ if (!valid_request(offset, size)) {
+ err = XEN_PCI_ERR_invalid_offset;
+ goto out;
+ }
+
+ /* Get the real value first, then modify as appropriate */
+ switch (size) {
+ case 1:
+ err = pci_read_config_byte(dev, offset, (u8 *) &value);
+ break;
+ case 2:
+ err = pci_read_config_word(dev, offset, (u16 *) &value);
+ break;
+ case 4:
+ err = pci_read_config_dword(dev, offset, &value);
+ break;
+ }
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ err = conf_space_read(dev, cfg_entry, field_start,
+ &tmp_val);
+ if (err)
+ goto out;
+
+ value = merge_value(value, tmp_val,
+ get_mask(field->size),
+ field_start - req_start);
+ }
+ }
+
+out:
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ *ret_val = value;
+ return pcibios_err_to_errno(err);
+}
+
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+ int err = 0, handled = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+ u32 tmp_val;
+ int req_start, req_end, field_start, field_end;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ "pciback: %s: write request %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ if (!valid_request(offset, size))
+ return XEN_PCI_ERR_invalid_offset;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ tmp_val = 0;
+
+ err = pciback_config_read(dev, field_start,
+ field->size, &tmp_val);
+ if (err)
+ break;
+
+ tmp_val = merge_value(tmp_val, value, get_mask(size),
+ req_start - field_start);
+
+ err = conf_space_write(dev, cfg_entry, field_start,
+ tmp_val);
+
+ /* handled is set true here, but not every byte
+ * may have been written! Properly detecting if
+ * every byte is handled is unnecessary as the
+ * flag is used to detect devices that need
+ * special helpers to work correctly.
+ */
+ handled = 1;
+ }
+ }
+
+ if (!handled && !err) {
+		/* By default, anything not specifically handled above is
+ * read-only. The permissive flag changes this behavior so
+ * that anything not specifically handled above is writable.
+ * This means that some fields may still be read-only because
+ * they have entries in the config_field list that intercept
+ * the write and do nothing. */
+ if (dev_data->permissive || permissive) {
+ switch (size) {
+ case 1:
+ err = pci_write_config_byte(dev, offset,
+ (u8) value);
+ break;
+ case 2:
+ err = pci_write_config_word(dev, offset,
+ (u16) value);
+ break;
+ case 4:
+ err = pci_write_config_dword(dev, offset,
+ (u32) value);
+ break;
+ }
+ } else if (!dev_data->warned_on_write) {
+ dev_data->warned_on_write = 1;
+ dev_warn(&dev->dev, "Driver tried to write to a "
+ "read-only configuration space field at offset"
+ " 0x%x, size %d. This may be harmless, but if "
+ "you have problems with your device:\n"
+ "1) see permissive attribute in sysfs\n"
+ "2) report problems to the xen-devel "
+ "mailing list along with details of your "
+ "device obtained from lspci.\n", offset, size);
+ }
+ }
+
+ return pcibios_err_to_errno(err);
+}
+
+void pciback_config_free_dyn_fields(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ const struct config_field *field;
+
+	dev_dbg(&dev->dev, "freeing dynamically allocated virtual "
+ "configuration space fields\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->clean) {
+ field->clean((struct config_field *)field);
+
+ kfree(cfg_entry->data);
+
+ list_del(&cfg_entry->list);
+ kfree(cfg_entry);
+ }
+
+ }
+}
+
+void pciback_config_reset_dev(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->reset)
+ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+ }
+}
+
+void pciback_config_free_dev(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ const struct config_field *field;
+
+	dev_dbg(&dev->dev, "freeing virtual configuration space fields\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ list_del(&cfg_entry->list);
+
+ field = cfg_entry->field;
+
+ if (field->release)
+ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+ kfree(cfg_entry);
+ }
+}
+
+int pciback_config_add_field_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int base_offset)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ void *tmp;
+
+ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+ if (!cfg_entry) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cfg_entry->data = NULL;
+ cfg_entry->field = field;
+ cfg_entry->base_offset = base_offset;
+
+ /* silently ignore duplicate fields */
+ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
+ if (err)
+ goto out;
+
+ if (field->init) {
+ tmp = field->init(dev, OFFSET(cfg_entry));
+
+ if (IS_ERR(tmp)) {
+ err = PTR_ERR(tmp);
+ goto out;
+ }
+
+ cfg_entry->data = tmp;
+ }
+
+ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+ OFFSET(cfg_entry));
+ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+out:
+ if (err)
+ kfree(cfg_entry);
+
+ return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs)) so that we
+ * can keep the client from manipulating them directly.
+ */
+int pciback_config_init_dev(struct pci_dev *dev)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+ INIT_LIST_HEAD(&dev_data->config_fields);
+
+ err = pciback_config_header_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = pciback_config_capability_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = pciback_config_quirks_init(dev);
+
+out:
+ return err;
+}
+
+int pciback_config_init(void)
+{
+ return pciback_config_capability_init();
+}
diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
new file mode 100644
index 0000000..50ebef2
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.h
@@ -0,0 +1,126 @@
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+
+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
+ void *data);
+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
+ void *data);
+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
+ void *data);
+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
+ void *data);
+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
+ void *data);
+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
+ void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+ unsigned int offset;
+ unsigned int size;
+ unsigned int mask;
+ conf_field_init init;
+ conf_field_reset reset;
+ conf_field_free release;
+ void (*clean) (struct config_field *field);
+ union {
+ struct {
+ conf_dword_write write;
+ conf_dword_read read;
+ } dw;
+ struct {
+ conf_word_write write;
+ conf_word_read read;
+ } w;
+ struct {
+ conf_byte_write write;
+ conf_byte_read read;
+ } b;
+ } u;
+ struct list_head list;
+};
+
+struct config_field_entry {
+ struct list_head list;
+ const struct config_field *field;
+ unsigned int base_offset;
+ void *data;
+};
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
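+
+/* For illustration: with a capability entry added at a (hypothetical)
+ * base_offset of 0x50, a field describing PCI_PM_CTRL (offset 4 within
+ * the PM capability) yields OFFSET(cfg_entry) == 0x54 in the device's
+ * configuration space.
+ */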
+
+/* Add fields to a device - the add_fields macro expects to get a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int pciback_config_add_field_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int offset);
+
+static inline int pciback_config_add_field(struct pci_dev *dev,
+ const struct config_field *field)
+{
+ return pciback_config_add_field_offset(dev, field, 0);
+}
+
+static inline int pciback_config_add_fields(struct pci_dev *dev,
+ const struct config_field *field)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = pciback_config_add_field(dev, &field[i]);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int offset)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = pciback_config_add_field_offset(dev, &field[i], offset);
+ if (err)
+ break;
+ }
+ return err;
+}
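+
+/* A minimal sketch of a field table (illustrative only, not used in
+ * this driver); the empty initializer at the end is the size == 0
+ * sentinel expected by the helpers above:
+ *
+ *	static const struct config_field example_fields[] = {
+ *		{
+ *			.offset   = PCI_STATUS,
+ *			.size     = 2,
+ *			.u.w.read = pciback_read_config_word,
+ *		},
+ *		{}
+ *	};
+ *
+ * pciback_config_add_fields(dev, example_fields) would then intercept
+ * word-sized reads of the status register.
+ */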
+
+/* Read/Write the real configuration space */
+int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
+ void *data);
+int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
+ void *data);
+int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
+ void *data);
+int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+ void *data);
+int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
+ void *data);
+int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+ void *data);
+
+int pciback_config_capability_init(void);
+
+int pciback_config_header_add_fields(struct pci_dev *dev);
+int pciback_config_capability_add_fields(struct pci_dev *dev);
+
+#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
new file mode 100644
index 0000000..0ea84d6
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.c
@@ -0,0 +1,66 @@
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ * in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static LIST_HEAD(capabilities);
+
+static const struct config_field caplist_header[] = {
+ {
+ .offset = PCI_CAP_LIST_ID,
+ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = NULL,
+ },
+ {}
+};
+
+static inline void register_capability(struct pciback_config_capability *cap)
+{
+ list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int pciback_config_capability_add_fields(struct pci_dev *dev)
+{
+ int err = 0;
+ struct pciback_config_capability *cap;
+ int cap_offset;
+
+ list_for_each_entry(cap, &capabilities, cap_list) {
+ cap_offset = pci_find_capability(dev, cap->capability);
+ if (cap_offset) {
+ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+ cap->capability, cap_offset);
+
+ err = pciback_config_add_fields_offset(dev,
+ caplist_header,
+ cap_offset);
+ if (err)
+ goto out;
+ err = pciback_config_add_fields_offset(dev,
+ cap->fields,
+ cap_offset);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ return err;
+}
+
+int pciback_config_capability_init(void)
+{
+ register_capability(&pciback_config_capability_vpd);
+ register_capability(&pciback_config_capability_pm);
+
+ return 0;
+}
diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
new file mode 100644
index 0000000..8da3ac4
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.h
@@ -0,0 +1,26 @@
+/*
+ * PCI Backend - Data structures for special overlays for structures on
+ * the capability list.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
+#define __PCIBACK_CONFIG_CAPABILITY_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_capability {
+ struct list_head cap_list;
+
+ int capability;
+
+ /* If the device has the capability found above, add these fields */
+ const struct config_field *fields;
+};
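+
+/* A capability overlay pairs a capability ID with a sentinel-terminated
+ * field table. For instance, the PM overlay defined in
+ * conf_space_capability_pm.c takes roughly this shape:
+ *
+ *	struct pciback_config_capability pciback_config_capability_pm = {
+ *		.capability = PCI_CAP_ID_PM,
+ *		.fields     = caplist_pm,
+ *	};
+ */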
+
+extern struct pciback_config_capability pciback_config_capability_vpd;
+extern struct pciback_config_capability pciback_config_capability_pm;
+
+#endif
diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
new file mode 100644
index 0000000..b15131e
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_msi.c
@@ -0,0 +1,110 @@
+/*
+ * PCI Backend -- Configuration overlay for MSI capability
+ */
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+#include <xen/interface/io/pciif.h>
+#include <xen/events.h>
+#include "pciback.h"
+
+int pciback_enable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+ int otherend = pdev->xdev->otherend_id;
+ int status;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable MSI\n", pci_name(dev));
+
+ status = pci_enable_msi(dev);
+
+ if (status) {
+ printk(KERN_ERR "error enable msi for guest %x status %x\n",
+ otherend, status);
+ op->value = 0;
+ return XEN_PCI_ERR_op_failed;
+ }
+
+	/* The value the guest needs is actually the IDT vector, not
+	 * the local domain's IRQ number. */
+ op->value = xen_gsi_from_irq(dev->irq);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return 0;
+}
+
+int pciback_disable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable MSI\n", pci_name(dev));
+ pci_disable_msi(dev);
+
+ op->value = xen_gsi_from_irq(dev->irq);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+
+int pciback_enable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+ int i, result;
+ struct msix_entry *entries;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable MSI-X\n", pci_name(dev));
+ if (op->value > SH_INFO_MAX_VEC)
+ return -EINVAL;
+
+ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+ if (entries == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < op->value; i++) {
+ entries[i].entry = op->msix_entries[i].entry;
+ entries[i].vector = op->msix_entries[i].vector;
+ }
+
+ result = pci_enable_msix(dev, entries, op->value);
+
+ for (i = 0; i < op->value; i++) {
+ op->msix_entries[i].entry = entries[i].entry;
+ op->msix_entries[i].vector =
+ xen_gsi_from_irq(entries[i].vector);
+ }
+
+ kfree(entries);
+
+ op->value = result;
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return result;
+}
+
+int pciback_disable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable MSI-X\n", pci_name(dev));
+ pci_disable_msix(dev);
+
+ op->value = xen_gsi_from_irq(dev->irq);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
new file mode 100644
index 0000000..0442616
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_pm.c
@@ -0,0 +1,113 @@
+/*
+ * PCI Backend - Configuration space overlay for power management
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+ void *data)
+{
+ int err;
+ u16 real_value;
+
+ err = pci_read_config_word(dev, offset, &real_value);
+ if (err)
+ goto out;
+
+ *value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+out:
+ return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+ void *data)
+{
+ int err;
+ u16 old_value;
+ pci_power_t new_state, old_state;
+
+ err = pci_read_config_word(dev, offset, &old_value);
+ if (err)
+ goto out;
+
+ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
+ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+ new_value &= PM_OK_BITS;
+ if ((old_value & PM_OK_BITS) != new_value) {
+ new_value = (old_value & ~PM_OK_BITS) | new_value;
+ err = pci_write_config_word(dev, offset, new_value);
+ if (err)
+ goto out;
+ }
+
+ /* Let pci core handle the power management change */
+ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+ err = pci_set_power_state(dev, new_state);
+ if (err) {
+ err = PCIBIOS_SET_FAILED;
+ goto out;
+ }
+
+ out:
+ return err;
+}
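+
+/* Worked example (bit values from <linux/pci_regs.h>): PM_OK_BITS is
+ * 0x9e00 (PME status 0x8000 | data select 0x1e00). A guest write of
+ * 0x8003 (clear PME status, request D3hot) is masked down to 0x8000,
+ * so only the PME status bit is merged into the hardware register; the
+ * D3hot request is honoured via pci_set_power_state() instead.
+ */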
+
+/* Ensure PMEs are disabled */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+ int err;
+ u16 value;
+
+ err = pci_read_config_word(dev, offset, &value);
+ if (err)
+ goto out;
+
+ if (value & PCI_PM_CTRL_PME_ENABLE) {
+ value &= ~PCI_PM_CTRL_PME_ENABLE;
+ err = pci_write_config_word(dev, offset, value);
+ }
+
+out:
+ return ERR_PTR(err);
+}
+
+static const struct config_field caplist_pm[] = {
+ {
+ .offset = PCI_PM_PMC,
+ .size = 2,
+ .u.w.read = pm_caps_read,
+ },
+ {
+ .offset = PCI_PM_CTRL,
+ .size = 2,
+ .init = pm_ctrl_init,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = pm_ctrl_write,
+ },
+ {
+ .offset = PCI_PM_PPB_EXTENSIONS,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .offset = PCI_PM_DATA_REGISTER,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {}
+};
+
+struct pciback_config_capability pciback_config_capability_pm = {
+ .capability = PCI_CAP_ID_PM,
+ .fields = caplist_pm,
+};
diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
new file mode 100644
index 0000000..e7b4d66
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_vpd.c
@@ -0,0 +1,40 @@
+/*
+ * PCI Backend - Configuration space overlay for Vital Product Data
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+ void *data)
+{
+ /* Disallow writes to the vital product data */
+ if (value & PCI_VPD_ADDR_F)
+ return PCIBIOS_SET_FAILED;
+ else
+ return pci_write_config_word(dev, offset, value);
+}
+
+static const struct config_field caplist_vpd[] = {
+ {
+ .offset = PCI_VPD_ADDR,
+ .size = 2,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = vpd_address_write,
+ },
+ {
+ .offset = PCI_VPD_DATA,
+ .size = 4,
+ .u.dw.read = pciback_read_config_dword,
+ .u.dw.write = NULL,
+ },
+ {}
+};
+
+struct pciback_config_capability pciback_config_capability_vpd = {
+ .capability = PCI_CAP_ID_VPD,
+ .fields = caplist_vpd,
+};
diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
new file mode 100644
index 0000000..cb450f4
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_header.c
@@ -0,0 +1,385 @@
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_bar_info {
+ u32 val;
+ u32 len_val;
+ int which;
+};
+
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
+ int i;
+ int ret;
+
+ ret = pciback_read_config_word(dev, offset, value, data);
+ if (!atomic_read(&dev->enable_cnt))
+ return ret;
+
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ if (dev->resource[i].flags & IORESOURCE_IO)
+ *value |= PCI_COMMAND_IO;
+ if (dev->resource[i].flags & IORESOURCE_MEM)
+ *value |= PCI_COMMAND_MEMORY;
+ }
+
+ return ret;
+}
+
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+ struct pciback_dev_data *dev_data;
+ int err;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable\n",
+ pci_name(dev));
+ err = pci_enable_device(dev);
+ if (err)
+ return err;
+ if (dev_data)
+ dev_data->enable_intx = 1;
+ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable\n",
+ pci_name(dev));
+ pci_disable_device(dev);
+ if (dev_data)
+ dev_data->enable_intx = 0;
+ }
+
+ if (!dev->is_busmaster && is_master_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: set bus master\n",
+ pci_name(dev));
+ pci_set_master(dev);
+ }
+
+ if (value & PCI_COMMAND_INVALIDATE) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ "pciback: %s: enable memory-write-invalidate\n",
+ pci_name(dev));
+ err = pci_set_mwi(dev);
+ if (err) {
+ printk(KERN_WARNING
+ "pciback: %s: cannot enable "
+ "memory-write-invalidate (%d)\n",
+ pci_name(dev), err);
+ value &= ~PCI_COMMAND_INVALIDATE;
+ }
+ }
+
+ return pci_write_config_word(dev, offset, value);
+}
+
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~PCI_ROM_ADDRESS_ENABLE)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ /* Do we need to support enabling/disabling the rom address here? */
+
+ return 0;
+}
+
+/* For the BARs, only allow writes which write ~0 or
+ * the correct resource information
+ * (Needed for when the driver probes the resource usage)
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~0)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ return 0;
+}
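+
+/* Illustration of the sizing handshake emulated above: a driver writes
+ * ~0 to a BAR and reads it back to derive the region size. With
+ * bar->which set, bar_read() below returns len_val, e.g. a
+ * (hypothetical) 1 MiB memory BAR reports 0x00100000, instead of the
+ * real address held in val.
+ */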
+
+static int bar_read(struct pci_dev *dev, int offset, u32 *value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ *value = bar->which ? bar->len_val : bar->val;
+
+ return 0;
+}
+
+static inline void read_dev_bar(struct pci_dev *dev,
+ struct pci_bar_info *bar_info, int offset,
+ u32 len_mask)
+{
+ int pos;
+ struct resource *res = dev->resource;
+
+ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
+ pos = PCI_ROM_RESOURCE;
+ else {
+ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
+ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
+ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
+ (PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
+ bar_info->val = res[pos - 1].start >> 32;
+ bar_info->len_val = res[pos - 1].end >> 32;
+ return;
+ }
+ }
+
+ bar_info->val = res[pos].start |
+ (res[pos].flags & PCI_REGION_FLAG_MASK);
+ bar_info->len_val = res[pos].end - res[pos].start + 1;
+}
+
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~0);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void *rom_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ bar->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+ kfree(data);
+}
+
+static int pciback_read_vendor(struct pci_dev *dev, int offset,
+ u16 *value, void *data)
+{
+ *value = dev->vendor;
+
+ return 0;
+}
+
+static int pciback_read_device(struct pci_dev *dev, int offset,
+ u16 *value, void *data)
+{
+ *value = dev->device;
+
+ return 0;
+}
+
+static int interrupt_read(struct pci_dev *dev, int offset, u8 *value,
+ void *data)
+{
+ *value = (u8) dev->irq;
+
+ return 0;
+}
+
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+ u8 cur_value;
+ int err;
+
+ err = pci_read_config_byte(dev, offset, &cur_value);
+ if (err)
+ goto out;
+
+ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
+ || value == PCI_BIST_START)
+ err = pci_write_config_byte(dev, offset, value);
+
+out:
+ return err;
+}
+
+static const struct config_field header_common[] = {
+ {
+ .offset = PCI_VENDOR_ID,
+ .size = 2,
+ .u.w.read = pciback_read_vendor,
+ },
+ {
+ .offset = PCI_DEVICE_ID,
+ .size = 2,
+ .u.w.read = pciback_read_device,
+ },
+ {
+ .offset = PCI_COMMAND,
+ .size = 2,
+ .u.w.read = command_read,
+ .u.w.write = command_write,
+ },
+ {
+ .offset = PCI_INTERRUPT_LINE,
+ .size = 1,
+ .u.b.read = interrupt_read,
+ },
+ {
+ .offset = PCI_INTERRUPT_PIN,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ /* Any side effects of letting driver domain control cache line? */
+ .offset = PCI_CACHE_LINE_SIZE,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ .u.b.write = pciback_write_config_byte,
+ },
+ {
+ .offset = PCI_LATENCY_TIMER,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .offset = PCI_BIST,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ .u.b.write = bist_write,
+ },
+ {}
+};
+
+#define CFG_FIELD_BAR(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = bar_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = bar_write, \
+ }
+
+#define CFG_FIELD_ROM(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = rom_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = rom_write, \
+ }
+
+static const struct config_field header_0[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+ {}
+};
+
+static const struct config_field header_1[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+ {}
+};
+
+int pciback_config_header_add_fields(struct pci_dev *dev)
+{
+ int err;
+
+ err = pciback_config_add_fields(dev, header_common);
+ if (err)
+ goto out;
+
+ switch (dev->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL:
+ err = pciback_config_add_fields(dev, header_0);
+ break;
+
+ case PCI_HEADER_TYPE_BRIDGE:
+ err = pciback_config_add_fields(dev, header_1);
+ break;
+
+ default:
+ err = -EINVAL;
+ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
+ pci_name(dev), dev->hdr_type);
+ break;
+ }
+
+out:
+ return err;
+}
diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
new file mode 100644
index 0000000..45c31fb
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.c
@@ -0,0 +1,140 @@
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(pciback_quirks);
+
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+ (id->device == PCI_ANY_ID || id->device == dev->device) &&
+ (id->subvendor == PCI_ANY_ID ||
+ id->subvendor == dev->subsystem_vendor) &&
+ (id->subdevice == PCI_ANY_ID ||
+ id->subdevice == dev->subsystem_device) &&
+ !((id->class ^ dev->class) & id->class_mask))
+ return id;
+ return NULL;
+}
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *tmp_quirk;
+
+ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
+ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
+ goto out;
+ tmp_quirk = NULL;
+ printk(KERN_DEBUG
+ "quirk didn't match any device pciback knows about\n");
+out:
+ return tmp_quirk;
+}
+
+static inline void register_quirk(struct pciback_config_quirk *quirk)
+{
+ list_add_tail(&quirk->quirks_list, &pciback_quirks);
+}
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+ int ret = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ if (OFFSET(cfg_entry) == reg) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field)
+{
+ int err = 0;
+
+ switch (field->size) {
+ case 1:
+ field->u.b.read = pciback_read_config_byte;
+ field->u.b.write = pciback_write_config_byte;
+ break;
+ case 2:
+ field->u.w.read = pciback_read_config_word;
+ field->u.w.write = pciback_write_config_word;
+ break;
+ case 4:
+ field->u.dw.read = pciback_read_config_dword;
+ field->u.dw.write = pciback_write_config_dword;
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+	err = pciback_config_add_field(dev, field);
+
+out:
+ return err;
+}
+
+int pciback_config_quirks_init(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+ if (!quirk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ quirk->devid.vendor = dev->vendor;
+ quirk->devid.device = dev->device;
+ quirk->devid.subvendor = dev->subsystem_vendor;
+ quirk->devid.subdevice = dev->subsystem_device;
+ quirk->devid.class = 0;
+ quirk->devid.class_mask = 0;
+ quirk->devid.driver_data = 0UL;
+
+ quirk->pdev = dev;
+
+ register_quirk(quirk);
+out:
+ return ret;
+}
+
+void pciback_config_field_free(struct config_field *field)
+{
+ kfree(field);
+}
+
+int pciback_config_quirk_release(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = pciback_find_quirk(dev);
+ if (!quirk) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ list_del(&quirk->quirks_list);
+ kfree(quirk);
+
+out:
+ return ret;
+}
diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
new file mode 100644
index 0000000..acd0e1a
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.h
@@ -0,0 +1,35 @@
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_quirk {
+ struct list_head quirks_list;
+ struct pci_device_id devid;
+ struct pci_dev *pdev;
+};
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field);
+
+int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int pciback_config_quirks_init(struct pci_dev *dev);
+
+void pciback_config_field_free(struct config_field *field);
+
+int pciback_config_quirk_release(struct pci_dev *dev);
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
new file mode 100644
index 0000000..7f04f11
--- /dev/null
+++ b/drivers/xen/pciback/controller.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ * Alex Williamson <alex.williamson@hp.com>
+ *
+ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
+ * controllers. Devices under the same PCI controller are exposed on the
+ * same virtual domain:bus. Within a bus, device slots are virtualized
+ * to compact the bus.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_MAX_BUSSES 255
+#define PCI_MAX_SLOTS 32
+
+struct controller_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+ unsigned int devfn;
+};
+
+struct controller_list_entry {
+ struct list_head list;
+ struct pci_controller *controller;
+ unsigned int domain;
+ unsigned int bus;
+ unsigned int next_devfn;
+ struct list_head dev_list;
+};
+
+struct controller_dev_data {
+ struct list_head list;
+ unsigned int next_domain;
+ unsigned int next_bus;
+ spinlock_t lock;
+};
+
+struct walk_info {
+ struct pciback_device *pdev;
+ int resource_count;
+ int root_num;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->domain != domain ||
+ cntrl_entry->bus != bus)
+ continue;
+
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if (devfn == dev_entry->devfn) {
+ dev = dev_entry->dev;
+ goto found;
+ }
+ }
+ }
+found:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
+ unsigned long flags;
+ int ret = 0, found = 0;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ /* Look to see if we already have a domain:bus for this controller */
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->controller == dev_controller) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
+ if (!cntrl_entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cntrl_entry->controller = dev_controller;
+ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
+ cntrl_entry->domain = dev_data->next_domain;
+ cntrl_entry->bus = dev_data->next_bus++;
+ if (dev_data->next_bus > PCI_MAX_BUSSES) {
+ dev_data->next_domain++;
+ dev_data->next_bus = 0;
+ }
+
+ INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
+ list_add_tail(&cntrl_entry->list, &dev_data->list);
+ }
+
+ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
+ /*
+ * While it seems unlikely, this can actually happen if
+ * a controller has P2P bridges under it.
+ */
+ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
+ "is full, no room to export %04x:%02x:%02x.%x",
+ cntrl_entry->domain, cntrl_entry->bus,
+ pci_domain_nr(dev->bus), dev->bus->number,
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
+ if (!dev_entry) {
+ if (list_empty(&cntrl_entry->dev_list)) {
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+ dev_entry->devfn = cntrl_entry->next_devfn;
+
+ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
+ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
+out:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ /* TODO: Publish virtual domain:bus:slot.func here. */
+
+ return ret;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry;
+ struct controller_dev_entry *dev_entry = NULL;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
+ continue;
+
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ found_dev = dev_entry->dev;
+ break;
+ }
+ }
+ }
+
+ if (!found_dev) {
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ return;
+ }
+
+ list_del(&dev_entry->list);
+ kfree(dev_entry);
+
+ if (list_empty(&cntrl_entry->dev_list)) {
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ struct controller_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->list);
+
+ /* Starting domain:bus numbers */
+ dev_data->next_domain = 0;
+ dev_data->next_bus = 0;
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
+{
+ struct walk_info *info = data;
+ struct acpi_resource_address64 addr;
+ acpi_status status;
+ int i, len, err;
+ char str[32], tmp[3];
+ unsigned char *ptr, *buf;
+
+ status = acpi_resource_to_address64(res, &addr);
+
+ /* Do we care about this range? Let's check. */
+ if (!ACPI_SUCCESS(status) ||
+ !(addr.resource_type == ACPI_MEMORY_RANGE ||
+ addr.resource_type == ACPI_IO_RANGE) ||
+ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
+ return AE_OK;
+
+ /*
+ * Furthermore, we really only care to tell the guest about
+ * address ranges that require address translation of some sort.
+ */
+ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
+ addr.info.mem.translation) &&
+ !(addr.resource_type == ACPI_IO_RANGE &&
+ addr.info.io.translation))
+ return AE_OK;
+
+ /* Store the resource in xenbus for the guest */
+ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
+ info->root_num, info->resource_count);
+ if (unlikely(len >= (sizeof(str) - 1)))
+ return AE_OK;
+
+ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
+ if (!buf)
+ return AE_OK;
+
+ /* Clean out resource_source */
+ res->data.address64.resource_source.index = 0xFF;
+ res->data.address64.resource_source.string_length = 0;
+ res->data.address64.resource_source.string_ptr = NULL;
+
+ ptr = (unsigned char *)res;
+
+ /* Turn the acpi_resource into an ASCII byte stream */
+ for (i = 0; i < sizeof(*res); i++) {
+ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
+ strncat(buf, tmp, 2);
+ }
+
+ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
+ str, "%s", buf);
+
+ if (!err)
+ info->resource_count++;
+
+ kfree(buf);
+
+ return AE_OK;
+}
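+
+/* Note: the loop above emits two hex characters per byte, so an
+ * acpi_resource of N bytes becomes a string of 2*N characters plus a
+ * terminating NUL; this matches the "root-resource-magic" value of
+ * (sizeof(struct acpi_resource) * 2) + 1 written in
+ * pciback_publish_pci_roots() below.
+ */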
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry;
+ int i, root_num, len, err = 0;
+ unsigned int domain, bus;
+ char str[64];
+ struct walk_info info;
+
+ spin_lock(&dev_data->lock);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ /* First publish all the domain:bus info */
+ err = publish_root_cb(pdev, cntrl_entry->domain,
+ cntrl_entry->bus);
+ if (err)
+ goto out;
+
+ /*
+ * Now figure out which root-%d this belongs to
+ * so we can associate resources with it.
+ */
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+
+ if (err != 1)
+ goto out;
+
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &domain, &bus);
+ if (err != 2)
+ goto out;
+
+ /* Is this the one we just published? */
+ if (domain == cntrl_entry->domain &&
+ bus == cntrl_entry->bus)
+ break;
+ }
+
+ if (i == root_num)
+ goto out;
+
+ info.pdev = pdev;
+ info.resource_count = 0;
+ info.root_num = i;
+
+ /* Let ACPI do the heavy lifting on decoding resources */
+ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
+ METHOD_NAME__CRS, write_xenbus_resource,
+ &info);
+
+		/* No resources. OK. On to the next one. */
+ if (!info.resource_count)
+ continue;
+
+ /* Store the number of resources we wrote for this root-%d */
+ len = snprintf(str, sizeof(str), "root-%d-resources", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%d", info.resource_count);
+ if (err)
+ goto out;
+ }
+
+ /* Finally, write some magic to synchronize with the guest. */
+ len = snprintf(str, sizeof(str), "root-resource-magic");
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%lx", (sizeof(struct acpi_resource) * 2) + 1);
+
+out:
+ spin_unlock(&dev_data->lock);
+
+ return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry, *c;
+ struct controller_dev_entry *dev_entry, *d;
+
+ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
+ list_for_each_entry_safe(dev_entry, d,
+ &cntrl_entry->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus, unsigned int *devfn)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ unsigned long flags;
+	int found = 0;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if ((dev_entry->dev->bus->number ==
+ pcidev->bus->number) &&
+ (dev_entry->dev->devfn ==
+ pcidev->devfn) &&
+ (pci_domain_nr(dev_entry->dev->bus) ==
+ pci_domain_nr(pcidev->bus))) {
+ found = 1;
+ *domain = cntrl_entry->domain;
+ *bus = cntrl_entry->bus;
+ *devfn = dev_entry->devfn;
+ goto out;
+ }
+ }
+ }
+out:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+	return found;
+}
diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
new file mode 100644
index 0000000..5386bebf
--- /dev/null
+++ b/drivers/xen/pciback/passthrough.c
@@ -0,0 +1,178 @@
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list;
+ spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
+ && bus == (unsigned int)dev_entry->dev->bus->number
+ && devfn == dev_entry->dev->devfn) {
+ dev = dev_entry->dev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ unsigned long flags;
+ unsigned int domain, bus, devfn;
+ int err;
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry)
+ return -ENOMEM;
+ dev_entry->dev = dev;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+ list_add_tail(&dev_entry->list, &dev_data->dev_list);
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ /* Publish this device. */
+ domain = (unsigned int)pci_domain_nr(dev->bus);
+ bus = (unsigned int)dev->bus->number;
+ devfn = dev->devfn;
+ err = publish_cb(pdev, domain, bus, devfn, devid);
+
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ list_del(&dev_entry->list);
+ found_dev = dev_entry->dev;
+ kfree(dev_entry);
+ }
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ struct passthrough_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->dev_list);
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ int err = 0;
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *e;
+ struct pci_dev *dev;
+ int found;
+ unsigned int domain, bus;
+
+ spin_lock(&dev_data->lock);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ /* Only publish this device as a root if none of its
+ * parent bridges are exported
+ */
+ found = 0;
+ dev = dev_entry->dev->bus->self;
+ for (; !found && dev != NULL; dev = dev->bus->self) {
+ list_for_each_entry(e, &dev_data->dev_list, list) {
+ if (dev == e->dev) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+ bus = (unsigned int)dev_entry->dev->bus->number;
+
+ if (!found) {
+ err = publish_root_cb(pdev, domain, bus);
+ if (err)
+ break;
+ }
+ }
+
+ spin_unlock(&dev_data->lock);
+
+ return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+
+{
+ *domain = pci_domain_nr(pcidev->bus);
+ *bus = pcidev->bus->number;
+ *devfn = pcidev->devfn;
+ return 1;
+}
diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
new file mode 100644
index 0000000..88c7ca1
--- /dev/null
+++ b/drivers/xen/pciback/pci_stub.c
@@ -0,0 +1,1370 @@
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/pci.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypervisor.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define DRV_NAME "pciback"
+
+static char *pci_devs_to_hide;
+wait_queue_head_t aer_wait_queue;
+/* Add a semaphore to synchronize AER handling with pciback
+ * remove/reconfigure ops; we want to avoid pciback devices being
+ * removed in the middle of AER ops.
+ */
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
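+
+/* The hide parameter lists devices for pciback to seize as a series of
+ * parenthesized BDFs; a hypothetical invocation (the PCI domain part
+ * defaults to 0000 when omitted):
+ *
+ *	pciback.hide=(0000:01:00.0)(03:00.1)
+ */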
+
+struct pcistub_device_id {
+ struct list_head slot_list;
+ int domain;
+ unsigned char bus;
+ unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+ struct kref kref;
+ struct list_head dev_list;
+ spinlock_t lock;
+
+ struct pci_dev *dev;
+	struct pciback_device *pdev; /* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+ if (!psdev)
+ return NULL;
+
+ psdev->dev = pci_dev_get(dev);
+ if (!psdev->dev) {
+ kfree(psdev);
+ return NULL;
+ }
+
+ kref_init(&psdev->kref);
+ spin_lock_init(&psdev->lock);
+
+ return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+ struct pcistub_device *psdev;
+
+ psdev = container_of(kref, struct pcistub_device, kref);
+
+ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
+ xen_unregister_device_domain_owner(psdev->dev);
+
+ /* Clean-up the device */
+ pciback_reset_device(psdev->dev);
+ pciback_config_free_dyn_fields(psdev->dev);
+ pciback_config_free_dev(psdev->dev);
+ kfree(pci_get_drvdata(psdev->dev));
+ pci_set_drvdata(psdev->dev, NULL);
+
+ pci_dev_put(psdev->dev);
+
+ kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+ kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+ kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+ pcistub_device_get(psdev);
+ goto out;
+ }
+ }
+
+ /* didn't find it */
+ psdev = NULL;
+
+out:
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
+ struct pcistub_device *psdev)
+{
+ struct pci_dev *pci_dev = NULL;
+ unsigned long flags;
+
+ pcistub_device_get(psdev);
+
+ spin_lock_irqsave(&psdev->lock, flags);
+ if (!psdev->pdev) {
+ psdev->pdev = pdev;
+ pci_dev = psdev->dev;
+ }
+ spin_unlock_irqrestore(&psdev->lock, flags);
+
+ if (!pci_dev)
+ pcistub_device_put(psdev);
+
+ return pci_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+ int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+ struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	/* Hold this lock to avoid breaking the link between pcistub
+	 * and pciback while AER processing is in progress.
+	 */
+ down_write(&pcistub_sem);
+ /* Cleanup our device
+ * (so it's ready for the next domain)
+ */
+ pciback_reset_device(found_psdev->dev);
+ pciback_config_free_dyn_fields(found_psdev->dev);
+ pciback_config_reset_dev(found_psdev->dev);
+
+ spin_lock_irqsave(&found_psdev->lock, flags);
+ found_psdev->pdev = NULL;
+ spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+ pcistub_device_put(found_psdev);
+ up_write(&pcistub_sem);
+}
+
+static int __devinit pcistub_match_one(struct pci_dev *dev,
+ struct pcistub_device_id *pdev_id)
+{
+ /* Match the specified device by domain, bus, slot, func and also if
+ * any of the device's parent bridges match.
+ */
+ for (; dev != NULL; dev = dev->bus->self) {
+ if (pci_domain_nr(dev->bus) == pdev_id->domain
+ && dev->bus->number == pdev_id->bus
+ && dev->devfn == pdev_id->devfn)
+ return 1;
+
+ /* Sometimes topmost bridge links to itself. */
+ if (dev == dev->bus->self)
+ break;
+ }
+
+ return 0;
+}
+
+static int __devinit pcistub_match(struct pci_dev *dev)
+{
+ struct pcistub_device_id *pdev_id;
+ unsigned long flags;
+ int found = 0;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+ if (pcistub_match_one(dev, pdev_id)) {
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return found;
+}
+
+static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data;
+ int err = 0;
+
+ dev_dbg(&dev->dev, "initializing...\n");
+
+	/* The PCI backend is not (yet) intended to be a module or to work
+	 * with removable PCI devices. If it were, pciback_config_free()
+ * would need to be called somewhere to free the memory allocated
+ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+ */
+ dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]")
+ + strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+ if (!dev_data) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pci_set_drvdata(dev, dev_data);
+
+ /*
+ * Setup name for fake IRQ handler. It will only be enabled
+ * once the device is turned on by the guest.
+ */
+ sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+
+ dev_dbg(&dev->dev, "initializing config\n");
+
+ init_waitqueue_head(&aer_wait_queue);
+ err = pciback_config_init_dev(dev);
+ if (err)
+ goto out;
+
+ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+ * must do this here because pcibios_enable_device may specify
+ * the pci device's true irq (and possibly its other resources)
+ * if they differ from what's in the configuration space.
+ * This makes the assumption that the device's resources won't
+ * change after this point (otherwise this code may break!)
+ */
+ dev_dbg(&dev->dev, "enabling device\n");
+ err = pci_enable_device(dev);
+ if (err)
+ goto config_release;
+
+ /* Now disable the device (this also ensures some private device
+ * data is setup before we export)
+ */
+ dev_dbg(&dev->dev, "reset device\n");
+ pciback_reset_device(dev);
+
+ return 0;
+
+config_release:
+ pciback_config_free_dev(dev);
+
+out:
+ pci_set_drvdata(dev, NULL);
+ kfree(dev_data);
+ return err;
+}
+
+/*
+ * Because some initialization still happens on
+ * devices during fs_initcall, we need to defer
+ * full initialization of our devices until
+ * device_initcall.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ pr_debug("pciback: pcistub_init_devices_late\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ while (!list_empty(&seized_devices)) {
+ psdev = container_of(seized_devices.next,
+ struct pcistub_device, dev_list);
+ list_del(&psdev->dev_list);
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ err = pcistub_init_device(psdev->dev);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d initializing device\n", err);
+ kfree(psdev);
+ psdev = NULL;
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (psdev)
+ list_add_tail(&psdev->dev_list, &pcistub_devices);
+ }
+
+ initialize_devices = 1;
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ return 0;
+}
+
+static int __devinit pcistub_seize(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ psdev = pcistub_device_alloc(dev);
+ if (!psdev)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (initialize_devices) {
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* don't want irqs disabled when calling pcistub_init_device */
+ err = pcistub_init_device(psdev->dev);
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (!err)
+ list_add(&psdev->dev_list, &pcistub_devices);
+ } else {
+ dev_dbg(&dev->dev, "deferring initialization\n");
+ list_add(&psdev->dev_list, &seized_devices);
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (err)
+ pcistub_device_put(psdev);
+
+ return err;
+}
+
+static int __devinit pcistub_probe(struct pci_dev *dev,
+ const struct pci_device_id *id)
+{
+ int err = 0;
+
+ dev_dbg(&dev->dev, "probing...\n");
+
+ if (pcistub_match(dev)) {
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+ dev_err(&dev->dev, "can't export pci devices that "
+ "don't have a normal (0) or bridge (1) "
+ "header type!\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ dev_info(&dev->dev, "seizing device\n");
+ err = pcistub_seize(dev);
+ } else
+ /* Didn't find the device */
+ err = -ENODEV;
+
+out:
+ return err;
+}
+
+static void pcistub_remove(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ dev_dbg(&dev->dev, "removing\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ pciback_config_quirk_release(dev);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (found_psdev) {
+ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+ found_psdev->pdev);
+
+ if (found_psdev->pdev) {
+ printk(KERN_WARNING "pciback: ****** removing device "
+ "%s while still in-use! ******\n",
+ pci_name(found_psdev->dev));
+ printk(KERN_WARNING "pciback: ****** driver domain may "
+ "still access this device's i/o resources!\n");
+ printk(KERN_WARNING "pciback: ****** shutdown driver "
+ "domain before binding device\n");
+ printk(KERN_WARNING "pciback: ****** to other drivers "
+ "or domains\n");
+
+ pciback_release_pci_dev(found_psdev->pdev,
+ found_psdev->dev);
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_del(&found_psdev->dev_list);
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* the final put for releasing from the list */
+ pcistub_device_put(found_psdev);
+ }
+}
+
+static const struct pci_device_id pcistub_ids[] = {
+ {
+ .vendor = PCI_ANY_ID,
+ .device = PCI_ANY_ID,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ },
+ {0,},
+};
+
+#define PCI_NODENAME_MAX 40
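+/* Write "aerState" = "aerfail" into the backend's xenstore node so that the
+ * domain owning this device can be torn down when AER recovery fails or is
+ * not possible.
+ */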
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ char nodename[PCI_NODENAME_MAX];
+
+	if (!psdev) {
+		pr_err("pciback: device is NULL when doing AER recovery/kill_domain\n");
+		return;
+	}
+	snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
+		 psdev->pdev->xdev->otherend_id);
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d when start xenbus transaction\n", err);
+ return;
+ }
+ /*PV AER handlers will set this flag*/
+ xenbus_printf(xbt, nodename, "aerState" , "aerfail");
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ dev_err(&psdev->dev->dev,
+ "error %d when end xenbus transaction\n", err);
+ return;
+ }
+}
+
+/* For each AER recovery step (error_detected, mmio_enabled, etc.) the
+ * frontend and backend need to cooperate. In pciback each step does a
+ * similar job: send a service request and wait for the frontend's response.
+ */
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
+{
+ pci_ers_result_t res = result;
+ struct xen_pcie_aer_op *aer_op;
+ int ret;
+
+ /*with PV AER drivers*/
+ aer_op = &(psdev->pdev->sh_info->aer_op);
+	aer_op->cmd = aer_cmd;
+ /*useful for error_detected callback*/
+ aer_op->err = state;
+ /*pcifront_end BDF*/
+ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
+ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
+ if (!ret) {
+ dev_err(&psdev->dev->dev,
+ "pciback: failed to get pcifront device\n");
+ return PCI_ERS_RESULT_NONE;
+ }
+ wmb();
+
+ dev_dbg(&psdev->dev->dev,
+ "pciback: aer_op %x dom %x bus %x devfn %x\n",
+ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+	/* Local flag marking that an AER request is in flight; the pciback
+	 * event callback uses it to decide whether to check for the AER
+	 * service ack signal from pcifront.
+	 */
+ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+	/* It is possible that a pcifront conf_read_write request invokes
+	 * the callback and causes a spurious wake_up. That is harmless,
+	 * and cheaper than taking a spinlock here.
+	 */
+ set_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ wmb();
+ notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
+
+ if (!ret) {
+ if (test_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&psdev->dev->dev,
+ "pcifront aer process not responding!\n");
+ clear_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ aer_op->err = PCI_ERS_RESULT_NONE;
+ return res;
+ }
+ }
+ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ if (test_bit(_XEN_PCIF_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&psdev->dev->dev,
+			"schedule pci_conf service in pciback\n");
+ test_and_schedule_op(psdev->pdev);
+ }
+
+ res = (pci_ers_result_t)aer_op->err;
+ return res;
+}
+
+/*
+ * pciback_slot_reset: send a slot_reset request to pcifront in case the
+ * device driver can provide this service, then wait for the pcifront ack.
+ * @dev: pointer to the PCI device
+ * The return value is used by the AER core's do_recovery policy.
+ */
+static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or is "
+			"owned by an HVM guest; killing the domain\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto release;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER slot_reset service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+	return result;
+}
+
+/*
+ * pciback_mmio_enabled: send an mmio_enabled request to pcifront in case
+ * the device driver can provide this service, then wait for the pcifront
+ * ack.
+ * @dev: pointer to the PCI device
+ * The return value is used by the AER core's do_recovery policy.
+ */
+
+static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or is "
+			"owned by an HVM guest; killing the domain\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto release;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER mmio_enabled service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*
+ * pciback_error_detected: send an error_detected request to pcifront in
+ * case the device driver can provide this service, then wait for the
+ * pcifront ack.
+ * @dev: pointer to the PCI device
+ * @error: the current PCI connection state
+ * The return value is used by the AER core's do_recovery policy.
+ */
+
+static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
+ pci_channel_state_t error)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_CAN_RECOVER;
+ dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or is "
+			"owned by an HVM guest; killing the domain\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+	/* Guest owns the device but no AER handler is registered; kill it */
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER error_detected service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*
+ * pciback_error_resume: send an error_resume request to pcifront in case
+ * the device driver can provide this service, then wait for the pcifront
+ * ack.
+ * @dev: pointer to the PCI device
+ */
+
+static void pciback_error_resume(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or is "
+			"owned by an HVM guest; killing the domain\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+ PCI_ERS_RESULT_RECOVERED);
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return;
+}
+
+/*add pciback AER handling*/
+static struct pci_error_handlers pciback_error_handler = {
+ .error_detected = pciback_error_detected,
+ .mmio_enabled = pciback_mmio_enabled,
+ .slot_reset = pciback_slot_reset,
+ .resume = pciback_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver pciback_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = pcistub_ids,
+ .probe = pcistub_probe,
+ .remove = pcistub_remove,
+ .err_handler = &pciback_error_handler,
+};
+
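+/* Parse a "[domain:]bus:slot.func" string (all fields hexadecimal) into its
+ * components; the domain defaults to 0 when omitted.
+ */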
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+ int *slot, int *func)
+{
+ int err;
+
+ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
+ if (err == 4)
+ return 0;
+ else if (err < 0)
+ return -EINVAL;
+
+ /* try again without domain */
+ *domain = 0;
+ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
+ if (err == 3)
+ return 0;
+
+ return -EINVAL;
+}
+
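+/* Parse a "domain:bus:slot.func-reg:size:mask" quirk specification (all
+ * fields hexadecimal) as written to the "quirks" sysfs attribute.
+ */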
+static inline int str_to_quirk(const char *buf, int *domain, int *bus,
+			       int *slot, int *func, int *reg, int *size,
+			       int *mask)
+{
+ int err;
+
+ err =
+ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
+ func, reg, size, mask);
+ if (err == 7)
+ return 0;
+ return -EINVAL;
+}
+
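+/* Append a slot specification to the list of devices pciback will seize as
+ * they are probed. The same slot may appear more than once; removal deletes
+ * every matching entry.
+ */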
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id;
+ unsigned long flags;
+
+ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+ if (!pci_dev_id)
+ return -ENOMEM;
+
+ pci_dev_id->domain = domain;
+ pci_dev_id->bus = bus;
+ pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
+ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
+ domain, bus, slot, func);
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return 0;
+}
+
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id, *t;
+ int devfn = PCI_DEVFN(slot, func);
+ int err = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
+ slot_list) {
+ if (pci_dev_id->domain == domain
+ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
+ /* Don't break; here because it's possible the same
+ * slot could be in the list more than once
+ */
+ list_del(&pci_dev_id->slot_list);
+ kfree(pci_dev_id);
+
+ err = 0;
+
+ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
+ "seize list\n", domain, bus, slot, func);
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return err;
+}
+
+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
+ int size, int mask)
+{
+ int err = 0;
+ struct pcistub_device *psdev;
+ struct pci_dev *dev;
+ struct config_field *field;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev) {
+		err = -ENODEV;
+		goto out;
+	}
+	if (!psdev->dev) {
+		err = -ENODEV;
+		goto release;
+	}
+ dev = psdev->dev;
+
+ field = kzalloc(sizeof(*field), GFP_ATOMIC);
+ if (!field) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ field->offset = reg;
+ field->size = size;
+ field->mask = mask;
+ field->init = NULL;
+ field->reset = NULL;
+ field->release = NULL;
+ field->clean = pciback_config_field_free;
+
+	err = pciback_config_quirks_add_field(dev, field);
+	if (err)
+		kfree(field);
+release:
+	pcistub_device_put(psdev);
+out:
+	return err;
+}
+
+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+
+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device_id *pci_dev_id;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+ if (count >= PAGE_SIZE)
+ break;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%04x:%02x:%02x.%01x\n",
+ pci_dev_id->domain, pci_dev_id->bus,
+ PCI_SLOT(pci_dev_id->devfn),
+ PCI_FUNC(pci_dev_id->devfn));
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+
+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
+static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count, "%s:%s:%sing:%ld\n",
+ pci_name(psdev->dev),
+ dev_data->isr_on ? "on" : "off",
+ dev_data->ack_intr ? "ack" : "not ack",
+ dev_data->handled);
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+
+DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
+
+static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
+ const char *buf,
+ size_t count)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ int domain, bus, slot, func;
+ int err = -ENOENT;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	dev_data = pci_get_drvdata(psdev->dev);
+	if (!dev_data) {
+		err = -ENOENT;
+		goto release;
+	}
+
+	dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
+		dev_data->irq_name, dev_data->isr_on,
+		!dev_data->isr_on);
+
+	dev_data->isr_on = !(dev_data->isr_on);
+	if (dev_data->isr_on)
+		dev_data->ack_intr = 1;
+release:
+	pcistub_device_put(psdev);
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch);
+
+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func, reg, size, mask;
+ int err;
+
+	err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
+ &mask);
+ if (err)
+ goto out;
+
+ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
+ int count = 0;
+ unsigned long flags;
+ struct pciback_config_quirk *quirk;
+ struct pciback_dev_data *dev_data;
+ const struct config_field *field;
+ const struct config_field_entry *cfg_entry;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+ quirk->pdev->bus->number,
+ PCI_SLOT(quirk->pdev->devfn),
+ PCI_FUNC(quirk->pdev->devfn),
+ quirk->devid.vendor, quirk->devid.device,
+ quirk->devid.subvendor,
+ quirk->devid.subdevice);
+
+ dev_data = pci_get_drvdata(quirk->pdev);
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "\t\t%08x:%01x:%08x\n",
+ cfg_entry->base_offset +
+ field->offset, field->size,
+ field->mask);
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+
+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
+
+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (!psdev->dev) {
+ err = -ENODEV;
+ goto release;
+ }
+ dev_data = pci_get_drvdata(psdev->dev);
+ /* the driver data for a device should never be null at this point */
+ if (!dev_data) {
+ err = -ENXIO;
+ goto release;
+ }
+ if (!dev_data->permissive) {
+ dev_data->permissive = 1;
+ /* Let user know that what they're doing could be unsafe */
+ dev_warn(&psdev->dev->dev, "enabling permissive mode "
+ "configuration space accesses!\n");
+ dev_warn(&psdev->dev->dev,
+ "permissive mode is potentially unsafe!\n");
+ }
+release:
+ pcistub_device_put(psdev);
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data || !dev_data->permissive)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+ pci_name(psdev->dev));
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+
+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+
+static void pcistub_exit(void)
+{
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_remove_slot);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handlers);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handler_state);
+ pci_unregister_driver(&pciback_pci_driver);
+}
+
+static int __init pcistub_init(void)
+{
+ int pos = 0;
+ int err = 0;
+ int domain, bus, slot, func;
+ int parsed;
+
+ if (pci_devs_to_hide && *pci_devs_to_hide) {
+ do {
+ parsed = 0;
+
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x:%x.%x) %n",
+ &domain, &bus, &slot, &func, &parsed);
+ if (err != 4) {
+ domain = 0;
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x.%x) %n",
+ &bus, &slot, &func, &parsed);
+ if (err != 3)
+ goto parse_error;
+ }
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+ if (err)
+ goto out;
+
+ /* if parsed<=0, we've reached the end of the string */
+ pos += parsed;
+ } while (parsed > 0 && pci_devs_to_hide[pos]);
+ }
+
+ /* If we're the first PCI Device Driver to register, we're the
+ * first one to get offered PCI devices as they become
+ * available (and thus we can be the first to grab them)
+ */
+ err = pci_register_driver(&pciback_pci_driver);
+ if (err < 0)
+ goto out;
+
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_new_slot);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_remove_slot);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_slots);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_quirks);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_permissive);
+
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handlers);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handler_state);
+ if (err)
+ pcistub_exit();
+
+out:
+ return err;
+
+parse_error:
+ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
+ pci_devs_to_hide + pos);
+ return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * fs_initcall happens before device_initcall
+ * so pciback *should* get called first (b/c we
+ * want to suck up any device before other drivers
+ * get a chance by being the first pci device
+ * driver to register)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+static int __init pciback_init(void)
+{
+ int err;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ err = pciback_config_init();
+ if (err)
+ return err;
+
+#ifdef MODULE
+ err = pcistub_init();
+ if (err < 0)
+ return err;
+#endif
+
+ pcistub_init_devices_late();
+ err = pciback_xenbus_register();
+ if (err)
+ pcistub_exit();
+
+ return err;
+}
+
+static void __exit pciback_cleanup(void)
+{
+ pciback_xenbus_unregister();
+ pcistub_exit();
+}
+
+module_init(pciback_init);
+module_exit(pciback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
new file mode 100644
index 0000000..fc31052
--- /dev/null
+++ b/drivers/xen/pciback/pciback.h
@@ -0,0 +1,142 @@
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <asm/atomic.h>
+#include <xen/interface/io/pciif.h>
+
+struct pci_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+
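+/* Bits in pciback_device->flags: an op is currently being serviced, and an
+ * AER request is pending a frontend ack, respectively.
+ */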
+#define _PDEVF_op_active (0)
+#define PDEVF_op_active (1<<(_PDEVF_op_active))
+#define _PCIB_op_pending (1)
+#define PCIB_op_pending (1<<(_PCIB_op_pending))
+
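+/* Per-frontend backend instance: the shared ring page, its event channel,
+ * and the work item that services configuration-space requests.
+ */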
+struct pciback_device {
+ void *pci_dev_data;
+ spinlock_t dev_lock;
+
+ struct xenbus_device *xdev;
+
+ struct xenbus_watch be_watch;
+ u8 be_watching;
+
+ int evtchn_irq;
+
+ struct xen_pci_sharedinfo *sh_info;
+
+ unsigned long flags;
+
+ struct work_struct op_work;
+};
+
+struct pciback_dev_data {
+ struct list_head config_fields;
+ unsigned int permissive : 1;
+ unsigned int warned_on_write : 1;
+ unsigned int enable_intx : 1;
+ unsigned int isr_on : 1; /* Whether the IRQ handler is installed. */
+ unsigned int ack_intr : 1; /* .. and ACK-ing */
+ unsigned long handled;
+ unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+ char irq_name[0]; /* pciback[000:04:00.0] */
+};
+
+/* Used by XenBus and pciback_ops.c */
+extern wait_queue_head_t aer_wait_queue;
+extern struct workqueue_struct *pciback_wq;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head pciback_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+ int domain, int bus,
+ int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+ struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void pciback_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int pciback_config_init(void);
+int pciback_config_init_dev(struct pci_dev *dev);
+void pciback_config_free_dyn_fields(struct pci_dev *dev);
+void pciback_config_reset_dev(struct pci_dev *dev);
+void pciback_config_free_dev(struct pci_dev *dev);
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val);
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus);
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb);
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn);
+
+/*
+ * Added for dom0 PCIe AER handling: get the guest domain/bus/devfn in
+ * pciback before sending an AER request to pcifront, so that the guest can
+ * identify the device and cooperate with pciback to finish the AER recovery
+ * job, provided the device driver has the capability.
+ */
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn);
+int pciback_init_devices(struct pciback_device *pdev);
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb cb);
+void pciback_release_devices(struct pciback_device *pdev);
+
+/* Handles events from front-end */
+irqreturn_t pciback_handle_event(int irq, void *dev_id);
+void pciback_do_op(struct work_struct *data);
+
+int pciback_xenbus_register(void);
+void pciback_xenbus_unregister(void);
+
+#ifdef CONFIG_PCI_MSI
+int pciback_enable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+
+
+int pciback_enable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+#endif
+extern int verbose_request;
+
+void test_and_schedule_op(struct pciback_device *pdev);
+
+/* Handles shared IRQs that can be routed to both the device domain and the
+ * control domain. */
+void pciback_control_isr(struct pci_dev *dev, int reset);
+irqreturn_t pciback_guest_interrupt(int irq, void *dev_id);
+
+#endif /* __XEN_PCIBACK_H__ */
diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
new file mode 100644
index 0000000..5543881
--- /dev/null
+++ b/drivers/xen/pciback/pciback_ops.c
@@ -0,0 +1,242 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <xen/events.h>
+#include <linux/sched.h>
+#include "pciback.h"
+
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+/* Ensure a device has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after pciback_reset_device,
+ * which does the actual PCI device enable/disable.
+ */
+void pciback_control_isr(struct pci_dev *dev, int reset)
+{
+ struct pciback_dev_data *dev_data;
+ int rc;
+ int enable = 0;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!dev_data)
+ return;
+
+ /* We don't deal with bridges */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return;
+
+ if (reset) {
+ dev_data->enable_intx = 0;
+ dev_data->ack_intr = 0;
+ }
+ enable = dev_data->enable_intx;
+
+	/* Asked to disable, but ISR isn't running */
+ if (!enable && !dev_data->isr_on)
+ return;
+
+ /* Squirrel away the IRQs in the dev_data. We need this
+ * b/c when device transitions to MSI, the dev->irq is
+ * overwritten with the MSI vector.
+ */
+ if (enable)
+ dev_data->irq = dev->irq;
+
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ dev_data->isr_on ? "enable" : "disable",
+ enable ? "enable" : "disable");
+
+ if (enable) {
+ rc = request_irq(dev_data->irq,
+ pciback_guest_interrupt, IRQF_SHARED,
+ dev_data->irq_name, dev);
+ if (rc) {
+ dev_err(&dev->dev, "%s: failed to install fake IRQ " \
+ "handler for IRQ %d! (rc:%d)\n", dev_data->irq_name,
+ dev_data->irq, rc);
+ goto out;
+ }
+	} else {
+ free_irq(dev_data->irq, dev);
+ dev_data->irq = 0;
+ }
+ dev_data->isr_on = enable;
+ dev_data->ack_intr = enable;
+out:
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
+ (dev_data->isr_on ? "failed to disable" : "disabled"));
+}
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see pciback_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void pciback_reset_device(struct pci_dev *dev)
+{
+ u16 cmd;
+
+ pciback_control_isr(dev, 1 /* reset device */);
+
+ /* Disable devices (but not bridges) */
+ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+#ifdef CONFIG_PCI_MSI
+ /* The guest could have been abruptly killed without
+ * disabling MSI/MSI-X interrupts.*/
+ if (dev->msix_enabled)
+ pci_disable_msix(dev);
+ if (dev->msi_enabled)
+ pci_disable_msi(dev);
+#endif
+ pci_disable_device(dev);
+
+ pci_write_config_word(dev, PCI_COMMAND, 0);
+
+ dev->is_busmaster = 0;
+ } else {
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+ if (cmd & (PCI_COMMAND_INVALIDATE)) {
+ cmd &= ~(PCI_COMMAND_INVALIDATE);
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+ dev->is_busmaster = 0;
+ }
+ }
+}
+
+/*
+ * The same event channel is now used both for pcifront conf_read_write
+ * requests and for the PCIe AER frontend ack. We use a dedicated workqueue
+ * to schedule the pciback conf_read_write service, avoiding conflicts with
+ * the AER core's do_recovery job, which uses the system default workqueue.
+ */
+void test_and_schedule_op(struct pciback_device *pdev)
+{
+ /* Check that frontend is requesting an operation and that we are not
+ * already processing a request */
+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+ queue_work(pciback_wq, &pdev->op_work);
+ }
+	/* _XEN_PCIB_active should have been cleared by pcifront. Also make
+	 * sure pciback is waiting for an ack by checking _PCIB_op_pending.
+	 */
+ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+ && test_bit(_PCIB_op_pending, &pdev->flags)) {
+ wake_up(&aer_wait_queue);
+ }
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct pciback_device as a parameter */
+
+void pciback_do_op(struct work_struct *data)
+{
+ struct pciback_device *pdev =
+ container_of(data, struct pciback_device, op_work);
+ struct pci_dev *dev;
+ struct pciback_dev_data *dev_data = NULL;
+ struct xen_pci_op *op = &pdev->sh_info->op;
+ int test_intx = 0;
+
+ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+ if (dev == NULL)
+ op->err = XEN_PCI_ERR_dev_not_found;
+ else {
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ test_intx = dev_data->enable_intx;
+ switch (op->cmd) {
+ case XEN_PCI_OP_conf_read:
+ op->err = pciback_config_read(dev,
+ op->offset, op->size, &op->value);
+ break;
+ case XEN_PCI_OP_conf_write:
+ op->err = pciback_config_write(dev,
+ op->offset, op->size, op->value);
+ break;
+#ifdef CONFIG_PCI_MSI
+ case XEN_PCI_OP_enable_msi:
+ op->err = pciback_enable_msi(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_disable_msi:
+ op->err = pciback_disable_msi(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_enable_msix:
+ op->err = pciback_enable_msix(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_disable_msix:
+ op->err = pciback_disable_msix(pdev, dev, op);
+ break;
+#endif
+ default:
+ op->err = XEN_PCI_ERR_not_implemented;
+ break;
+ }
+ }
+ if (!op->err && dev && dev_data) {
+ /* Transition detected */
+		if (dev_data->enable_intx != test_intx)
+ pciback_control_isr(dev, 0 /* no reset */);
+ }
+ /* Tell the driver domain that we're done. */
+ wmb();
+ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_irq(pdev->evtchn_irq);
+
+ /* Mark that we're done. */
+ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
+ clear_bit(_PDEVF_op_active, &pdev->flags);
+ smp_mb__after_clear_bit(); /* /before/ final check for work */
+
+ /* Check to see if the driver domain tried to start another request in
+ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
+ */
+ test_and_schedule_op(pdev);
+}
+
+irqreturn_t pciback_handle_event(int irq, void *dev_id)
+{
+ struct pciback_device *pdev = dev_id;
+
+ test_and_schedule_op(pdev);
+
+ return IRQ_HANDLED;
+}
+
+irqreturn_t pciback_guest_interrupt(int irq, void *dev_id)
+{
+ struct pci_dev *dev = (struct pci_dev *)dev_id;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+ if (dev_data->isr_on && dev_data->ack_intr) {
+ dev_data->handled++;
+ if ((dev_data->handled % 1000) == 0) {
+ if (xen_ignore_irq(irq)) {
+ printk(KERN_INFO "%s IRQ line is not shared "
+ "with other domains. Turning ISR off\n",
+ dev_data->irq_name);
+ dev_data->ack_intr = 0;
+ }
+ }
+ return IRQ_HANDLED;
+ }
+ return IRQ_NONE;
+}
diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
new file mode 100644
index 0000000..efb922d
--- /dev/null
+++ b/drivers/xen/pciback/slot.c
@@ -0,0 +1,191 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
+ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+/* There are at most 32 slots in a pci bus. */
+#define PCI_SLOT_MAX 32
+
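+/* Number of virtual buses exported by the slot-based backend. */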
+#define PCI_BUS_NBR 2
+
+struct slot_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
+ spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev *dev = NULL;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if (domain != 0 || PCI_FUNC(devfn) != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
+ return NULL;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (slot_dev->slots[bus][slot] == NULL) {
+ printk(KERN_INFO
+ "pciback: slot: %s: assign to virtual "
+ "slot %d, bus %d\n",
+ pci_name(dev), slot, bus);
+ slot_dev->slots[bus][slot] = dev;
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+unlock:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ /* Publish this device. */
+ if (!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
+
+out:
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (slot_dev->slots[bus][slot] == dev) {
+ slot_dev->slots[bus][slot] = NULL;
+ found_dev = dev;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev;
+
+ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
+ if (!slot_dev)
+ return -ENOMEM;
+
+ spin_lock_init(&slot_dev->lock);
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+ slot_dev->slots[bus][slot] = NULL;
+
+ pdev->pci_dev_data = slot_dev;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *dev;
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ dev = slot_dev->slots[bus][slot];
+ if (dev != NULL)
+ pcistub_put_pci_dev(dev);
+ }
+
+ kfree(slot_dev);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+{
+ int slot, busnr;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *dev;
+ int found = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+	for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ dev = slot_dev->slots[busnr][slot];
+ if (dev && dev->bus->number == pcidev->bus->number
+ && dev->devfn == pcidev->devfn
+ && pci_domain_nr(dev->bus) ==
+ pci_domain_nr(pcidev->bus)) {
+ found = 1;
+ *domain = 0;
+ *bus = busnr;
+ *devfn = PCI_DEVFN(slot, 0);
+ goto out;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+	return found;
+}
diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
new file mode 100644
index 0000000..2857ab8
--- /dev/null
+++ b/drivers/xen/pciback/vpci.c
@@ -0,0 +1,244 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+
+struct vpci_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list[PCI_SLOT_MAX];
+ spinlock_t lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+ return head->next;
+}
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if (domain != 0 || bus != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[PCI_SLOT(devfn)],
+ list) {
+ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+ dev = entry->dev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+ }
+ return dev;
+}
+
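+/* Two physical functions share a virtual slot when they live on the same
+ * bus and slot within the same PCI domain.
+ */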
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+ return 1;
+
+ return 0;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, func = -1;
+ struct pci_dev_entry *t, *dev_entry;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error adding entry to virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ /* Keep multi-function devices together on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (!list_empty(&vpci_dev->dev_list[slot])) {
+ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+ struct pci_dev_entry, list);
+
+ if (match_slot(dev, t->dev)) {
+ pr_info("pciback: vpci: %s: "
+ "assign to virtual slot %d func %d\n",
+ pci_name(dev), slot,
+ PCI_FUNC(dev->devfn));
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+ }
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (list_empty(&vpci_dev->dev_list[slot])) {
+ printk(KERN_INFO
+ "pciback: vpci: %s: assign to virtual slot %d\n",
+ pci_name(dev), slot);
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+unlock:
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+ /* Publish this device. */
+ if (!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
+out:
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ if (e->dev == dev) {
+ list_del(&e->list);
+ found_dev = e->dev;
+ kfree(e);
+ goto out;
+ }
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev;
+
+ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
+ if (!vpci_dev)
+ return -ENOMEM;
+
+ spin_lock_init(&vpci_dev->lock);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
+ pdev->pci_dev_data = vpci_dev;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ list_del(&e->list);
+ pcistub_put_pci_dev(e->dev);
+ kfree(e);
+ }
+ }
+
+ kfree(vpci_dev);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+ int found = 0, slot;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[slot],
+ list) {
+ dev = entry->dev;
+ if (dev && dev->bus->number == pcidev->bus->number
+ && pci_domain_nr(dev->bus) ==
+ pci_domain_nr(pcidev->bus)
+ && dev->devfn == pcidev->devfn) {
+ found = 1;
+ *domain = 0;
+ *bus = 0;
+ *devfn = PCI_DEVFN(slot,
+ PCI_FUNC(pcidev->devfn));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+ return found;
+}
diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
new file mode 100644
index 0000000..f0d5426
--- /dev/null
+++ b/drivers/xen/pciback/xenbus.c
@@ -0,0 +1,730 @@
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include "pciback.h"
+
+#define INVALID_EVTCHN_IRQ (-1)
+struct workqueue_struct *pciback_wq;
+
+static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
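+/* Allocate and initialise the per-frontend state and attach it to the
+ * xenbus device; the pdev is freed again if virtual-bus setup fails.
+ */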
+{
+ struct pciback_device *pdev;
+
+ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+ pdev->xdev = xdev;
+ dev_set_drvdata(&xdev->dev, pdev);
+
+ spin_lock_init(&pdev->dev_lock);
+
+ pdev->sh_info = NULL;
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ pdev->be_watching = 0;
+
+ INIT_WORK(&pdev->op_work, pciback_do_op);
+
+ if (pciback_init_devices(pdev)) {
+ kfree(pdev);
+ pdev = NULL;
+ }
+out:
+ return pdev;
+}
+
+static void pciback_disconnect(struct pciback_device *pdev)
+{
+ spin_lock(&pdev->dev_lock);
+
+ /* Ensure the guest can't trigger our handler before removing devices */
+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ }
+ spin_unlock(&pdev->dev_lock);
+
+ /* If the driver domain started an op, make sure we complete it
+ * before releasing the shared memory */
+
+ /* Note, the workqueue does not use spinlocks at all.*/
+ flush_workqueue(pciback_wq);
+
+ spin_lock(&pdev->dev_lock);
+ if (pdev->sh_info != NULL) {
+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+ pdev->sh_info = NULL;
+ }
+ spin_unlock(&pdev->dev_lock);
+}
+
+static void free_pdev(struct pciback_device *pdev)
+{
+ spin_lock(&pdev->dev_lock);
+ if (pdev->be_watching) {
+ unregister_xenbus_watch(&pdev->be_watch);
+ pdev->be_watching = 0;
+ }
+ spin_unlock(&pdev->dev_lock);
+
+ pciback_disconnect(pdev);
+
+ pciback_release_devices(pdev);
+
+ dev_set_drvdata(&pdev->xdev->dev, NULL);
+ pdev->xdev = NULL;
+
+ kfree(pdev);
+}
+
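+/* Map the frontend's shared page and bind its event channel, once the
+ * frontend has published gnt_ref and evtchn in xenstore.
+ */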
+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
+ int remote_evtchn)
+{
+ int err = 0;
+ void *vaddr;
+
+ dev_dbg(&pdev->xdev->dev,
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ gnt_ref, remote_evtchn);
+
+ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error mapping other domain page in ours.");
+ goto out;
+ }
+
+ spin_lock(&pdev->dev_lock);
+ pdev->sh_info = vaddr;
+ spin_unlock(&pdev->dev_lock);
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
+ 0, "pciback", pdev);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error binding event channel to IRQ");
+ goto out;
+ }
+
+ spin_lock(&pdev->dev_lock);
+ pdev->evtchn_irq = err;
+ spin_unlock(&pdev->dev_lock);
+ err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+ return err;
+}
+
+static int pciback_attach(struct pciback_device *pdev)
+{
+ int err = 0;
+ int gnt_ref, remote_evtchn;
+ char *magic = NULL;
+
+ /* Make sure we only do this setup once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ /* Wait for frontend to state that it has published the configuration */
+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+ XenbusStateInitialised)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+ "pci-op-ref", "%u", &gnt_ref,
+ "event-channel", "%u", &remote_evtchn,
+ "magic", NULL, &magic, NULL);
+ if (err) {
+ /* If configuration didn't get read correctly, wait longer */
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading configuration from frontend");
+ goto out;
+ }
+
+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+ xenbus_dev_fatal(pdev->xdev, -EFAULT,
+ "version mismatch (%s/%s) with pcifront - "
+ "halting pciback",
+ magic, XEN_PCI_MAGIC);
+ goto out;
+ }
+
+ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
+ if (err)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to connected state!");
+
+ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+out:
+
+ kfree(magic);
+
+ return err;
+}
+
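+/* Record an exported device in xenstore as "vdev-N" so the frontend can
+ * discover which physical BDF backs virtual device N.
+ */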
+static int pciback_publish_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid)
+{
+ int err;
+ int len;
+ char str[64];
+
+ len = snprintf(str, sizeof(str), "vdev-%d", devid);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x:%02x.%02x", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+out:
+ return err;
+}
+
+static int pciback_export_device(struct pciback_device *pdev,
+ int domain, int bus, int slot, int func,
+ int devid)
+{
+ struct pci_dev *dev;
+ int err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+ if (!dev) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%01x)! "
+ "perhaps already in-use?",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
+ if (err)
+ goto out;
+
+ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
+ if (xen_register_device_domain_owner(dev,
+ pdev->xdev->otherend_id) != 0) {
+		dev_err(&dev->dev, "device has been assigned to another "
+			"domain! Overwriting the ownership, but beware.\n");
+ xen_unregister_device_domain_owner(dev);
+ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
+ }
+
+ /* TODO: It'd be nice to export a bridge and have all of its children
+ * get exported with it. This may be best done in xend (which will
+ * have to calculate resource usage anyway) but we probably want to
+ * put something in here to ensure that if a bridge gets given to a
+ * driver domain, that all devices under that bridge are not given
+ * to other driver domains (as he who controls the bridge can disable
+ * it and stop the other devices from working).
+ */
+out:
+ return err;
+}
+
+static int pciback_remove_device(struct pciback_device *pdev,
+ int domain, int bus, int slot, int func)
+{
+ int err = 0;
+ struct pci_dev *dev;
+
+ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
+ if (!dev) {
+ err = -EINVAL;
+ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
+ xen_unregister_device_domain_owner(dev);
+
+ pciback_release_pci_dev(pdev, dev);
+
+out:
+ return err;
+}
+
+static int pciback_publish_pci_root(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ unsigned int d, b;
+ int i, root_num, len, err;
+ char str[64];
+
+ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+ if (err == 0 || err == -ENOENT)
+ root_num = 0;
+ else if (err < 0)
+ goto out;
+
+ /* Verify that we haven't already published this pci root */
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &d, &b);
+ if (err < 0)
+ goto out;
+ if (err != 2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (d == domain && b == bus) {
+ err = 0;
+ goto out;
+ }
+ }
+
+ len = snprintf(str, sizeof(str), "root-%d", root_num);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+ root_num, domain, bus);
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x", domain, bus);
+ if (err)
+ goto out;
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", (root_num + 1));
+
+out:
+ return err;
+}
+
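+/* Walk the "state-N"/"dev-N" xenstore entries and export or detach devices,
+ * allowing a running frontend to be given or stripped of devices.
+ */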
+static int pciback_reconfigure(struct pciback_device *pdev)
+{
+ int err = 0;
+ int num_devs;
+ int domain, bus, slot, func;
+ int substate;
+ int i, len;
+ char state_str[64];
+ char dev_str[64];
+
+ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+ /* Make sure we only reconfigure once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateReconfiguring)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(len >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", &substate);
+ if (err != 1)
+ substate = XenbusStateUnknown;
+
+ switch (substate) {
+ case XenbusStateInitialising:
+ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_export_device(pdev, domain, bus, slot,
+ func, i);
+ if (err)
+ goto out;
+
+ /* Publish pci roots. */
+ err = pciback_publish_pci_roots(pdev,
+ pciback_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+						"Error while publishing PCI "
+						"root buses for frontend");
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ state_str, "%d",
+ XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching substate of "
+ "dev-%d\n", i);
+ goto out;
+ }
+ break;
+
+ case XenbusStateClosing:
+ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_remove_device(pdev, domain, bus, slot,
+ func);
+ if (err)
+ goto out;
+
+ /* TODO: If at some point we implement support for pci
+ * root hot-remove on pcifront side, we'll need to
+ * remove unnecessary xenstore nodes of pci roots here.
+ */
+
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to reconfigured state!");
+ goto out;
+ }
+
+out:
+ return 0;
+}
+
+static void pciback_frontend_changed(struct xenbus_device *xdev,
+ enum xenbus_state fe_state)
+{
+ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
+
+ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+ switch (fe_state) {
+ case XenbusStateInitialised:
+ pciback_attach(pdev);
+ break;
+
+ case XenbusStateReconfiguring:
+ pciback_reconfigure(pdev);
+ break;
+
+ case XenbusStateConnected:
+ /* pcifront switched its state from reconfiguring to connected.
+ * Then switch to connected state.
+ */
+ xenbus_switch_state(xdev, XenbusStateConnected);
+ break;
+
+ case XenbusStateClosing:
+ pciback_disconnect(pdev);
+ xenbus_switch_state(xdev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ pciback_disconnect(pdev);
+ xenbus_switch_state(xdev, XenbusStateClosed);
+ if (xenbus_dev_is_online(xdev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+ device_unregister(&xdev->dev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pciback_setup_backend(struct pciback_device *pdev)
+{
+ /* Get configuration from xend (if available now) */
+ int domain, bus, slot, func;
+ int err = 0;
+ int i, num_devs;
+ char dev_str[64];
+ char state_str[64];
+
+ /* It's possible we could get the call to setup twice, so make sure
+ * we're not already connected.
+ */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitWait)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(l >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_export_device(pdev, domain, bus, slot, func, i);
+ if (err)
+ goto out;
+
+ /* Switch substate of this device. */
+ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(l >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+ "substate of dev-%d\n", i);
+ goto out;
+ }
+ }
+
+ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+				 "Error while publishing PCI root "
+				 "buses for frontend");
+ goto out;
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to initialised state!");
+
+out:
+ if (!err)
+ /* see if pcifront is already configured (if not, we'll wait) */
+ pciback_attach(pdev);
+
+ return err;
+}
+
+static void pciback_be_watch(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct pciback_device *pdev =
+ container_of(watch, struct pciback_device, be_watch);
+
+ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+ case XenbusStateInitWait:
+ pciback_setup_backend(pdev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pciback_xenbus_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err = 0;
+ struct pciback_device *pdev = alloc_pdev(dev);
+
+ if (pdev == NULL) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(dev, err,
+ "Error allocating pciback_device struct");
+ goto out;
+ }
+
+ /* wait for xend to configure us */
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto out;
+
+ /* watch the backend node for backend configuration information */
+ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+ pciback_be_watch);
+ if (err)
+ goto out;
+
+ spin_lock(&pdev->dev_lock);
+ pdev->be_watching = 1;
+ spin_unlock(&pdev->dev_lock);
+
+ /* We need to force a call to our callback here in case
+ * xend already configured us!
+ */
+ pciback_be_watch(&pdev->be_watch, NULL, 0);
+
+out:
+ return err;
+}
+
+static int pciback_xenbus_remove(struct xenbus_device *dev)
+{
+ struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
+
+ if (pdev != NULL)
+ free_pdev(pdev);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+ {"pci"},
+ {""},
+};
+
+static struct xenbus_driver xenbus_pciback_driver = {
+ .name = "pciback",
+ .owner = THIS_MODULE,
+ .ids = xenpci_ids,
+ .probe = pciback_xenbus_probe,
+ .remove = pciback_xenbus_remove,
+ .otherend_changed = pciback_frontend_changed,
+};
+
+int __init pciback_xenbus_register(void)
+{
+ pciback_wq = create_workqueue("pciback_workqueue");
+ if (!pciback_wq) {
+		printk(KERN_ERR "%s: create pciback_workqueue failed\n",
+		       __func__);
+		return -ENOMEM;
+ }
+ return xenbus_register_backend(&xenbus_pciback_driver);
+}
+
+void __exit pciback_xenbus_unregister(void)
+{
+ destroy_workqueue(pciback_wq);
+ xenbus_unregister_driver(&xenbus_pciback_driver);
+}
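The dev-N values that pciback_setup_backend() and pciback_reconfigure() read are plain PCI BDF strings parsed with "%x:%x:%x.%x". A standalone, userspace-style sketch of that parse, using a made-up sample value rather than anything taken from the patch:

#include <stdio.h>

int main(void)
{
	const char *bdf = "0000:00:1b.0";	/* illustrative sample only */
	int domain, bus, slot, func;

	if (sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &slot, &func) != 4)
		return 1;	/* malformed entry, as rejected by the backend */
	printf("domain %04x bus %02x slot %02x func %x\n",
	       domain, bus, slot, func);
	return 0;
}

A well-formed entry yields all four fields (err == 4 in the kernel code); anything else is reported as a device configuration parse error.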
diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
new file mode 100644
index 0000000..6d1a770
--- /dev/null
+++ b/drivers/xen/pcpu.c
@@ -0,0 +1,452 @@
+/*
+ * pcpu.c - management of physical cpus in the dom0 environment
+ */
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <linux/cpu.h>
+#include <xen/xenbus.h>
+#include <xen/pcpu.h>
+#include <xen/events.h>
+#include <xen/acpi.h>
+
+static struct sysdev_class xen_pcpu_sysdev_class = {
+ .name = "xen_pcpu",
+};
+
+static DEFINE_MUTEX(xen_pcpu_lock);
+static RAW_NOTIFIER_HEAD(xen_pcpu_chain);
+
+/* No need to disable irqs: hotplug notification runs in workqueue context */
+#define get_pcpu_lock()	mutex_lock(&xen_pcpu_lock)
+#define put_pcpu_lock()	mutex_unlock(&xen_pcpu_lock)
+
+struct xen_pcpus {
+ struct list_head list;
+ int present;
+};
+static struct xen_pcpus xen_pcpus;
+
+int register_xen_pcpu_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+	/* All access to the notifier chain is protected by the pcpu lock */
+ get_pcpu_lock();
+ ret = raw_notifier_chain_register(&xen_pcpu_chain, nb);
+ put_pcpu_lock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_xen_pcpu_notifier);
+
+void unregister_xen_pcpu_notifier(struct notifier_block *nb)
+{
+ get_pcpu_lock();
+ raw_notifier_chain_unregister(&xen_pcpu_chain, nb);
+ put_pcpu_lock();
+}
+EXPORT_SYMBOL_GPL(unregister_xen_pcpu_notifier);
+
+static int xen_pcpu_down(uint32_t xen_id)
+{
+ int ret;
+ xen_platform_op_t op = {
+ .cmd = XENPF_cpu_offline,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.cpu_ol.cpuid = xen_id,
+ };
+
+ ret = HYPERVISOR_dom0_op(&op);
+ return ret;
+}
+
+static int xen_pcpu_up(uint32_t xen_id)
+{
+ int ret;
+ xen_platform_op_t op = {
+ .cmd = XENPF_cpu_online,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.cpu_ol.cpuid = xen_id,
+ };
+
+ ret = HYPERVISOR_dom0_op(&op);
+ return ret;
+}
+
+static ssize_t show_online(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ char *buf)
+{
+ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
+
+ return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
+}
+
+static ssize_t __ref store_online(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
+ ssize_t ret;
+
+ switch (buf[0]) {
+ case '0':
+ ret = xen_pcpu_down(cpu->xen_id);
+ break;
+ case '1':
+ ret = xen_pcpu_up(cpu->xen_id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret >= 0)
+ ret = count;
+ return ret;
+}
+
+static SYSDEV_ATTR(online, 0644, show_online, store_online);
+
+static ssize_t show_apicid(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ char *buf)
+{
+ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
+
+ return sprintf(buf, "%u\n", cpu->apic_id);
+}
+
+static ssize_t show_acpiid(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ char *buf)
+{
+ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
+
+ return sprintf(buf, "%u\n", cpu->acpi_id);
+}
+static SYSDEV_ATTR(apic_id, 0444, show_apicid, NULL);
+static SYSDEV_ATTR(acpi_id, 0444, show_acpiid, NULL);
+
+static int xen_pcpu_free(struct pcpu *pcpu)
+{
+ if (!pcpu)
+ return 0;
+
+ sysdev_remove_file(&pcpu->sysdev, &attr_online);
+ sysdev_unregister(&pcpu->sysdev);
+ list_del(&pcpu->pcpu_list);
+ kfree(pcpu);
+
+ return 0;
+}
+
+static inline int same_pcpu(struct xenpf_pcpuinfo *info,
+ struct pcpu *pcpu)
+{
+ return (pcpu->apic_id == info->apic_id) &&
+ (pcpu->xen_id == info->xen_cpuid);
+}
+
+/*
+ * Return 1 if online status changed
+ */
+static int xen_pcpu_online_check(struct xenpf_pcpuinfo *info,
+ struct pcpu *pcpu)
+{
+ int result = 0;
+
+ if (info->xen_cpuid != pcpu->xen_id)
+ return 0;
+
+ if (xen_pcpu_online(info->flags) && !xen_pcpu_online(pcpu->flags)) {
+ /* the pcpu is onlined */
+ pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
+ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_ONLINE);
+ raw_notifier_call_chain(&xen_pcpu_chain,
+ XEN_PCPU_ONLINE, (void *)(long)pcpu->xen_id);
+ result = 1;
+ } else if (!xen_pcpu_online(info->flags) &&
+ xen_pcpu_online(pcpu->flags)) {
+ /* The pcpu is offlined now */
+ pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
+ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_OFFLINE);
+ raw_notifier_call_chain(&xen_pcpu_chain,
+ XEN_PCPU_OFFLINE, (void *)(long)pcpu->xen_id);
+ result = 1;
+ }
+
+ return result;
+}
+
+static int pcpu_sysdev_init(struct pcpu *cpu)
+{
+ int error;
+
+	error = sysdev_register(&cpu->sysdev);
+	if (error) {
+		printk(KERN_WARNING "xen_pcpu_add: Failed to register pcpu\n");
+		/* The caller owns @cpu and frees it on failure; freeing it
+		 * here as well would be a double free. */
+		return error;
+	}
+ sysdev_create_file(&cpu->sysdev, &attr_online);
+ sysdev_create_file(&cpu->sysdev, &attr_apic_id);
+ sysdev_create_file(&cpu->sysdev, &attr_acpi_id);
+ return 0;
+}
+
+static struct pcpu *get_pcpu(int xen_id)
+{
+ struct pcpu *pcpu = NULL;
+
+ list_for_each_entry(pcpu, &xen_pcpus.list, pcpu_list) {
+ if (pcpu->xen_id == xen_id)
+ return pcpu;
+ }
+ return NULL;
+}
+
+static struct pcpu *init_pcpu(struct xenpf_pcpuinfo *info)
+{
+ struct pcpu *pcpu;
+
+ if (info->flags & XEN_PCPU_FLAGS_INVALID)
+ return NULL;
+
+ /* The PCPU is just added */
+ pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
+ if (!pcpu)
+ return NULL;
+
+ INIT_LIST_HEAD(&pcpu->pcpu_list);
+ pcpu->xen_id = info->xen_cpuid;
+ pcpu->apic_id = info->apic_id;
+ pcpu->acpi_id = info->acpi_id;
+ pcpu->flags = info->flags;
+
+ pcpu->sysdev.cls = &xen_pcpu_sysdev_class;
+ pcpu->sysdev.id = info->xen_cpuid;
+
+ if (pcpu_sysdev_init(pcpu)) {
+ kfree(pcpu);
+ return NULL;
+ }
+
+ list_add_tail(&pcpu->pcpu_list, &xen_pcpus.list);
+ raw_notifier_call_chain(&xen_pcpu_chain,
+ XEN_PCPU_ADD,
+ (void *)(long)pcpu->xen_id);
+ return pcpu;
+}
+
+#define PCPU_NO_CHANGE 0
+#define PCPU_ADDED 1
+#define PCPU_ONLINE_OFFLINE 2
+#define PCPU_REMOVED 3
+/*
+ * Caller should hold the pcpu lock
+ * < 0: Something wrong
+ * 0: No changes
+ * > 0: State changed
+ */
+static struct pcpu *_sync_pcpu(int cpu_num, int *max_id, int *result)
+{
+ struct pcpu *pcpu = NULL;
+ struct xenpf_pcpuinfo *info;
+ xen_platform_op_t op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+ int ret;
+
+ *result = -1;
+
+ info = &op.u.pcpu_info;
+ info->xen_cpuid = cpu_num;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return NULL;
+
+ if (max_id)
+ *max_id = op.u.pcpu_info.max_present;
+
+ pcpu = get_pcpu(cpu_num);
+
+ if (info->flags & XEN_PCPU_FLAGS_INVALID) {
+ /* The pcpu has been removed */
+ *result = PCPU_NO_CHANGE;
+ if (pcpu) {
+ raw_notifier_call_chain(&xen_pcpu_chain,
+ XEN_PCPU_REMOVE,
+ (void *)(long)pcpu->xen_id);
+ xen_pcpu_free(pcpu);
+ *result = PCPU_REMOVED;
+ }
+ return NULL;
+ }
+
+ if (!pcpu) {
+ *result = PCPU_ADDED;
+ pcpu = init_pcpu(info);
+ if (pcpu == NULL) {
+ printk(KERN_WARNING "Failed to init pcpu %x\n",
+ info->xen_cpuid);
+ *result = -1;
+ }
+ } else {
+ *result = PCPU_NO_CHANGE;
+ /*
+		 * The old pcpu was replaced with a new one, which means
+		 * several virqs were missed; can that happen?
+ */
+ if (!same_pcpu(info, pcpu)) {
+ printk(KERN_WARNING "Pcpu %x changed!\n",
+ pcpu->xen_id);
+ pcpu->apic_id = info->apic_id;
+ pcpu->acpi_id = info->acpi_id;
+ }
+ if (xen_pcpu_online_check(info, pcpu))
+ *result = PCPU_ONLINE_OFFLINE;
+ }
+ return pcpu;
+}
+
+int xen_pcpu_index(uint32_t id, int is_acpiid)
+{
+ int cpu_num = 0, max_id = 0, ret;
+ xen_platform_op_t op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+ struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
+
+ info->xen_cpuid = 0;
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return -1;
+ max_id = op.u.pcpu_info.max_present;
+
+	while (cpu_num <= max_id) {
+		info->xen_cpuid = cpu_num;
+		ret = HYPERVISOR_dom0_op(&op);
+		if (!ret) {
+			if (op.u.pcpu_info.max_present > max_id)
+				max_id = op.u.pcpu_info.max_present;
+			if (id == (is_acpiid ? info->acpi_id : info->apic_id))
+				return cpu_num;
+		}
+		/* Advance even when the query fails, or we would loop forever */
+		cpu_num++;
+	}
+
+ return -1;
+}
+EXPORT_SYMBOL(xen_pcpu_index);
+
+/*
+ * Sync dom0's pcpu information with xen hypervisor's
+ */
+static int xen_sync_pcpus(void)
+{
+ /*
+	 * The boot cpu always has cpu_id 0 in xen
+ */
+ int cpu_num = 0, max_id = 0, result = 0, present = 0;
+ struct list_head *elem, *tmp;
+ struct pcpu *pcpu;
+
+ get_pcpu_lock();
+
+ while ((result >= 0) && (cpu_num <= max_id)) {
+ pcpu = _sync_pcpu(cpu_num, &max_id, &result);
+
+ printk(KERN_DEBUG "sync cpu %x get result %x max_id %x\n",
+ cpu_num, result, max_id);
+
+ switch (result) {
+ case PCPU_NO_CHANGE:
+ if (pcpu)
+ present++;
+ break;
+		case PCPU_ADDED:
+		case PCPU_ONLINE_OFFLINE:
+			present++;
+			/* fall through */
+		case PCPU_REMOVED:
+ break;
+ default:
+ printk(KERN_WARNING "Failed to sync pcpu %x\n",
+ cpu_num);
+ break;
+
+ }
+ cpu_num++;
+ }
+
+ if (result < 0) {
+ list_for_each_safe(elem, tmp, &xen_pcpus.list) {
+ pcpu = list_entry(elem, struct pcpu, pcpu_list);
+ xen_pcpu_free(pcpu);
+ }
+ present = 0;
+ }
+
+ xen_pcpus.present = present;
+
+ put_pcpu_lock();
+
+	return (result < 0) ? result : 0;
+}
+
+static void xen_pcpu_dpc(struct work_struct *work)
+{
+ if (xen_sync_pcpus() < 0)
+ printk(KERN_WARNING
+ "xen_pcpu_dpc: Failed to sync pcpu information\n");
+}
+static DECLARE_WORK(xen_pcpu_work, xen_pcpu_dpc);
+
+int xen_pcpu_hotplug(int type, uint32_t apic_id)
+{
+ schedule_work(&xen_pcpu_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(xen_pcpu_hotplug);
+
+static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
+{
+ schedule_work(&xen_pcpu_work);
+ return IRQ_HANDLED;
+}
+
+static int __init xen_pcpu_init(void)
+{
+ int err;
+
+ if (!xen_initial_domain())
+ return 0;
+
+ err = sysdev_class_register(&xen_pcpu_sysdev_class);
+ if (err) {
+		printk(KERN_WARNING
+			"xen_pcpu_init: failed to register xen_pcpu sysdev class!\n");
+ return err;
+ }
+
+ INIT_LIST_HEAD(&xen_pcpus.list);
+ xen_pcpus.present = 0;
+
+	xen_sync_pcpus();
+	if (xen_pcpus.present > 0)
+		err = bind_virq_to_irqhandler(VIRQ_PCPU_STATE,
+			0, xen_pcpu_interrupt, 0, "pcpu", NULL);
+	if (err < 0) {
+		printk(KERN_WARNING "xen_pcpu_init: "
+			"Failed to bind pcpu_state virq; "
+			"the latest hotplug information may be lost!\n");
+		return err;
+	}
+
+	/* bind_virq_to_irqhandler() returns the bound irq on success */
+	return 0;
+}
+
+arch_initcall(xen_pcpu_init);
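register_xen_pcpu_notifier() and unregister_xen_pcpu_notifier() are the only public hooks into the chain above. A minimal sketch of a hypothetical consumer; the handler and variable names are invented, and the XEN_PCPU_* action codes are assumed to come from xen/pcpu.h as used in this file:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <xen/pcpu.h>

static int my_pcpu_event(struct notifier_block *nb,
			 unsigned long action, void *data)
{
	long xen_id = (long)data;	/* xen cpu id passed through the chain */

	switch (action) {
	case XEN_PCPU_ONLINE:
		printk(KERN_INFO "pcpu %ld came online\n", xen_id);
		break;
	case XEN_PCPU_OFFLINE:
		printk(KERN_INFO "pcpu %ld went offline\n", xen_id);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_pcpu_nb = {
	.notifier_call = my_pcpu_event,
};

/* register_xen_pcpu_notifier(&my_pcpu_nb) from an init path,
 * unregister_xen_pcpu_notifier(&my_pcpu_nb) on teardown. */

As the locking comment in the file notes, the chain is invoked from workqueue context under a mutex, so callbacks may sleep.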
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
new file mode 100644
index 0000000..8adaa3b
--- /dev/null
+++ b/drivers/xen/platform-pci.c
@@ -0,0 +1,204 @@
+/******************************************************************************
+ * platform-pci.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2005, Intel Corporation.
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <xen/platform_pci.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+
+#define DRV_NAME "xen-platform-pci"
+
+MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com");
+MODULE_DESCRIPTION("Xen platform PCI device");
+MODULE_LICENSE("GPL");
+
+static unsigned long platform_mmio;
+static unsigned long platform_mmio_alloc;
+static unsigned long platform_mmiolen;
+static uint64_t callback_via;
+
+unsigned long alloc_xen_mmio(unsigned long len)
+{
+ unsigned long addr;
+
+ addr = platform_mmio + platform_mmio_alloc;
+ platform_mmio_alloc += len;
+ BUG_ON(platform_mmio_alloc > platform_mmiolen);
+
+ return addr;
+}
+
+static uint64_t get_callback_via(struct pci_dev *pdev)
+{
+ u8 pin;
+ int irq;
+
+ irq = pdev->irq;
+ if (irq < 16)
+ return irq; /* ISA IRQ */
+
+ pin = pdev->pin;
+
+ /* We don't know the GSI. Specify the PCI INTx line instead. */
+ return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
+ ((uint64_t)pci_domain_nr(pdev->bus) << 32) |
+ ((uint64_t)pdev->bus->number << 16) |
+ ((uint64_t)(pdev->devfn & 0xff) << 8) |
+ ((uint64_t)(pin - 1) & 3);
+}
+
+static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
+{
+ xen_hvm_evtchn_do_upcall();
+ return IRQ_HANDLED;
+}
+
+static int xen_allocate_irq(struct pci_dev *pdev)
+{
+ return request_irq(pdev->irq, do_hvm_evtchn_intr,
+ IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+ "xen-platform-pci", pdev);
+}
+
+static int platform_pci_resume(struct pci_dev *pdev)
+{
+ int err;
+ if (xen_have_vector_callback)
+ return 0;
+ err = xen_set_callback_via(callback_via);
+ if (err) {
+ dev_err(&pdev->dev, "platform_pci_resume failure!\n");
+ return err;
+ }
+ return 0;
+}
+
+static int __devinit platform_pci_init(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int i, ret;
+ long ioaddr, iolen;
+ long mmio_addr, mmio_len;
+ unsigned int max_nr_gframes;
+
+ i = pci_enable_device(pdev);
+ if (i)
+ return i;
+
+ ioaddr = pci_resource_start(pdev, 0);
+ iolen = pci_resource_len(pdev, 0);
+
+ mmio_addr = pci_resource_start(pdev, 1);
+ mmio_len = pci_resource_len(pdev, 1);
+
+ if (mmio_addr == 0 || ioaddr == 0) {
+ dev_err(&pdev->dev, "no resources found\n");
+ ret = -ENOENT;
+ goto pci_out;
+ }
+
+ if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
+		dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
+			mmio_len, mmio_addr);
+ ret = -EBUSY;
+ goto pci_out;
+ }
+
+ if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
+ dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
+ iolen, ioaddr);
+ ret = -EBUSY;
+ goto mem_out;
+ }
+
+ platform_mmio = mmio_addr;
+ platform_mmiolen = mmio_len;
+
+ if (!xen_have_vector_callback) {
+ ret = xen_allocate_irq(pdev);
+ if (ret) {
+ dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+ goto out;
+ }
+ callback_via = get_callback_via(pdev);
+ ret = xen_set_callback_via(callback_via);
+ if (ret) {
+ dev_warn(&pdev->dev, "Unable to set the evtchn callback "
+ "err=%d\n", ret);
+ goto out;
+ }
+ }
+
+ max_nr_gframes = gnttab_max_grant_frames();
+ xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ ret = gnttab_init();
+ if (ret)
+ goto out;
+ xenbus_probe(NULL);
+ return 0;
+
+out:
+ release_region(ioaddr, iolen);
+mem_out:
+ release_mem_region(mmio_addr, mmio_len);
+pci_out:
+ pci_disable_device(pdev);
+ return ret;
+}
+
+static struct pci_device_id platform_pci_tbl[] __devinitdata = {
+ {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+ {0,}
+};
+
+MODULE_DEVICE_TABLE(pci, platform_pci_tbl);
+
+static struct pci_driver platform_driver = {
+ .name = DRV_NAME,
+ .probe = platform_pci_init,
+ .id_table = platform_pci_tbl,
+#ifdef CONFIG_PM
+ .resume_early = platform_pci_resume,
+#endif
+};
+
+static int __init platform_pci_module_init(void)
+{
+	/* No unplug has been done and IGNORE hasn't been specified:
+	 * just return now. */
+ if (!xen_platform_pci_unplug)
+ return -ENODEV;
+
+ return pci_register_driver(&platform_driver);
+}
+
+module_init(platform_pci_module_init);
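When the vector callback is unavailable, get_callback_via() packs the platform device's PCI INTx line into the single 64-bit callback_via value handed to xen_set_callback_via(). A worked sketch of that encoding, using a made-up sample device (0000:00:03.0, pin INTA):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t domain = 0, bus = 0, devfn = 0x18, pin = 1;	/* sample */

	uint64_t via = ((uint64_t)0x01 << 56) |	/* PCI INTx identifier */
		       (domain << 32) |
		       (bus << 16) |
		       ((devfn & 0xff) << 8) |
		       ((pin - 1) & 3);

	printf("callback_via = 0x%016llx\n", (unsigned long long)via);
	return 0;	/* prints 0x0100000000001800 for this sample */
}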
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 88a60e0..ae5cb05 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -14,6 +14,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
diff --git a/drivers/xen/xen_acpi_memhotplug.c b/drivers/xen/xen_acpi_memhotplug.c
new file mode 100644
index 0000000..0c4af99
--- /dev/null
+++ b/drivers/xen/xen_acpi_memhotplug.c
@@ -0,0 +1,209 @@
+/*
+ * xen_acpi_memhotplug.c - interface to notify Xen on memory device hotadd
+ *
+ * Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/memory_hotplug.h>
+#include <acpi/acpi_drivers.h>
+#include <xen/interface/platform.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <xen/acpi.h>
+
+struct xen_hotmem_entry {
+ struct list_head hotmem_list;
+ uint64_t start;
+ uint64_t end;
+ uint32_t flags;
+ uint32_t pxm;
+};
+
+static struct xen_hotmem_list {
+	struct list_head list;
+	int entry_nr;
+} xen_hotmem;
+
+static DEFINE_SPINLOCK(xen_hotmem_lock);
+
+static int xen_hyper_addmem(struct xen_hotmem_entry *entry)
+{
+ int ret;
+
+ xen_platform_op_t op = {
+ .cmd = XENPF_mem_hotadd,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+ op.u.mem_add.spfn = entry->start >> PAGE_SHIFT;
+ op.u.mem_add.epfn = entry->end >> PAGE_SHIFT;
+ op.u.mem_add.flags = entry->flags;
+ op.u.mem_add.pxm = entry->pxm;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ return ret;
+}
+
+static int add_hotmem_entry(int pxm, uint64_t start,
+ uint64_t length, uint32_t flags)
+{
+ struct xen_hotmem_entry *entry;
+
+ if (pxm < 0 || !length)
+ return -EINVAL;
+
+ entry = kzalloc(sizeof(struct xen_hotmem_entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&entry->hotmem_list);
+ entry->start = start;
+ entry->end = start + length;
+ entry->flags = flags;
+ entry->pxm = pxm;
+
+ spin_lock(&xen_hotmem_lock);
+
+ list_add_tail(&entry->hotmem_list, &xen_hotmem.list);
+ xen_hotmem.entry_nr++;
+
+ spin_unlock(&xen_hotmem_lock);
+
+ return 0;
+}
+
+static int free_hotmem_entry(struct xen_hotmem_entry *entry)
+{
+ list_del(&entry->hotmem_list);
+ kfree(entry);
+
+ return 0;
+}
+
+static void xen_hotadd_mem_dpc(struct work_struct *work)
+{
+ struct list_head *elem, *tmp;
+ struct xen_hotmem_entry *entry;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&xen_hotmem_lock, flags);
+ list_for_each_safe(elem, tmp, &xen_hotmem.list) {
+ entry = list_entry(elem, struct xen_hotmem_entry, hotmem_list);
+ ret = xen_hyper_addmem(entry);
+ if (ret)
+			printk(KERN_WARNING "xen addmem failed with %d\n", ret);
+ free_hotmem_entry(entry);
+ xen_hotmem.entry_nr--;
+ }
+ spin_unlock_irqrestore(&xen_hotmem_lock, flags);
+}
+
+static DECLARE_WORK(xen_hotadd_mem_work, xen_hotadd_mem_dpc);
+
+static int xen_acpi_get_pxm(acpi_handle h)
+{
+ unsigned long long pxm;
+ acpi_status status;
+ acpi_handle handle;
+ acpi_handle phandle = h;
+
+ do {
+ handle = phandle;
+ status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
+ if (ACPI_SUCCESS(status))
+ return pxm;
+ status = acpi_get_parent(handle, &phandle);
+ } while (ACPI_SUCCESS(status));
+
+ return -1;
+}
+
+int xen_hotadd_memory(struct acpi_memory_device *mem_device)
+{
+ int pxm, result;
+ int num_enabled = 0;
+ struct acpi_memory_info *info;
+
+ if (!mem_device)
+ return -EINVAL;
+
+ pxm = xen_acpi_get_pxm(mem_device->device->handle);
+
+ if (pxm < 0)
+ return -EINVAL;
+
+	/*
+	 * Always return success to the ACPI driver, and notify the
+	 * hypervisor later, because the hypervisor will consume the
+	 * memory in the memory hotadd hypercall.
+	 */
+ list_for_each_entry(info, &mem_device->res_list, list) {
+ if (info->enabled) { /* just sanity check...*/
+ num_enabled++;
+ continue;
+ }
+		/*
+		 * If the memory block size is zero, ignore it:
+		 * don't attempt the memory hotplug flow below.
+		 */
+ if (!info->length)
+ continue;
+
+ result = add_hotmem_entry(pxm, info->start_addr,
+ info->length, 0);
+ if (result)
+ continue;
+ info->enabled = 1;
+ num_enabled++;
+ }
+
+ if (!num_enabled)
+ return -EINVAL;
+
+ schedule_work(&xen_hotadd_mem_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(xen_hotadd_memory);
+
+static int xen_hotadd_mem_init(void)
+{
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&xen_hotmem.list);
+ xen_hotmem.entry_nr = 0;
+
+ return 0;
+}
+
+static void xen_hotadd_mem_exit(void)
+{
+ flush_scheduled_work();
+}
+
+module_init(xen_hotadd_mem_init);
+module_exit(xen_hotadd_mem_exit);
+MODULE_LICENSE("GPL");
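XENPF_mem_hotadd works on page frame numbers, so xen_hyper_addmem() shifts each entry's byte range right by PAGE_SHIFT. A small worked sketch with an illustrative 1 GiB region starting at the 4 GiB boundary (4 KiB pages assumed):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, as on x86 */

int main(void)
{
	uint64_t start = 0x100000000ULL;	/* 4 GiB, sample value */
	uint64_t end   = start + 0x40000000ULL;	/* + 1 GiB */

	printf("spfn=0x%llx epfn=0x%llx\n",
	       (unsigned long long)(start >> PAGE_SHIFT),
	       (unsigned long long)(end >> PAGE_SHIFT));
	return 0;	/* spfn=0x100000 epfn=0x140000 */
}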
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index 5571f5b..8dca685 100644
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -5,3 +5,8 @@
xenbus-objs += xenbus_comms.o
xenbus-objs += xenbus_xs.o
xenbus-objs += xenbus_probe.o
+
+xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+xenbus-objs += $(xenbus-be-objs-y)
+
+obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 92a1ef8..89f2e42 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -49,6 +49,8 @@
[ XenbusStateConnected ] = "Connected",
[ XenbusStateClosing ] = "Closing",
[ XenbusStateClosed ] = "Closed",
+ [ XenbusStateReconfiguring ] = "Reconfiguring",
+ [ XenbusStateReconfigured ] = "Reconfigured",
};
return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
}
@@ -132,6 +134,64 @@
}
EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+static void xenbus_switch_fatal(struct xenbus_device *, int, int,
+ const char *, ...);
+
+static int
+__xenbus_switch_state(struct xenbus_device *dev,
+ enum xenbus_state state, int depth)
+{
+ /* We check whether the state is currently set to the given value, and
+ if not, then the state is set. We don't want to unconditionally
+ write the given state, because we don't want to fire watches
+ unnecessarily. Furthermore, if the node has gone, we don't write
+ to it, as the device will be tearing down, and we don't want to
+ resurrect that directory.
+
+ Note that, because of this cached value of our state, this
+ function will not take a caller's Xenstore transaction
+ (something it was trying to in the past) because dev->state
+ would not get reset if the transaction was aborted.
+ */
+
+ struct xenbus_transaction xbt;
+ int current_state;
+ int err, abort;
+
+ if (state == dev->state)
+ return 0;
+
+again:
+ abort = 1;
+
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_switch_fatal(dev, depth, err, "starting transaction");
+ return 0;
+ }
+
+	err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
+ if (err != 1)
+ goto abort;
+
+ err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
+ if (err) {
+ xenbus_switch_fatal(dev, depth, err, "writing new state");
+ goto abort;
+ }
+
+ abort = 0;
+abort:
+ err = xenbus_transaction_end(xbt, abort);
+ if (err) {
+ if (err == -EAGAIN && !abort)
+ goto again;
+ xenbus_switch_fatal(dev, depth, err, "ending transaction");
+ } else
+ dev->state = state;
+
+ return 0;
+}
/**
* xenbus_switch_state
@@ -144,42 +204,9 @@
*/
int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
{
- /* We check whether the state is currently set to the given value, and
- if not, then the state is set. We don't want to unconditionally
- write the given state, because we don't want to fire watches
- unnecessarily. Furthermore, if the node has gone, we don't write
- to it, as the device will be tearing down, and we don't want to
- resurrect that directory.
-
- Note that, because of this cached value of our state, this function
- will not work inside a Xenstore transaction (something it was
- trying to in the past) because dev->state would not get reset if
- the transaction was aborted.
-
- */
-
- int current_state;
- int err;
-
- if (state == dev->state)
- return 0;
-
- err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
-			   &current_state);
- if (err != 1)
- return 0;
-
- err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
- if (err) {
- if (state != XenbusStateClosing) /* Avoid looping */
- xenbus_dev_fatal(dev, err, "writing new state");
- return err;
- }
-
- dev->state = state;
-
- return 0;
+ return __xenbus_switch_state(dev, state, 0);
}
+
EXPORT_SYMBOL_GPL(xenbus_switch_state);
int xenbus_frontend_closed(struct xenbus_device *dev)
@@ -283,6 +310,23 @@
EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
/**
+ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps
+ * avoiding recursion within xenbus_switch_state.
+ */
+static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ xenbus_va_dev_error(dev, err, fmt, ap);
+ va_end(ap);
+
+ if (!depth)
+ __xenbus_switch_state(dev, XenbusStateClosing, 1);
+}
+
+/**
* xenbus_grant_ring
* @dev: xenbus device
* @ring_mfn: mfn of ring to grant
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 649fcdf..be4c7c3 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -49,31 +49,29 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
+#include <xen/platform_pci.h>
+#include <xen/hvm.h>
+
#include "xenbus_comms.h"
#include "xenbus_probe.h"
int xen_store_evtchn;
-EXPORT_SYMBOL(xen_store_evtchn);
+EXPORT_SYMBOL_GPL(xen_store_evtchn);
struct xenstore_domain_interface *xen_store_interface;
+EXPORT_SYMBOL_GPL(xen_store_interface);
+
static unsigned long xen_store_mfn;
static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
-static void wait_for_devices(struct xenbus_driver *xendrv);
-
-static int xenbus_probe_frontend(const char *type, const char *name);
-
-static void xenbus_dev_shutdown(struct device *_dev);
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
-static int xenbus_dev_resume(struct device *dev);
-
/* If something in array of ids matches this device, return it. */
static const struct xenbus_device_id *
match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -94,34 +92,7 @@
return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
}
-
-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
-{
- struct xenbus_device *dev = to_xenbus_device(_dev);
-
- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
- return -ENOMEM;
-
- return 0;
-}
-
-/* device/<type>/<id> => <type>-<id> */
-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
-{
- nodename = strchr(nodename, '/');
- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
- return -EINVAL;
- }
-
- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
- if (!strchr(bus_id, '/')) {
- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
- return -EINVAL;
- }
- *strchr(bus_id, '/') = '-';
- return 0;
-}
+EXPORT_SYMBOL_GPL(xenbus_match);
static void free_otherend_details(struct xenbus_device *dev)
@@ -141,7 +112,28 @@
}
-int read_otherend_details(struct xenbus_device *xendev,
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+ free_otherend_watch(dev);
+ free_otherend_details(dev);
+
+ return drv->read_otherend_details(dev);
+}
+
+static int watch_otherend(struct xenbus_device *dev)
+{
+ struct xen_bus_type *bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
+
+ return xenbus_watch_pathfmt(dev, &dev->otherend_watch, bus->otherend_changed,
+ "%s/%s", dev->otherend, "state");
+}
+
+
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
char *id_node, char *path_node)
{
int err = xenbus_gather(XBT_NIL, xendev->nodename,
@@ -166,39 +158,11 @@
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
-
-static int read_backend_details(struct xenbus_device *xendev)
-{
- return read_otherend_details(xendev, "backend-id", "backend");
-}
-
-static struct device_attribute xenbus_dev_attrs[] = {
- __ATTR_NULL
-};
-
-/* Bus type for frontend drivers. */
-static struct xen_bus_type xenbus_frontend = {
- .root = "device",
- .levels = 2, /* device/type/<id> */
- .get_bus_id = frontend_bus_id,
- .probe = xenbus_probe_frontend,
- .bus = {
- .name = "xen",
- .match = xenbus_match,
- .uevent = xenbus_uevent,
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
- .shutdown = xenbus_dev_shutdown,
- .dev_attrs = xenbus_dev_attrs,
-
- .suspend = xenbus_dev_suspend,
- .resume = xenbus_dev_resume,
- },
-};
-
-static void otherend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len,
+ int ignore_on_shutdown)
{
struct xenbus_device *dev =
container_of(watch, struct xenbus_device, otherend_watch);
@@ -226,11 +190,7 @@
* work that can fail e.g., when the rootfs is gone.
*/
if (system_state > SYSTEM_RUNNING) {
- struct xen_bus_type *bus = bus;
- bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
- /* If we're frontend, drive the state machine to Closed. */
- /* This should cause the backend to release our resources. */
- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
+ if (ignore_on_shutdown && (state == XenbusStateClosing))
xenbus_frontend_closed(dev);
return;
}
@@ -238,25 +198,7 @@
if (drv->otherend_changed)
drv->otherend_changed(dev, state);
}
-
-
-static int talk_to_otherend(struct xenbus_device *dev)
-{
- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
-
- free_otherend_watch(dev);
- free_otherend_details(dev);
-
- return drv->read_otherend_details(dev);
-}
-
-
-static int watch_otherend(struct xenbus_device *dev)
-{
- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
- "%s/%s", dev->otherend, "state");
-}
-
+EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
int xenbus_dev_probe(struct device *_dev)
{
@@ -300,8 +242,9 @@
fail:
xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
xenbus_switch_state(dev, XenbusStateClosed);
- return -ENODEV;
+ return err;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_probe);
int xenbus_dev_remove(struct device *_dev)
{
@@ -319,8 +262,9 @@
xenbus_switch_state(dev, XenbusStateClosed);
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_remove);
-static void xenbus_dev_shutdown(struct device *_dev)
+void xenbus_dev_shutdown(struct device *_dev)
{
struct xenbus_device *dev = to_xenbus_device(_dev);
unsigned long timeout = 5*HZ;
@@ -341,6 +285,7 @@
out:
put_device(&dev->dev);
}
+EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus,
@@ -354,25 +299,7 @@
return driver_register(&drv->driver);
}
-
-int __xenbus_register_frontend(struct xenbus_driver *drv,
- struct module *owner, const char *mod_name)
-{
- int ret;
-
- drv->read_otherend_details = read_backend_details;
-
- ret = xenbus_register_driver_common(drv, &xenbus_frontend,
- owner, mod_name);
- if (ret)
- return ret;
-
- /* If this driver is loaded as a module wait for devices to attach. */
- wait_for_devices(drv);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
void xenbus_unregister_driver(struct xenbus_driver *drv)
{
@@ -543,24 +470,7 @@
kfree(xendev);
return err;
}
-
-/* device/<typename>/<name> */
-static int xenbus_probe_frontend(const char *type, const char *name)
-{
- char *nodename;
- int err;
-
- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
- xenbus_frontend.root, type, name);
- if (!nodename)
- return -ENOMEM;
-
- DPRINTK("%s", nodename);
-
- err = xenbus_probe_node(&xenbus_frontend, type, nodename);
- kfree(nodename);
- return err;
-}
+EXPORT_SYMBOL_GPL(xenbus_probe_node);
static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
{
@@ -574,10 +484,11 @@
return PTR_ERR(dir);
for (i = 0; i < dir_n; i++) {
- err = bus->probe(type, dir[i]);
+ err = bus->probe(bus, type, dir[i]);
if (err)
break;
}
+
kfree(dir);
return err;
}
@@ -597,9 +508,11 @@
if (err)
break;
}
+
kfree(dir);
return err;
}
+EXPORT_SYMBOL_GPL(xenbus_probe_devices);
static unsigned int char_count(const char *str, char c)
{
@@ -662,54 +575,37 @@
}
EXPORT_SYMBOL_GPL(xenbus_dev_changed);
-static void frontend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- DPRINTK("");
-
- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
-}
-
-/* We watch for devices appearing and vanishing. */
-static struct xenbus_watch fe_watch = {
- .node = "device",
- .callback = frontend_changed,
-};
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+int xenbus_dev_suspend(struct device *dev)
{
int err = 0;
struct xenbus_driver *drv;
- struct xenbus_device *xdev;
+ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev);
- DPRINTK("");
+ DPRINTK("%s", xdev->nodename);
if (dev->driver == NULL)
return 0;
drv = to_xenbus_driver(dev->driver);
- xdev = container_of(dev, struct xenbus_device, dev);
if (drv->suspend)
- err = drv->suspend(xdev, state);
+ err = drv->suspend(xdev);
if (err)
printk(KERN_WARNING
"xenbus: suspend %s failed: %i\n", dev_name(dev), err);
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
-static int xenbus_dev_resume(struct device *dev)
+int xenbus_dev_resume(struct device *dev)
{
int err;
struct xenbus_driver *drv;
- struct xenbus_device *xdev;
+ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev);
- DPRINTK("");
+ DPRINTK("%s", xdev->nodename);
if (dev->driver == NULL)
return 0;
-
drv = to_xenbus_driver(dev->driver);
- xdev = container_of(dev, struct xenbus_device, dev);
-
err = talk_to_otherend(xdev);
if (err) {
printk(KERN_WARNING
@@ -740,6 +636,15 @@
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_resume);
+
+int xenbus_dev_cancel(struct device *dev)
+{
+ /* Do nothing */
+ DPRINTK("cancel");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_cancel);
/* A flag to determine if xenstored is 'ready' (i.e. has started) */
int xenstored_ready = 0;
@@ -766,59 +671,96 @@
void xenbus_probe(struct work_struct *unused)
{
- BUG_ON((xenstored_ready <= 0));
-
- /* Enumerate devices in xenstore and watch for changes. */
- xenbus_probe_devices(&xenbus_frontend);
- register_xenbus_watch(&fe_watch);
- xenbus_backend_probe_and_watch();
+ xenstored_ready = 1;
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
+EXPORT_SYMBOL_GPL(xenbus_probe);
-static int __init xenbus_probe_init(void)
+static int __init xenbus_probe_initcall(void)
+{
+ if (!xen_domain())
+ return -ENODEV;
+
+ if (xen_initial_domain() || xen_hvm_domain())
+ return 0;
+
+ xenbus_probe(NULL);
+ return 0;
+}
+
+device_initcall(xenbus_probe_initcall);
+
+static int __init xenbus_init(void)
{
int err = 0;
+ unsigned long page = 0;
DPRINTK("");
err = -ENODEV;
if (!xen_domain())
- goto out_error;
-
- /* Register ourselves with the kernel bus subsystem */
- err = bus_register(&xenbus_frontend.bus);
- if (err)
- goto out_error;
-
- err = xenbus_backend_bus_register();
- if (err)
- goto out_unreg_front;
+ return err;
/*
* Domain0 doesn't have a store_evtchn or store_mfn yet.
*/
if (xen_initial_domain()) {
- /* dom0 not yet supported */
+ struct evtchn_alloc_unbound alloc_unbound;
+
+ /* Allocate Xenstore page */
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ goto out_error;
+
+ xen_store_mfn = xen_start_info->store_mfn =
+ pfn_to_mfn(virt_to_phys((void *)page) >>
+ PAGE_SHIFT);
+
+ /* Next allocate a local port which xenstored can bind to */
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = 0;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (err == -ENOSYS)
+ goto out_error;
+
+ BUG_ON(err);
+ xen_store_evtchn = xen_start_info->store_evtchn =
+ alloc_unbound.port;
+
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
} else {
xenstored_ready = 1;
- xen_store_evtchn = xen_start_info->store_evtchn;
- xen_store_mfn = xen_start_info->store_mfn;
+ if (xen_hvm_domain()) {
+ uint64_t v = 0;
+ err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+ if (err)
+ goto out_error;
+ xen_store_evtchn = (int)v;
+ err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+ if (err)
+ goto out_error;
+ xen_store_mfn = (unsigned long)v;
+ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+ } else {
+ xen_store_evtchn = xen_start_info->store_evtchn;
+ xen_store_mfn = xen_start_info->store_mfn;
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
+ xenstored_ready = 1;
+ }
}
- xen_store_interface = mfn_to_virt(xen_store_mfn);
/* Initialize the interface to xenstore. */
err = xs_init();
if (err) {
printk(KERN_WARNING
"XENBUS: Error initializing xenstore comms: %i\n", err);
- goto out_unreg_back;
+ goto out_error;
}
- if (!xen_initial_domain())
- xenbus_probe(NULL);
-
#ifdef CONFIG_XEN_COMPAT_XENFS
/*
* Create xenfs mountpoint in /proc for compatibility with
@@ -829,128 +771,13 @@
return 0;
- out_unreg_back:
- xenbus_backend_bus_unregister();
-
- out_unreg_front:
- bus_unregister(&xenbus_frontend.bus);
-
out_error:
+ if (page != 0)
+ free_page(page);
+
return err;
}
-postcore_initcall(xenbus_probe_init);
+postcore_initcall(xenbus_init);
MODULE_LICENSE("GPL");
-
-static int is_device_connecting(struct device *dev, void *data)
-{
- struct xenbus_device *xendev = to_xenbus_device(dev);
- struct device_driver *drv = data;
- struct xenbus_driver *xendrv;
-
- /*
- * A device with no driver will never connect. We care only about
- * devices which should currently be in the process of connecting.
- */
- if (!dev->driver)
- return 0;
-
- /* Is this search limited to a particular driver? */
- if (drv && (dev->driver != drv))
- return 0;
-
- xendrv = to_xenbus_driver(dev->driver);
- return (xendev->state < XenbusStateConnected ||
- (xendev->state == XenbusStateConnected &&
- xendrv->is_ready && !xendrv->is_ready(xendev)));
-}
-
-static int exists_connecting_device(struct device_driver *drv)
-{
- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- is_device_connecting);
-}
-
-static int print_device_status(struct device *dev, void *data)
-{
- struct xenbus_device *xendev = to_xenbus_device(dev);
- struct device_driver *drv = data;
-
- /* Is this operation limited to a particular driver? */
- if (drv && (dev->driver != drv))
- return 0;
-
- if (!dev->driver) {
- /* Information only: is this too noisy? */
- printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
- xendev->nodename);
- } else if (xendev->state < XenbusStateConnected) {
- enum xenbus_state rstate = XenbusStateUnknown;
- if (xendev->otherend)
- rstate = xenbus_read_driver_state(xendev->otherend);
- printk(KERN_WARNING "XENBUS: Timeout connecting "
- "to device: %s (local state %d, remote state %d)\n",
- xendev->nodename, xendev->state, rstate);
- }
-
- return 0;
-}
-
-/* We only wait for device setup after most initcalls have run. */
-static int ready_to_wait_for_devices;
-
-/*
- * On a 5-minute timeout, wait for all devices currently configured. We need
- * to do this to guarantee that the filesystems and / or network devices
- * needed for boot are available, before we can allow the boot to proceed.
- *
- * This needs to be on a late_initcall, to happen after the frontend device
- * drivers have been initialised, but before the root fs is mounted.
- *
- * A possible improvement here would be to have the tools add a per-device
- * flag to the store entry, indicating whether it is needed at boot time.
- * This would allow people who knew what they were doing to accelerate their
- * boot slightly, but of course needs tools or manual intervention to set up
- * those flags correctly.
- */
-static void wait_for_devices(struct xenbus_driver *xendrv)
-{
- unsigned long start = jiffies;
- struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
- unsigned int seconds_waited = 0;
-
- if (!ready_to_wait_for_devices || !xen_domain())
- return;
-
- while (exists_connecting_device(drv)) {
- if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
- if (!seconds_waited)
- printk(KERN_WARNING "XENBUS: Waiting for "
- "devices to initialise: ");
- seconds_waited += 5;
- printk("%us...", 300 - seconds_waited);
- if (seconds_waited == 300)
- break;
- }
-
- schedule_timeout_interruptible(HZ/10);
- }
-
- if (seconds_waited)
- printk("\n");
-
- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- print_device_status);
-}
-
-#ifndef MODULE
-static int __init boot_wait_for_devices(void)
-{
- ready_to_wait_for_devices = 1;
- wait_for_devices(NULL);
- return 0;
-}
-
-late_initcall(boot_wait_for_devices);
-#endif
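With frontend enumeration moved out of xenbus_probe(), anything that must wait for xenstore to come up now hooks the xenstore_chain through register_xenstore_notifier(). A rough sketch of that pattern (the handler name is invented; the backend probe file later in this series uses exactly this shape):

#include <linux/notifier.h>
#include <xen/xenbus.h>

static int my_xenstore_ready(struct notifier_block *nb,
			     unsigned long event, void *data)
{
	/* xenstore is up: safe to enumerate devices and register watches */
	return NOTIFY_DONE;
}

static struct notifier_block my_xenstore_nb = {
	.notifier_call = my_xenstore_ready,
};

/* register_xenstore_notifier(&my_xenstore_nb) from an initcall */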
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
index 6c5e318..4019187 100644
--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -36,26 +36,13 @@
#define XEN_BUS_ID_SIZE 20
-#ifdef CONFIG_XEN_BACKEND
-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
-extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
-extern void xenbus_backend_probe_and_watch(void);
-extern int xenbus_backend_bus_register(void);
-extern void xenbus_backend_bus_unregister(void);
-#else
-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_probe_and_watch(void) {}
-static inline int xenbus_backend_bus_register(void) { return 0; }
-static inline void xenbus_backend_bus_unregister(void) {}
-#endif
-
struct xen_bus_type
{
char *root;
unsigned int levels;
int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
- int (*probe)(const char *type, const char *dir);
+ int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir);
+ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, unsigned int len);
struct bus_type bus;
};
@@ -73,4 +60,17 @@
extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+extern void xenbus_dev_shutdown(struct device *_dev);
+
+extern int xenbus_dev_suspend(struct device *dev);
+extern int xenbus_dev_resume(struct device *dev);
+extern int xenbus_dev_cancel(struct device *dev);
+
+extern void xenbus_otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len,
+ int ignore_on_shutdown);
+
+extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node);
+
#endif
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
new file mode 100644
index 0000000..9b9dd36
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -0,0 +1,293 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
+static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ int domid, err;
+ const char *devid, *type, *frontend;
+ unsigned int typelen;
+
+ type = strchr(nodename, '/');
+ if (!type)
+ return -EINVAL;
+ type++;
+ typelen = strcspn(type, "/");
+ if (!typelen || type[typelen] != '/')
+ return -EINVAL;
+
+ devid = strrchr(nodename, '/') + 1;
+
+ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
+ "frontend", NULL, &frontend,
+ NULL);
+ if (err)
+ return err;
+ if (strlen(frontend) == 0)
+ err = -ERANGE;
+ if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
+ err = -ENOENT;
+ kfree(frontend);
+
+ if (err)
+ return err;
+
+ if (snprintf(bus_id, XEN_BUS_ID_SIZE,
+ "%.*s-%i-%s", typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
+ return -ENOSPC;
+ return 0;
+}
+
+static int xenbus_uevent_backend(struct device *dev,
+ struct kobj_uevent_env *env)
+{
+ struct xenbus_device *xdev;
+ struct xenbus_driver *drv;
+ struct xen_bus_type *bus;
+
+ DPRINTK("");
+
+ if (dev == NULL)
+ return -ENODEV;
+
+	xdev = to_xenbus_device(dev);
+	bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
+
+ /* stuff we want to pass to /sbin/hotplug */
+ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
+ return -ENOMEM;
+
+ if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename))
+ return -ENOMEM;
+
+ if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
+ return -ENOMEM;
+
+ if (dev->driver) {
+ drv = to_xenbus_driver(dev->driver);
+ if (drv && drv->uevent)
+ return drv->uevent(xdev, env);
+ }
+
+ return 0;
+}
+
+/* backend/<typename>/<frontend-uuid>/<name> */
+static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
+ const char *dir,
+ const char *type,
+ const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s\n", nodename);
+
+ err = xenbus_probe_node(bus, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+/* backend/<typename>/<frontend-domid> */
+static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, const char *domid)
+{
+ char *nodename;
+ int err = 0;
+ char **dir;
+ unsigned int i, dir_n = 0;
+
+ DPRINTK("");
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
+ if (!nodename)
+ return -ENOMEM;
+
+ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
+ if (IS_ERR(dir)) {
+ kfree(nodename);
+ return PTR_ERR(dir);
+ }
+
+ for (i = 0; i < dir_n; i++) {
+ err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]);
+ if (err)
+ break;
+ }
+ kfree(dir);
+ kfree(nodename);
+ return err;
+}
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ xenbus_otherend_changed(watch, vec, len, 0);
+}
+
+static struct device_attribute xenbus_backend_dev_attrs[] = {
+ __ATTR_NULL
+};
+
+static struct xen_bus_type xenbus_backend = {
+ .root = "backend",
+ .levels = 3, /* backend/type/<frontend>/<id> */
+ .get_bus_id = backend_bus_id,
+ .probe = xenbus_probe_backend,
+ .otherend_changed = frontend_changed,
+ .bus = {
+ .name = "xen-backend",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent_backend,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_attrs = xenbus_backend_dev_attrs,
+ },
+};
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+}
+
+static struct xenbus_watch be_watch = {
+ .node = "backend",
+ .callback = backend_changed,
+};
+
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+ int rc, val;
+
+ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
+ if (rc != 1)
+ val = 0; /* no online node present */
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+int __xenbus_register_backend(struct xenbus_driver *drv,
+ struct module *owner, const char *mod_name)
+{
+ drv->read_otherend_details = read_frontend_details;
+
+ return xenbus_register_driver_common(drv, &xenbus_backend,
+ owner, mod_name);
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
+
+static int backend_probe_and_watch(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_backend);
+ register_xenbus_watch(&be_watch);
+
+ return NOTIFY_DONE;
+}
+
+static int __init xenbus_probe_backend_init(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = backend_probe_and_watch
+ };
+ int err;
+
+ DPRINTK("");
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_backend.bus);
+ if (err)
+ return err;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+subsys_initcall(xenbus_probe_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
new file mode 100644
index 0000000..f91b310
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -0,0 +1,298 @@
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/xen.h>
+#include <xen/platform_pci.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+/* device/<type>/<id> => <type>-<id> */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ nodename = strchr(nodename, '/');
+ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+ return -EINVAL;
+ }
+
+ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+ if (!strchr(bus_id, '/')) {
+ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+ return -EINVAL;
+ }
+ *strchr(bus_id, '/') = '-';
+ return 0;
+}
+
+/* device/<typename>/<name> */
+static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s", nodename);
+
+ err = xenbus_probe_node(bus, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+static int xenbus_uevent_frontend(struct device *_dev, struct kobj_uevent_env *env)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+
+ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ xenbus_otherend_changed(watch, vec, len, 1);
+}
+
+static struct device_attribute xenbus_frontend_dev_attrs[] = {
+ __ATTR_NULL
+};
+
+static const struct dev_pm_ops xenbus_pm_ops = {
+ .suspend = xenbus_dev_suspend,
+ .resume = xenbus_dev_resume,
+ .freeze = xenbus_dev_suspend,
+ .thaw = xenbus_dev_cancel,
+ .restore = xenbus_dev_resume,
+};
+
+static struct xen_bus_type xenbus_frontend = {
+ .root = "device",
+ .levels = 2, /* device/type/<id> */
+ .get_bus_id = frontend_bus_id,
+ .probe = xenbus_probe_frontend,
+ .otherend_changed = backend_changed,
+ .bus = {
+ .name = "xen",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent_frontend,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_attrs = xenbus_frontend_dev_attrs,
+
+ .pm = &xenbus_pm_ops,
+ },
+};
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+
+/* We watch for devices appearing and vanishing. */
+static struct xenbus_watch fe_watch = {
+ .node = "device",
+ .callback = frontend_changed,
+};
+
+static int read_backend_details(struct xenbus_device *xendev)
+{
+ return xenbus_read_otherend_details(xendev, "backend-id", "backend");
+}
+
+static int is_device_connecting(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+ struct xenbus_driver *xendrv;
+
+ /*
+ * A device with no driver will never connect. We care only about
+ * devices which should currently be in the process of connecting.
+ */
+ if (!dev->driver)
+ return 0;
+
+ /* Is this search limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ xendrv = to_xenbus_driver(dev->driver);
+ return (xendev->state < XenbusStateConnected ||
+ (xendev->state == XenbusStateConnected &&
+ xendrv->is_ready && !xendrv->is_ready(xendev)));
+}
+
+static int exists_connecting_device(struct device_driver *drv)
+{
+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ is_device_connecting);
+}
+
+static int print_device_status(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+
+ /* Is this operation limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ if (!dev->driver) {
+ /* Information only: is this too noisy? */
+ printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
+ xendev->nodename);
+ } else if (xendev->state < XenbusStateConnected) {
+ enum xenbus_state rstate = XenbusStateUnknown;
+ if (xendev->otherend)
+ rstate = xenbus_read_driver_state(xendev->otherend);
+ printk(KERN_WARNING "XENBUS: Timeout connecting "
+ "to device: %s (local state %d, remote state %d)\n",
+ xendev->nodename, xendev->state, rstate);
+ }
+
+ return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+/*
+ * On a 5-minute timeout, wait for all devices currently configured. We need
+ * to do this to guarantee that the filesystems and / or network devices
+ * needed for boot are available, before we can allow the boot to proceed.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+ unsigned long start = jiffies;
+ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+ unsigned int seconds_waited = 0;
+
+ if (!ready_to_wait_for_devices || !xen_domain())
+ return;
+
+ while (exists_connecting_device(drv)) {
+ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+ if (!seconds_waited)
+ printk(KERN_WARNING "XENBUS: Waiting for "
+ "devices to initialise: ");
+ seconds_waited += 5;
+ printk("%us...", 300 - seconds_waited);
+ if (seconds_waited == 300)
+ break;
+ }
+
+ schedule_timeout_interruptible(HZ/10);
+ }
+
+ if (seconds_waited)
+ printk("\n");
+
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ print_device_status);
+}
+
+int __xenbus_register_frontend(struct xenbus_driver *drv,
+ struct module *owner, const char *mod_name)
+{
+ int ret;
+
+ drv->read_otherend_details = read_backend_details;
+
+ ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+ owner, mod_name);
+ if (ret)
+ return ret;
+
+ /* If this driver is loaded as a module wait for devices to attach. */
+ wait_for_devices(drv);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+
+static int frontend_probe_and_watch(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_frontend);
+ register_xenbus_watch(&fe_watch);
+
+ return NOTIFY_DONE;
+}
+
+
+static int __init xenbus_probe_frontend_init(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = frontend_probe_and_watch
+ };
+ int err;
+
+ DPRINTK("");
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_frontend.bus);
+ if (err)
+ return err;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+subsys_initcall(xenbus_probe_frontend_init);
+
+#ifndef MODULE
+static int __init boot_wait_for_devices(void)
+{
+ if (xen_hvm_domain() && !xen_platform_pci_unplug)
+ return -ENODEV;
+
+ ready_to_wait_for_devices = 1;
+ wait_for_devices(NULL);
+ return 0;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
+
+MODULE_LICENSE("GPL");
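
For illustration, a minimal frontend driver registration against this bus might
look as follows. This is a sketch only, assuming the 2.6.32-era struct
xenbus_driver layout and the xenbus_register_frontend() wrapper from
include/xen/xenbus.h; the "vexample" device type and all examplefront_* names
are hypothetical:

    #include <linux/module.h>
    #include <xen/xenbus.h>

    /* Hypothetical device type appearing under device/vexample/<id>. */
    static const struct xenbus_device_id examplefront_ids[] = {
        { "vexample" },
        { "" }
    };

    static int examplefront_probe(struct xenbus_device *dev,
                                  const struct xenbus_device_id *id)
    {
        /* dev->otherend already names the backend path, filled in by
         * read_backend_details() before probe runs. */
        return 0;
    }

    static void examplefront_otherend_changed(struct xenbus_device *dev,
                                              enum xenbus_state backend_state)
    {
        /* Invoked via backend_changed()/xenbus_otherend_changed()
         * whenever the backend flips its state node. */
    }

    static struct xenbus_driver examplefront_driver = {
        .name             = "examplefront",
        .ids              = examplefront_ids,
        .probe            = examplefront_probe,
        .otherend_changed = examplefront_otherend_changed,
    };

    static int __init examplefront_init(void)
    {
        /* Expands to __xenbus_register_frontend(), which also waits
         * for devices to connect when built as a module. */
        return xenbus_register_frontend(&examplefront_driver);
    }
    module_init(examplefront_init);
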
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 7b547f5..5534690 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -76,6 +76,14 @@
/*
* Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
* response_mutex is never taken simultaneously with the other three.
+ *
+ * transaction_mutex must be held before incrementing
+ * transaction_count. The mutex is held when a suspend is in
+ * progress to prevent new transactions starting.
+ *
+ * When transaction_count is decremented to zero the wait queue
+ * must be woken up, since the suspend code waits for the count
+ * to reach zero.
*/
/* One request at a time. */
@@ -85,7 +93,9 @@
struct mutex response_mutex;
/* Protect transactions against save/restore. */
- struct rw_semaphore transaction_mutex;
+ struct mutex transaction_mutex;
+ atomic_t transaction_count;
+ wait_queue_head_t transaction_wq;
/* Protect watch (de)register against save/restore. */
struct rw_semaphore watch_mutex;
@@ -157,6 +167,31 @@
return body;
}
+static void transaction_start(void)
+{
+ mutex_lock(&xs_state.transaction_mutex);
+ atomic_inc(&xs_state.transaction_count);
+ mutex_unlock(&xs_state.transaction_mutex);
+}
+
+static void transaction_end(void)
+{
+ if (atomic_dec_and_test(&xs_state.transaction_count))
+ wake_up(&xs_state.transaction_wq);
+}
+
+static void transaction_suspend(void)
+{
+ mutex_lock(&xs_state.transaction_mutex);
+ wait_event(xs_state.transaction_wq,
+ atomic_read(&xs_state.transaction_count) == 0);
+}
+
+static void transaction_resume(void)
+{
+ mutex_unlock(&xs_state.transaction_mutex);
+}
+
void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
{
void *ret;
@@ -164,7 +199,7 @@
int err;
if (req_msg.type == XS_TRANSACTION_START)
- down_read(&xs_state.transaction_mutex);
+ transaction_start();
mutex_lock(&xs_state.request_mutex);
@@ -180,7 +215,7 @@
if ((msg->type == XS_TRANSACTION_END) ||
((req_msg.type == XS_TRANSACTION_START) &&
(msg->type == XS_ERROR)))
- up_read(&xs_state.transaction_mutex);
+ transaction_end();
return ret;
}
@@ -432,11 +467,11 @@
{
char *id_str;
- down_read(&xs_state.transaction_mutex);
+ transaction_start();
id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
if (IS_ERR(id_str)) {
- up_read(&xs_state.transaction_mutex);
+ transaction_end();
return PTR_ERR(id_str);
}
@@ -461,7 +496,7 @@
err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
- up_read(&xs_state.transaction_mutex);
+ transaction_end();
return err;
}
@@ -662,7 +697,7 @@
void xs_suspend(void)
{
- down_write(&xs_state.transaction_mutex);
+ transaction_suspend();
down_write(&xs_state.watch_mutex);
mutex_lock(&xs_state.request_mutex);
mutex_lock(&xs_state.response_mutex);
@@ -677,7 +712,7 @@
mutex_unlock(&xs_state.response_mutex);
mutex_unlock(&xs_state.request_mutex);
- up_write(&xs_state.transaction_mutex);
+ transaction_resume();
/* No need for watches_lock: the watch_mutex is sufficient. */
list_for_each_entry(watch, &watches, list) {
@@ -693,7 +728,7 @@
mutex_unlock(&xs_state.response_mutex);
mutex_unlock(&xs_state.request_mutex);
up_write(&xs_state.watch_mutex);
- up_write(&xs_state.transaction_mutex);
+ mutex_unlock(&xs_state.transaction_mutex);
}
static int xenwatch_thread(void *unused)
@@ -843,8 +878,10 @@
mutex_init(&xs_state.request_mutex);
mutex_init(&xs_state.response_mutex);
- init_rwsem(&xs_state.transaction_mutex);
+ mutex_init(&xs_state.transaction_mutex);
init_rwsem(&xs_state.watch_mutex);
+ atomic_set(&xs_state.transaction_count, 0);
+ init_waitqueue_head(&xs_state.transaction_wq);
/* Initialize the shared memory rings to talk to xenstored */
err = xb_init_comms();
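
The point of the rwsem-to-mutex conversion above is that concurrent
transactions still run in parallel (transaction_mutex is held only across the
atomic_inc), while xs_suspend() can block new transactions and then wait for
transaction_count to drain. A hedged sketch of the client-side lifecycle those
counters track, using the existing xenbus_transaction_start/end API (the
"example" node written here is illustrative):

    static int example_write_node(struct xenbus_device *dev)
    {
        struct xenbus_transaction xbt;
        int err;

        err = xenbus_transaction_start(&xbt);  /* transaction_start(): count++ */
        if (err)
            return err;

        err = xenbus_printf(xbt, dev->nodename, "example", "%d", 1);
        if (err) {
            xenbus_transaction_end(xbt, 1);    /* abort; transaction_end(): count-- */
            return err;
        }

        return xenbus_transaction_end(xbt, 0); /* commit; count--, may wake suspend */
    }
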
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 25275c3..4fde944 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_XENFS) += xenfs.o
-xenfs-objs = super.o xenbus.o
\ No newline at end of file
+xenfs-y = super.o xenbus.o privcmd.o
+xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
new file mode 100644
index 0000000..f80be7f
--- /dev/null
+++ b/drivers/xen/xenfs/privcmd.c
@@ -0,0 +1,404 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; it's up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+ struct list_head *pos,
+ int (*fn)(void *data, void *state),
+ void *state)
+{
+ void *pagedata;
+ unsigned pageidx;
+ int ret = 0;
+
+ BUG_ON(size > PAGE_SIZE);
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* hush, gcc */
+
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page;
+ pos = pos->next;
+ page = list_entry(pos, struct page, lru);
+ pagedata = page_address(page);
+ pageidx = 0;
+ }
+
+ ret = (*fn)(pagedata + pageidx, state);
+ if (ret)
+ break;
+ pageidx += size;
+ }
+
+ return ret;
+}
+
+struct mmap_mfn_state {
+ unsigned long va;
+ struct vm_area_struct *vma;
+ domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+ struct privcmd_mmap_entry *msg = data;
+ struct mmap_mfn_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ int rc;
+
+ /* Do not allow range to wrap the address space. */
+ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+ return -EINVAL;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg->va != st->va) ||
+ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+ return -EINVAL;
+
+ rc = xen_remap_domain_mfn_range(vma,
+ msg->va & PAGE_MASK,
+ msg->mfn, msg->npages,
+ vma->vm_page_prot,
+ st->domain);
+ if (rc < 0)
+ return rc;
+
+ st->va += msg->npages << PAGE_SHIFT;
+
+ return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+ struct privcmd_mmap mmapcmd;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+ LIST_HEAD(pagelist);
+ struct mmap_mfn_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ rc = gather_array(&pagelist,
+ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ mmapcmd.entry);
+
+ if (rc || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ {
+ struct page *page = list_first_entry(&pagelist,
+ struct page, lru);
+ struct privcmd_mmap_entry *msg = page_address(page);
+
+ vma = find_vma(mm, msg->va);
+ rc = -EINVAL;
+
+ if (!vma || (msg->va != vma->vm_start) ||
+ !privcmd_enforce_singleshot_mapping(vma))
+ goto out_up;
+ }
+
+ state.va = vma->vm_start;
+ state.vma = vma;
+ state.domain = mmapcmd.dom;
+
+ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ &pagelist,
+ mmap_mfn_range, &state);
+
+
+out_up:
+ up_write(&mm->mmap_sem);
+
+out:
+ free_page_list(&pagelist);
+
+ return rc;
+}
+
+struct mmap_batch_state {
+ domid_t domain;
+ unsigned long va;
+ struct vm_area_struct *vma;
+ int err;
+
+ xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+ st->vma->vm_page_prot, st->domain) < 0) {
+ *mfnp |= 0xf0000000U;
+ st->err++;
+ }
+ st->va += PAGE_SIZE;
+
+ return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ put_user(*mfnp, st->user++);
+
+ return 0;
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+ int ret;
+ struct privcmd_mmapbatch m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long nr_pages;
+ LIST_HEAD(pagelist);
+ struct mmap_batch_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&m, udata, sizeof(m)))
+ return -EFAULT;
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+ m.arr);
+
+ if (ret || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ ret = -EINVAL;
+ if (!vma ||
+ vma->vm_ops != &privcmd_vm_ops ||
+ (m.addr != vma->vm_start) ||
+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+ !privcmd_enforce_singleshot_mapping(vma)) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ state.domain = m.dom;
+ state.vma = vma;
+ state.va = m.addr;
+ state.err = 0;
+
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_batch_fn, &state);
+
+ up_write(&mm->mmap_sem);
+
+ if (state.err > 0) {
+ ret = 0;
+
+ state.user = m.arr;
+ traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist,
+ mmap_return_errors, &state);
+ }
+
+out:
+ free_page_list(&pagelist);
+
+ return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL:
+ ret = privcmd_ioctl_hypercall(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAP:
+ ret = privcmd_ioctl_mmap(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ ret = privcmd_ioctl_mmap_batch(udata);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+ vma, vma->vm_start, vma->vm_end,
+ vmf->pgoff, vmf->virtual_address);
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Unsupported for auto-translated guests. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+
+ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+const struct file_operations privcmd_file_ops = {
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
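
From userspace the three ioctls above are reached through the privcmd node
that the xenfs changes below expose. A hedged sketch of issuing a hypercall
that way, assuming xenfs is mounted at /proc/xen and using the ioctl from
xen/privcmd.h (XENVER_version is hypercall subop 0):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <xen/privcmd.h>          /* IOCTL_PRIVCMD_HYPERCALL */
    #include <xen/interface/xen.h>    /* __HYPERVISOR_xen_version */

    static long xen_version_via_privcmd(void)
    {
        struct privcmd_hypercall call = {
            .op  = __HYPERVISOR_xen_version,
            .arg = { 0 /* XENVER_version */ },
        };
        int fd = open("/proc/xen/privcmd", O_RDWR);
        long ret;

        if (fd < 0)
            return -1;
        /* Dispatches to privcmd_ioctl_hypercall() -> privcmd_call(). */
        ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
        close(fd);
        return ret;    /* Xen version packed as (major << 16) | minor */
    }
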
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 6559e0c..984891e 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -12,6 +12,10 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/magic.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+
+#include <xen/xen.h>
#include "xenfs.h"
@@ -20,6 +24,62 @@
MODULE_DESCRIPTION("Xen filesystem");
MODULE_LICENSE("GPL");
+static int xenfs_set_page_dirty(struct page *page)
+{
+ return !TestSetPageDirty(page);
+}
+
+static const struct address_space_operations xenfs_aops = {
+ .set_page_dirty = xenfs_set_page_dirty,
+};
+
+static struct backing_dev_info xenfs_backing_dev_info = {
+ .ra_pages = 0, /* No readahead */
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
+{
+ struct inode *ret = new_inode(sb);
+
+ if (ret) {
+ ret->i_mode = mode;
+ ret->i_mapping->a_ops = &xenfs_aops;
+ ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info;
+ ret->i_uid = ret->i_gid = 0;
+ ret->i_blocks = 0;
+ ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
+ }
+ return ret;
+}
+
+static struct dentry *xenfs_create_file(struct super_block *sb,
+ struct dentry *parent,
+ const char *name,
+ const struct file_operations *fops,
+ void *data,
+ int mode)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ dentry = d_alloc_name(parent, name);
+ if (!dentry)
+ return NULL;
+
+ inode = xenfs_make_inode(sb, S_IFREG | mode);
+ if (!inode) {
+ dput(dentry);
+ return NULL;
+ }
+
+ inode->i_fop = fops;
+ inode->i_private = data;
+
+ d_add(dentry, inode);
+ return dentry;
+}
+
static ssize_t capabilities_read(struct file *file, char __user *buf,
size_t size, loff_t *off)
{
@@ -41,10 +101,23 @@
[1] = {},
{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
{ "capabilities", &capabilities_file_ops, S_IRUGO },
+ { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
{""},
};
+ int rc;
- return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+ if (rc < 0)
+ return rc;
+
+ if (xen_initial_domain()) {
+ xenfs_create_file(sb, sb->s_root, "xsd_kva",
+ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
+ xenfs_create_file(sb, sb->s_root, "xsd_port",
+ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
+ }
+
+ return rc;
}
static int xenfs_get_sb(struct file_system_type *fs_type,
@@ -63,16 +136,30 @@
static int __init xenfs_init(void)
{
- if (xen_pv_domain())
- return register_filesystem(&xenfs_type);
+ int err;
+ if (!xen_domain()) {
+ printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n");
+ return 0;
+ }
- printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
- return 0;
+ err = register_filesystem(&xenfs_type);
+ if (err) {
+ printk(KERN_ERR "xenfs: Unable to register filesystem!\n");
+ goto out;
+ }
+
+ err = bdi_init(&xenfs_backing_dev_info);
+ if (err)
+ unregister_filesystem(&xenfs_type);
+
+ out:
+
+ return err;
}
static void __exit xenfs_exit(void)
{
- if (xen_pv_domain())
+ if (xen_domain())
unregister_filesystem(&xenfs_type);
}
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
index 6c4269b..0ddef43 100644
--- a/drivers/xen/xenfs/xenbus.c
+++ b/drivers/xen/xenfs/xenbus.c
@@ -121,8 +121,12 @@
int ret;
mutex_lock(&u->reply_mutex);
+again:
while (list_empty(&u->read_buffers)) {
mutex_unlock(&u->reply_mutex);
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
ret = wait_event_interruptible(u->read_waitq,
!list_empty(&u->read_buffers));
if (ret)
@@ -140,7 +144,7 @@
i += sz - ret;
rb->cons += sz - ret;
- if (ret != sz) {
+ if (ret != 0) {
if (i == 0)
i = -EFAULT;
goto out;
@@ -156,6 +160,8 @@
struct read_buffer, list);
}
}
+ if (i == 0)
+ goto again;
out:
mutex_unlock(&u->reply_mutex);
@@ -403,6 +409,7 @@
mutex_lock(&u->reply_mutex);
rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+ wake_up(&u->read_waitq);
mutex_unlock(&u->reply_mutex);
}
@@ -451,7 +458,7 @@
ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
- if (ret == len) {
+ if (ret != 0) {
rc = -EFAULT;
goto out;
}
@@ -484,21 +491,6 @@
msg_type = u->u.msg.type;
switch (msg_type) {
- case XS_TRANSACTION_START:
- case XS_TRANSACTION_END:
- case XS_DIRECTORY:
- case XS_READ:
- case XS_GET_PERMS:
- case XS_RELEASE:
- case XS_GET_DOMAIN_PATH:
- case XS_WRITE:
- case XS_MKDIR:
- case XS_RM:
- case XS_SET_PERMS:
- /* Send out a transaction */
- ret = xenbus_write_transaction(msg_type, u);
- break;
-
case XS_WATCH:
case XS_UNWATCH:
/* (Un)Ask for some path to be watched for changes */
@@ -506,7 +498,8 @@
break;
default:
- ret = -EINVAL;
+ /* Send out a transaction */
+ ret = xenbus_write_transaction(msg_type, u);
break;
}
if (ret != 0)
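
With the O_NONBLOCK handling added above, a reader now gets -EAGAIN instead of
sleeping when no reply is queued. A hedged userspace sketch of the resulting
poll-driven read loop, again assuming xenfs is mounted at /proc/xen:

    #include <errno.h>
    #include <fcntl.h>
    #include <poll.h>
    #include <unistd.h>

    static ssize_t read_xenbus_reply(char *buf, size_t len)
    {
        int fd = open("/proc/xen/xenbus", O_RDWR | O_NONBLOCK);
        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        ssize_t n;

        if (fd < 0)
            return -1;
        for (;;) {
            n = read(fd, buf, len);
            if (n >= 0 || errno != EAGAIN)
                break;                 /* data, or a real error */
            poll(&pfd, 1, -1);         /* wait for queue_reply()+wake_up() */
        }
        close(fd);
        return n;
    }
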
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index 51f08b2..b68aa62 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -2,5 +2,8 @@
#define _XENFS_XENBUS_H
extern const struct file_operations xenbus_file_ops;
+extern const struct file_operations privcmd_file_ops;
+extern const struct file_operations xsd_kva_file_ops;
+extern const struct file_operations xsd_port_file_ops;
#endif /* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
new file mode 100644
index 0000000..af10804
--- /dev/null
+++ b/drivers/xen/xenfs/xenstored.c
@@ -0,0 +1,67 @@
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#include <xen/page.h>
+
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+
+static ssize_t xsd_read(struct file *file, char __user *buf,
+ size_t size, loff_t *off)
+{
+ const char *str = (const char *)file->private_data;
+ return simple_read_from_buffer(buf, size, off, str, strlen(str));
+}
+
+static int xsd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static int xsd_kva_open(struct inode *inode, struct file *file)
+{
+ file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p",
+ xen_store_interface);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+ return -EINVAL;
+
+ if (remap_pfn_range(vma, vma->vm_start,
+ virt_to_pfn(xen_store_interface),
+ size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+const struct file_operations xsd_kva_file_ops = {
+ .open = xsd_kva_open,
+ .mmap = xsd_kva_mmap,
+ .read = xsd_read,
+ .release = xsd_release,
+};
+
+static int xsd_port_open(struct inode *inode, struct file *file)
+{
+ file->private_data = (void *)kasprintf(GFP_KERNEL, "%d",
+ xen_store_evtchn);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+const struct file_operations xsd_port_file_ops = {
+ .open = xsd_port_open,
+ .read = xsd_read,
+ .release = xsd_release,
+};
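
These two files exist so that a userspace xenstored running in dom0 can find
the shared xenstore ring and its event channel. A hedged sketch of the
consumer side, assuming xenfs is mounted at /proc/xen:

    #include <fcntl.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void *map_xenstore_ring(int *port)
    {
        char buf[16] = "";
        int pfd = open("/proc/xen/xsd_port", O_RDONLY);
        int kfd;
        void *ring;

        if (pfd < 0)
            return NULL;
        read(pfd, buf, sizeof(buf) - 1);
        *port = atoi(buf);             /* xen_store_evtchn, as printed above */
        close(pfd);

        kfd = open("/proc/xen/xsd_kva", O_RDWR);
        if (kfd < 0)
            return NULL;
        /* Lands in xsd_kva_mmap(): one page backing xen_store_interface. */
        ring = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                    MAP_SHARED, kfd, 0);
        close(kfd);
        return ring == MAP_FAILED ? NULL : ring;
    }
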
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index f4906f6..e7233e8 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -154,4 +154,25 @@
}
#endif
+/*--------------------------------------------------------------------------
+ Memory
+ -------------------------------------------------------------------------- */
+#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
+struct acpi_memory_info {
+ struct list_head list;
+ u64 start_addr; /* Memory Range start physical addr */
+ u64 length; /* Memory Range length */
+ unsigned short caching; /* memory cache attribute */
+ unsigned short write_protect; /* memory read/write attribute */
+ unsigned int enabled:1;
+};
+
+struct acpi_memory_device {
+ struct acpi_device *device;
+ unsigned int state; /* State of the memory device */
+ struct list_head res_list;
+};
+#endif
+
#endif /*__ACPI_DRIVERS_H__*/
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index e7bdaaf..6aa3111 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -239,6 +239,25 @@
} piix4;
};
+extern int acpi_processor_errata(struct acpi_processor *pr);
+#ifdef CONFIG_ACPI_PROCFS
+extern int acpi_processor_add_fs(struct acpi_device *device);
+extern int acpi_processor_remove_fs(struct acpi_device *device);
+#else
+static inline int acpi_processor_add_fs(struct acpi_device *device)
+{
+ return 0;
+}
+
+static inline int acpi_processor_remove_fs(struct acpi_device *device)
+{
+ return 0;
+}
+#endif
+extern int acpi_processor_set_pdc(struct acpi_processor *pr);
+extern int acpi_processor_remove(struct acpi_device *device, int type);
+extern void acpi_processor_notify(struct acpi_device *device, u32 event);
+
extern int acpi_processor_preregister_performance(struct
acpi_processor_performance
*performance);
@@ -296,6 +315,8 @@
void acpi_processor_ppc_init(void);
void acpi_processor_ppc_exit(void);
int acpi_processor_ppc_has_changed(struct acpi_processor *pr);
+int acpi_processor_get_performance_info(struct acpi_processor *pr);
+int acpi_processor_get_psd(struct acpi_processor *pr);
#else
static inline void acpi_processor_ppc_init(void)
{
@@ -332,6 +353,7 @@
int acpi_processor_cst_has_changed(struct acpi_processor *pr);
int acpi_processor_power_exit(struct acpi_processor *pr,
struct acpi_device *device);
+int acpi_processor_get_power_info(struct acpi_processor *pr);
int acpi_processor_suspend(struct acpi_device * device, pm_message_t state);
int acpi_processor_resume(struct acpi_device * device);
extern struct cpuidle_driver acpi_idle_driver;
diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
index 26373cf..9fb4270 100644
--- a/include/asm-generic/pci.h
+++ b/include/asm-generic/pci.h
@@ -43,6 +43,8 @@
return root;
}
+#ifndef HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
+#endif
#ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
{
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 66713c6..29c4ec1 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1388,7 +1388,7 @@
#endif
/* Scatter Gather Support (drm_scatter.h) */
-extern void drm_sg_cleanup(struct drm_sg_mem * entry);
+extern void drm_sg_cleanup(struct drm_device *dev, struct drm_sg_mem * entry);
extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index dd97fb8..b10ec49 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@
unsigned long addr,
unsigned long size);
extern void free_bootmem(unsigned long addr, unsigned long size);
+extern void free_bootmem_late(unsigned long addr, unsigned long size);
/*
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 4a2b162..5de4c9e 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -208,16 +208,9 @@
u8 include_all:1; /* include all ports */
};
-/* Intel DMAR initialization functions */
extern int intel_iommu_init(void);
-#else
-static inline int intel_iommu_init(void)
-{
-#ifdef CONFIG_INTR_REMAP
- return dmar_dev_scope_init();
-#else
- return -ENODEV;
-#endif
-}
-#endif /* !CONFIG_DMAR */
+#else /* !CONFIG_DMAR: */
+static inline int intel_iommu_init(void) { return -ENODEV; }
+#endif /* CONFIG_DMAR */
+
#endif /* __DMAR_H__ */
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 862e7d4..74d67ca 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -763,6 +763,7 @@
* takes over; acceleration engine should be in a quiescent state */
/* hints */
+#define FBINFO_VIRTFB 0x0004 /* FB is System RAM, not device. */
#define FBINFO_PARTIAL_PAN_OK 0x0040 /* otw use pan only for double-buffering */
#define FBINFO_READS_FAST 0x0080 /* soft-copy faster than rendering */
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 176c518..d681cc9 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -81,6 +81,8 @@
#define IFLA_LINKINFO IFLA_LINKINFO
IFLA_NET_NS_PID,
IFLA_IFALIAS,
+ IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */
+ IFLA_VFINFO_LIST,
__IFLA_MAX
};
@@ -190,4 +192,47 @@
__u32 to;
};
+/* SR-IOV virtual function management section */
+
+enum {
+ IFLA_VF_INFO_UNSPEC,
+ IFLA_VF_INFO,
+ __IFLA_VF_INFO_MAX,
+};
+
+#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1)
+
+enum {
+ IFLA_VF_UNSPEC,
+ IFLA_VF_MAC, /* Hardware queue specific attributes */
+ IFLA_VF_VLAN,
+ IFLA_VF_TX_RATE, /* TX Bandwidth Allocation */
+ __IFLA_VF_MAX,
+};
+
+#define IFLA_VF_MAX (__IFLA_VF_MAX - 1)
+
+struct ifla_vf_mac {
+ __u32 vf;
+ __u8 mac[32]; /* MAX_ADDR_LEN */
+};
+
+struct ifla_vf_vlan {
+ __u32 vf;
+ __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */
+ __u32 qos;
+};
+
+struct ifla_vf_tx_rate {
+ __u32 vf;
+ __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */
+};
+
+struct ifla_vf_info {
+ __u32 vf;
+ __u8 mac[32];
+ __u32 vlan;
+ __u32 qos;
+ __u32 tx_rate;
+};
#endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 11e5be6..4c98621 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -109,6 +109,12 @@
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
+#ifdef CONFIG_XEN
+#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */
+struct vm_foreign_map {
+ struct page **map;
+};
+#endif
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -199,6 +205,11 @@
*/
int (*access)(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
+
+ /* Area-specific function for clearing the PTE at @ptep. Returns the
+ * original value of @ptep. */
+ pte_t (*zap_pte)(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, int is_fullmm);
#ifdef CONFIG_NUMA
/*
* set_policy() op must add a reference to any non-NULL @new mempolicy
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9d7e8f7..82ee25f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -28,6 +28,7 @@
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
+#include <linux/if_link.h>
#ifdef __KERNEL__
#include <linux/timer.h>
@@ -577,6 +578,13 @@
* this function is called when a VLAN id is unregistered.
*
* void (*ndo_poll_controller)(struct net_device *dev);
+ *
+ * SR-IOV management functions.
+ * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
+ * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
+ * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
+ * int (*ndo_get_vf_config)(struct net_device *dev,
+ * int vf, struct ifla_vf_info *ivf);
*/
#define HAVE_NET_DEVICE_OPS
struct net_device_ops {
@@ -626,6 +634,15 @@
#define HAVE_NETDEV_POLL
void (*ndo_poll_controller)(struct net_device *dev);
#endif
+ int (*ndo_set_vf_mac)(struct net_device *dev,
+ int queue, u8 *mac);
+ int (*ndo_set_vf_vlan)(struct net_device *dev,
+ int queue, u16 vlan, u8 qos);
+ int (*ndo_set_vf_tx_rate)(struct net_device *dev,
+ int vf, int rate);
+ int (*ndo_get_vf_config)(struct net_device *dev,
+ int vf,
+ struct ifla_vf_info *ivf);
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
int (*ndo_fcoe_enable)(struct net_device *dev);
int (*ndo_fcoe_disable)(struct net_device *dev);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b202b1..b03950e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -105,6 +105,9 @@
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
+#ifdef CONFIG_XEN
+ PG_foreign,
+#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
@@ -275,6 +278,23 @@
PAGEFLAG_FALSE(Uncached)
#endif
+#ifdef CONFIG_XEN
+TESTPAGEFLAG(Foreign, foreign)
+__SETPAGEFLAG(Foreign, foreign)
+CLEARPAGEFLAG(Foreign, foreign)
+#define SetPageForeign(_page, dtor) do { \
+ __SetPageForeign(_page); \
+ BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
+ (_page)->index = (long)(dtor); \
+} while (0)
+#define _PageForeignDestructor(_page) \
+ ((void (*)(struct page *, unsigned int))(_page)->index)
+#define PageForeignDestructor(_page, order) \
+ _PageForeignDestructor(_page)(_page, order)
+#else
+PAGEFLAG_FALSE(Foreign)
+#endif
+
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison)
TESTSETFLAG(HWPoison, hwpoison)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e07d194..ca28e46 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -609,6 +609,9 @@
extern void pci_stop_bus_device(struct pci_dev *dev);
void pci_setup_cardbus(struct pci_bus *bus);
extern void pci_sort_breadthfirst(void);
+#define dev_is_pci(d) ((d)->bus == &pci_bus_type)
+#define dev_is_pf(d) ((dev_is_pci(d) ? to_pci_dev(d)->is_physfn : false))
+#define dev_num_vf(d) ((dev_is_pci(d) ? pci_num_vf(to_pci_dev(d)) : 0))
/* Generic PCI functions exported to card drivers */
@@ -1124,6 +1127,9 @@
unsigned int devfn)
{ return NULL; }
+#define dev_is_pci(d) (false)
+#define dev_is_pf(d) (false)
+#define dev_num_vf(d) (0)
#endif /* CONFIG_PCI */
/* Include architecture-dependent settings and functions */
@@ -1279,6 +1285,7 @@
extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
extern void pci_disable_sriov(struct pci_dev *dev);
extern irqreturn_t pci_sriov_migration(struct pci_dev *dev);
+extern int pci_num_vf(struct pci_dev *dev);
#else
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{
@@ -1291,6 +1298,10 @@
{
return IRQ_NONE;
}
+static inline int pci_num_vf(struct pci_dev *dev)
+{
+ return 0;
+}
#endif
#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a339386..178faf9 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2719,3 +2719,6 @@
#define PCI_DEVICE_ID_RME_DIGI32 0x9896
#define PCI_DEVICE_ID_RME_DIGI32_PRO 0x9897
#define PCI_DEVICE_ID_RME_DIGI32_8 0x9898
+
+#define PCI_VENDOR_ID_XEN 0x5853
+#define PCI_DEVICE_ID_XEN_PLATFORM 0x0001
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 73b1f1c..113585a 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,6 +7,8 @@
struct dma_attrs;
struct scatterlist;
+extern int swiotlb_force;
+
/*
* Maximum allowable number of contiguous slabs to map,
 * must be a power of 2. What is the appropriate value?
@@ -20,9 +22,46 @@
*/
#define IO_TLB_SHIFT 11
-extern void
-swiotlb_init(void);
+/* swiotlb-core.c */
+extern void swiotlb_init(int verbose);
+#ifdef CONFIG_SWIOTLB
+extern void __init swiotlb_free(void);
+#else
+static inline void swiotlb_free(void) { }
+#endif
+extern void swiotlb_print_info(void);
+/* swiotlb-core.c: Internal book-keeping functions.
+ * Must be linked against the library to take advantage of them. */
+#ifdef CONFIG_SWIOTLB
+/*
+ * Enumeration for sync targets
+ */
+enum dma_sync_target {
+ SYNC_FOR_CPU = 0,
+ SYNC_FOR_DEVICE = 1,
+};
+extern char *io_tlb_start;
+extern char *io_tlb_end;
+extern unsigned long io_tlb_nslabs;
+extern void *io_tlb_overflow_buffer;
+extern unsigned long io_tlb_overflow;
+extern int is_swiotlb_buffer(phys_addr_t paddr);
+extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+ enum dma_data_direction dir);
+extern void *do_map_single(struct device *hwdev, phys_addr_t phys,
+ unsigned long start_dma_addr, size_t size, int dir);
+
+extern void do_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
+ int dir);
+
+extern void do_sync_single(struct device *hwdev, char *dma_addr, size_t size,
+ int dir, int target);
+extern void swiotlb_full(struct device *dev, size_t size, int dir, int do_panic);
+extern void __init swiotlb_init_early(size_t default_size, int verbose);
+#endif
+
+/* swiotlb.c: dma_ops functions. */
extern void
*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags);
@@ -88,4 +127,74 @@
extern int
swiotlb_dma_supported(struct device *hwdev, u64 mask);
+/* swiotlb-xen.c: dma_ops functions. */
+extern void xen_swiotlb_init(int verbose);
+extern void
+*xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+
+extern void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+
+extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs);
+extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs);
+
+extern int
+xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+ int direction);
+
+extern void
+xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+ int direction);
+
+extern int
+xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs);
+
+extern void
+xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs);
+
+extern void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir);
+
+extern void
+xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir);
+
+extern void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir);
+
+extern void
+xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir);
+
+extern void
+xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir);
+
+extern void
+xen_swiotlb_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir);
+
+extern int
+xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+
+extern int
+xen_swiotlb_dma_supported(struct device *hwdev, u64 mask);
+
+
#endif /* __LINUX_SWIOTLB_H */
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
index 4e65c16..84ad8f0 100644
--- a/include/xen/Kbuild
+++ b/include/xen/Kbuild
@@ -1 +1,2 @@
header-y += evtchn.h
+header-y += privcmd.h
diff --git a/include/xen/acpi.h b/include/xen/acpi.h
new file mode 100644
index 0000000..279142d
--- /dev/null
+++ b/include/xen/acpi.h
@@ -0,0 +1,106 @@
+#ifndef _XEN_ACPI_H
+#define _XEN_ACPI_H
+
+#include <linux/types.h>
+#include <acpi/acpi_drivers.h>
+#include <acpi/processor.h>
+#include <xen/xen.h>
+
+#ifdef CONFIG_XEN_S3
+#include <asm/xen/hypervisor.h>
+
+static inline bool xen_pv_acpi(void)
+{
+ return xen_pv_domain();
+}
+#else
+static inline bool xen_pv_acpi(void)
+{
+ return false;
+}
+#endif
+
+int acpi_notify_hypervisor_state(u8 sleep_state,
+				 u32 pm1a_cnt, u32 pm1b_cnt);
+
+/*
+ * Following are interfaces for xen acpi processor control
+ */
+
+/* Events notified to xen */
+#define PROCESSOR_PM_INIT 1
+#define PROCESSOR_PM_CHANGE 2
+#define PROCESSOR_HOTPLUG 3
+
+/* Objects for the PM events */
+#define PM_TYPE_IDLE 0
+#define PM_TYPE_PERF 1
+#define PM_TYPE_THR 2
+#define PM_TYPE_MAX 3
+
+#define XEN_MAX_ACPI_ID 255
+
+/* Processor hotplug events */
+#define HOTPLUG_TYPE_ADD 0
+#define HOTPLUG_TYPE_REMOVE 1
+
+int xen_acpi_processor_init(void);
+void xen_acpi_processor_exit(void);
+
+int xen_acpi_processor_power_init(struct acpi_processor *pr,
+ struct acpi_device *device);
+int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr);
+
+void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr);
+
+#ifdef CONFIG_CPU_FREQ
+int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr);
+int xen_acpi_processor_get_performance(struct acpi_processor *pr);
+#else
+static inline int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
+{
+ return acpi_processor_ppc_has_changed(pr);
+}
+static inline int xen_acpi_processor_get_performance(struct acpi_processor *pr)
+{
+ printk(KERN_WARNING
+ "Warning: xen_acpi_processor_get_performance not supported\n"
+ "Consider compiling CPUfreq support into your kernel.\n");
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
+int xen_hotadd_memory(struct acpi_memory_device *mem_device);
+#endif
+
+#if defined(CONFIG_ACPI_PROCESSOR_XEN) || \
+defined(CONFIG_ACPI_PROCESSOR_XEN_MODULE)
+
+struct processor_cntl_xen_ops {
+ /* Transfer processor PM events to xen */
+	int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event);
+ /* Notify physical processor status to xen */
+ int (*hotplug)(struct acpi_processor *pr, int type);
+};
+
+extern int processor_cntl_xen_notify(struct acpi_processor *pr,
+ int event, int type);
+extern int processor_cntl_xen_power_cache(int cpu, int cx,
+ struct acpi_power_register *reg);
+#else
+
+static inline int processor_cntl_xen_notify(struct acpi_processor *pr,
+ int event, int type)
+{
+ return 0;
+}
+static inline int processor_cntl_xen_power_cache(int cpu, int cx,
+ struct acpi_power_register *reg)
+{
+ return 0;
+}
+#endif /* CONFIG_ACPI_PROCESSOR_XEN */
+
+#endif /* _XEN_ACPI_H */
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
new file mode 100644
index 0000000..e751514
--- /dev/null
+++ b/include/xen/balloon.h
@@ -0,0 +1,8 @@
+#ifndef _XEN_BALLOON_H
+#define _XEN_BALLOON_H
+
+/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
+struct page **alloc_empty_pages_and_pagevec(int nr_pages);
+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
+
+#endif
diff --git a/include/xen/blkif.h b/include/xen/blkif.h
new file mode 100644
index 0000000..7172081
--- /dev/null
+++ b/include/xen/blkif.h
@@ -0,0 +1,123 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/xen.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol. Used to generate ring structs which contain
+ * the elements common to all protocols only. This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places. */
+struct blkif_common_request {
+ char dummy;
+};
+struct blkif_common_response {
+ char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_32_response {
+ uint64_t id; /* copied from request */
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t __attribute__((__aligned__(8))) id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_64_response {
+ uint64_t __attribute__((__aligned__(8))) id;
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+union blkif_back_rings {
+ struct blkif_back_ring native;
+ struct blkif_common_back_ring common;
+ struct blkif_x86_32_back_ring x86_32;
+ struct blkif_x86_64_back_ring x86_64;
+};
+
+enum blkif_protocol {
+ BLKIF_PROTOCOL_NATIVE = 1,
+ BLKIF_PROTOCOL_X86_32 = 2,
+ BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->sector_number = src->sector_number;
+ barrier();
+ if (n > dst->nr_segments)
+ n = dst->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->seg[i] = src->seg[i];
+}
+
+static inline void blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->sector_number = src->sector_number;
+ barrier();
+ if (n > dst->nr_segments)
+ n = dst->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
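
The union plus the two copy helpers above let a backend stay switch-free
everywhere except the one point where a request is pulled off the shared ring.
A sketch of that dispatch, assuming the standard RING_GET_REQUEST() accessor
from xen/interface/io/ring.h; get_request() itself is illustrative, not part
of this header:

    #include <linux/string.h>
    #include <xen/blkif.h>

    static void get_request(union blkif_back_rings *rings,
                            enum blkif_protocol protocol,
                            struct blkif_request *req, RING_IDX rc)
    {
        switch (protocol) {
        case BLKIF_PROTOCOL_NATIVE:
            memcpy(req, RING_GET_REQUEST(&rings->native, rc),
                   sizeof(*req));
            break;
        case BLKIF_PROTOCOL_X86_32:
            /* 32-bit layout differs (pack(4)), so copy field by field. */
            blkif_get_x86_32_req(req, RING_GET_REQUEST(&rings->x86_32, rc));
            break;
        case BLKIF_PROTOCOL_X86_64:
            blkif_get_x86_64_req(req, RING_GET_REQUEST(&rings->x86_64, rc));
            break;
        }
    }
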
diff --git a/include/xen/events.h b/include/xen/events.h
index e68d59a..7e17e2a 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -12,6 +12,8 @@
irq_handler_t handler,
unsigned long irqflags, const char *devname,
void *dev_id);
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu);
+
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname,
@@ -22,6 +24,12 @@
unsigned long irqflags,
const char *devname,
void *dev_id);
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
/*
* Common unbind function for all event sources. Takes IRQ to unbind from.
@@ -53,7 +61,42 @@
irq will be disabled so it won't deliver an interrupt. */
void xen_poll_irq(int irq);
+/* Poll waiting for an irq to become pending with a timeout. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq_timeout(int irq, u64 timeout);
+
/* Determine the IRQ which is bound to an event channel */
unsigned irq_from_evtchn(unsigned int evtchn);
+/* Allocate an irq for a physical interrupt, given a gsi. "Legacy"
+ GSIs are identity mapped; others are dynamically allocated as
+ usual. */
+int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
+
+/* De-allocates the above mentioned physical interrupt. */
+int xen_destroy_irq(int irq);
+
+/* Return vector allocated to pirq */
+int xen_vector_from_irq(unsigned pirq);
+
+/* Return gsi allocated to pirq */
+int xen_gsi_from_irq(unsigned pirq);
+
+#ifdef CONFIG_XEN_DOM0_PCI
+void xen_setup_pirqs(void);
+#else
+static inline void xen_setup_pirqs(void)
+{
+}
+#endif
+
+/* Determine whether to ignore this IRQ if passed to a guest. */
+int xen_ignore_irq(int irq);
+/* Xen HVM evtchn vector callback */
+extern void xen_hvm_callback_vector(void);
+extern int xen_have_vector_callback;
+int xen_set_callback_via(uint64_t via);
+void xen_evtchn_do_upcall(struct pt_regs *regs);
+void xen_hvm_evtchn_do_upcall(void);
+
#endif /* _XEN_EVENTS_H */
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
new file mode 100644
index 0000000..8bd1467
--- /dev/null
+++ b/include/xen/gntdev.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * gntdev.h
+ *
+ * Interface to /dev/xen/gntdev.
+ *
+ * Copyright (c) 2007, D G Murray
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTDEV_H__
+#define __LINUX_PUBLIC_GNTDEV_H__
+
+struct ioctl_gntdev_grant_ref {
+ /* The domain ID of the grant to be mapped. */
+ uint32_t domid;
+ /* The grant reference of the grant to be mapped. */
+ uint32_t ref;
+};
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. This does not perform the mapping, which is deferred
+ * until mmap() is called with @index as the offset.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
+struct ioctl_gntdev_map_grant_ref {
+ /* IN parameters */
+ /* The number of grants to be mapped. */
+ uint32_t count;
+ uint32_t pad;
+ /* OUT parameters */
+ /* The offset to be used on a subsequent call to mmap(). */
+ uint64_t index;
+ /* Variable IN parameter. */
+ /* Array of grant references, of size @count. */
+ struct ioctl_gntdev_grant_ref refs[1];
+};
+
+/*
+ * Removes the grant references from the mapping table of an instance
+ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
+struct ioctl_gntdev_unmap_grant_ref {
+ /* IN parameters */
+ /* The offset returned by the corresponding map operation. */
+ uint64_t index;
+ /* The number of pages to be unmapped. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ * supplied @vaddr must correspond to the start of the range; otherwise
+ * an error will result. It is only possible to munmap() the entire
+ * contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
+struct ioctl_gntdev_get_offset_for_vaddr {
+ /* IN parameters */
+ /* The virtual address of the first mapped page in a range. */
+ uint64_t vaddr;
+ /* OUT parameters */
+ /* The offset that was used in the initial mmap() operation. */
+ uint64_t offset;
+ /* The number of pages mapped in the VM area that begins at @vaddr. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Sets the maximum number of grants that may be mapped at once by this gntdev
+ * instance.
+ *
+ * N.B. This must be called before any other ioctl is performed on the device.
+ */
+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
+struct ioctl_gntdev_set_max_grants {
+ /* IN parameter */
+ /* The maximum number of grants that may be mapped at once. */
+ uint32_t count;
+};
+
+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
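For orientation, the ioctl flow above maps onto a short user-space sequence: map the grant to obtain an offset, mmap() at that offset, then munmap() before the unmap ioctl. A minimal sketch, assuming this header is exported to user space and that DOMID/REF (placeholders below) were learned from the granting domain, e.g. via xenstore:

/* Hedged sketch of the gntdev map/use/unmap cycle. */
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>

enum { DOMID = 1, REF = 8 };	/* placeholders; real values come from the peer */

int main(void)
{
	struct ioctl_gntdev_map_grant_ref map = {
		.count = 1,
		.refs[0] = { .domid = DOMID, .ref = REF },
	};
	struct ioctl_gntdev_unmap_grant_ref unmap;
	void *addr;
	int fd = open("/dev/xen/gntdev", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) < 0)
		return 1;
	/* The actual mapping is deferred to mmap(), keyed by map.index. */
	addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, map.index);
	if (addr == MAP_FAILED)
		return 1;
	/* ... use the shared page ... */
	munmap(addr, 4096);	/* must precede the unmap ioctl */
	unmap.index = map.index;
	unmap.count = 1;
	ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap);
	close(fd);
	return 0;
}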
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index a40f1cd..871b553 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -37,10 +37,16 @@
#ifndef __ASM_GNTTAB_H__
#define __ASM_GNTTAB_H__
-#include <asm/xen/hypervisor.h>
+#include <asm/page.h>
+
+#include <xen/interface/xen.h>
#include <xen/interface/grant_table.h>
+
+#include <asm/xen/hypervisor.h>
#include <asm/xen/grant_table.h>
+#include <xen/features.h>
+
/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
#define NR_GRANT_FRAMES 4
@@ -51,6 +57,9 @@
u16 count;
};
+void gnttab_reset_grant_page(struct page *page);
+
+int gnttab_init(void);
int gnttab_suspend(void);
int gnttab_resume(void);
@@ -80,6 +89,8 @@
int gnttab_query_foreign_access(grant_ref_t ref);
+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
+
/*
* operations on reserved batches of grant references
*/
@@ -106,12 +117,46 @@
void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
unsigned long pfn);
+static inline void
+gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
+ uint32_t flags, grant_ref_t ref, domid_t domid)
+{
+ if (flags & GNTMAP_contains_pte)
+ map->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ map->host_addr = __pa(addr);
+ else
+ map->host_addr = addr;
+
+ map->flags = flags;
+ map->ref = ref;
+ map->dom = domid;
+}
+
+static inline void
+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
+ uint32_t flags, grant_handle_t handle)
+{
+ if (flags & GNTMAP_contains_pte)
+ unmap->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ unmap->host_addr = __pa(addr);
+ else
+ unmap->host_addr = addr;
+
+ unmap->handle = handle;
+ unmap->dev_bus_addr = 0;
+}
+
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
struct grant_entry **__shared);
void arch_gnttab_unmap_shared(struct grant_entry *shared,
unsigned long nr_gframes);
+extern unsigned long xen_hvm_resume_frames;
+unsigned int gnttab_max_grant_frames(void);
+
#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
#endif /* __ASM_GNTTAB_H__ */
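The point of the new gnttab_set_map_op()/gnttab_set_unmap_op() helpers is that callers no longer need to special-case auto-translated guests when building hypercall arguments. A hedged kernel-side sketch of the usual pairing (GNTMAP_host_map, GNTST_okay and HYPERVISOR_grant_table_op come from the existing Xen headers):

#include <linux/errno.h>
#include <xen/grant_table.h>
#include <asm/xen/hypercall.h>

/* Sketch: map a foreign page at a kernel virtual address, then unmap it. */
static int map_foreign_page(unsigned long vaddr, grant_ref_t ref,
			    domid_t otherend, grant_handle_t *handle)
{
	struct gnttab_map_grant_ref op;

	gnttab_set_map_op(&op, vaddr, GNTMAP_host_map, ref, otherend);
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
		return -EFAULT;
	if (op.status != GNTST_okay)
		return -EINVAL;
	*handle = op.handle;
	return 0;
}

static void unmap_foreign_page(unsigned long vaddr, grant_handle_t handle)
{
	struct gnttab_unmap_grant_ref op;

	gnttab_set_unmap_op(&op, vaddr, GNTMAP_host_map, handle);
	HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
}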
diff --git a/include/xen/hvm.h b/include/xen/hvm.h
new file mode 100644
index 0000000..b193fa2
--- /dev/null
+++ b/include/xen/hvm.h
@@ -0,0 +1,30 @@
+/* Simple wrappers around HVM functions */
+#ifndef XEN_HVM_H__
+#define XEN_HVM_H__
+
+#include <xen/interface/hvm/params.h>
+#include <asm/xen/hypercall.h>
+
+static inline int hvm_get_parameter(int idx, uint64_t *value)
+{
+ struct xen_hvm_param xhv;
+ int r;
+
+ xhv.domid = DOMID_SELF;
+ xhv.index = idx;
+ r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
+ if (r < 0) {
+ printk(KERN_ERR "Cannot get hvm parameter %d: %d!\n",
+ idx, r);
+ return r;
+ }
+ *value = xhv.value;
+ return r;
+}
+
+#define HVM_CALLBACK_VIA_TYPE_VECTOR 0x2
+#define HVM_CALLBACK_VIA_TYPE_SHIFT 56
+#define HVM_CALLBACK_VECTOR(x) (((uint64_t)HVM_CALLBACK_VIA_TYPE_VECTOR)<<\
+ HVM_CALLBACK_VIA_TYPE_SHIFT | (x))
+
+#endif /* XEN_HVM_H__ */
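HVM_CALLBACK_VECTOR() builds the value written to HVM_PARAM_CALLBACK_IRQ for the vector delivery method (type 2 in bits 63:56, vector number in the low bits). A hedged sketch of how a guest might wire this up, using the declarations added to xen/events.h above; the vector number is a placeholder:

#include <linux/errno.h>
#include <xen/events.h>
#include <xen/features.h>
#include <xen/hvm.h>

static int init_evtchn_callback(void)
{
	const int callback_vector = 0xf3;	/* placeholder vector number */

	if (!xen_feature(XENFEAT_hvm_callback_vector))
		return -ENODEV;	/* fall back to a GSI/INTx callback */

	return xen_set_callback_via(HVM_CALLBACK_VECTOR(callback_vector));
}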
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h
index f51b641..70d2563 100644
--- a/include/xen/interface/features.h
+++ b/include/xen/interface/features.h
@@ -41,6 +41,12 @@
/* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
#define XENFEAT_mmu_pt_update_preserve_ad 5
+/* x86: Does this Xen host support the HVM callback vector type? */
+#define XENFEAT_hvm_callback_vector 8
+
+/* x86: pvclock algorithm is safe to use on HVM */
+#define XENFEAT_hvm_safe_pvclock 9
+
#define XENFEAT_NR_SUBMAPS 1
#endif /* __XEN_PUBLIC_FEATURES_H__ */
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
index 39da93c..c704fe5 100644
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -28,6 +28,7 @@
#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
#define __XEN_PUBLIC_GRANT_TABLE_H__
+#include <xen/interface/xen.h>
/***********************************
* GRANT TABLE REPRESENTATION
@@ -321,6 +322,28 @@
DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
/*
+ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
+ * tracked by <handle> but atomically replace the page table entry with one
+ * pointing to the machine address under <new_addr>. <new_addr> will be
+ * redirected to the null entry.
+ * NOTES:
+ * 1. The call may fail in an undefined manner if either mapping is not
+ * tracked by <handle>.
+ * 2. After executing a batch of unmaps, it is guaranteed that no stale
+ * mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_and_replace 7
+struct gnttab_unmap_and_replace {
+ /* IN parameters. */
+ uint64_t host_addr;
+ uint64_t new_addr;
+ grant_handle_t handle;
+ /* OUT parameters. */
+ int16_t status; /* GNTST_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
+
+/*
* Bitfield values for update_pin_status.flags.
*/
/* Map the grant entry for access by I/O devices. */
diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
new file mode 100644
index 0000000..a4827f4
--- /dev/null
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -0,0 +1,46 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
+#define __XEN_PUBLIC_HVM_HVM_OP_H__
+
+/* Get/set subcommands: the second argument of the hypercall is a
+ * pointer to a xen_hvm_param struct. */
+#define HVMOP_set_param 0
+#define HVMOP_get_param 1
+struct xen_hvm_param {
+ domid_t domid; /* IN */
+ uint32_t index; /* IN */
+ uint64_t value; /* IN/OUT */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
+
+/* Hint from PV drivers for pagetable destruction. */
+#define HVMOP_pagetable_dying 9
+struct xen_hvm_pagetable_dying {
+ /* Domain with a pagetable about to be destroyed. */
+ domid_t domid;
+ /* guest physical address of the toplevel pagetable dying */
+ aligned_u64 gpa;
+};
+typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
+
+#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
diff --git a/include/xen/interface/hvm/params.h b/include/xen/interface/hvm/params.h
new file mode 100644
index 0000000..1888d8c
--- /dev/null
+++ b/include/xen/interface/hvm/params.h
@@ -0,0 +1,95 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
+#define __XEN_PUBLIC_HVM_PARAMS_H__
+
+#include "hvm_op.h"
+
+/*
+ * Parameter space for HVMOP_{set,get}_param.
+ */
+
+/*
+ * How should CPU0 event-channel notifications be delivered?
+ * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt).
+ * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows:
+ * Domain = val[47:32], Bus = val[31:16],
+ * DevFn = val[15: 8], IntX = val[ 1: 0]
+ * val[63:56] == 2: val[7:0] is a vector number.
+ * If val == 0 then CPU0 event-channel notifications are not delivered.
+ */
+#define HVM_PARAM_CALLBACK_IRQ 0
+
+#define HVM_PARAM_STORE_PFN 1
+#define HVM_PARAM_STORE_EVTCHN 2
+
+#define HVM_PARAM_PAE_ENABLED 4
+
+#define HVM_PARAM_IOREQ_PFN 5
+
+#define HVM_PARAM_BUFIOREQ_PFN 6
+
+/*
+ * Set mode for virtual timers (currently x86 only):
+ * delay_for_missed_ticks (default):
+ * Do not advance a vcpu's time beyond the correct delivery time for
+ * interrupts that have been missed due to preemption. Deliver missed
+ * interrupts when the vcpu is rescheduled and advance the vcpu's virtual
+ * time stepwise for each one.
+ * no_delay_for_missed_ticks:
+ * As above, missed interrupts are delivered, but guest time always tracks
+ * wallclock (i.e., real) time while doing so.
+ * no_missed_ticks_pending:
+ * No missed interrupts are held pending. Instead, to ensure ticks are
+ * delivered at some non-zero rate, if we detect missed ticks then the
+ * internal tick alarm is not disabled if the VCPU is preempted during the
+ * next tick period.
+ * one_missed_tick_pending:
+ * Missed interrupts are collapsed together and delivered as one 'late tick'.
+ * Guest time always tracks wallclock (i.e., real) time.
+ */
+#define HVM_PARAM_TIMER_MODE 10
+#define HVMPTM_delay_for_missed_ticks 0
+#define HVMPTM_no_delay_for_missed_ticks 1
+#define HVMPTM_no_missed_ticks_pending 2
+#define HVMPTM_one_missed_tick_pending 3
+
+/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
+#define HVM_PARAM_HPET_ENABLED 11
+
+/* Identity-map page directory used by Intel EPT when CR0.PG=0. */
+#define HVM_PARAM_IDENT_PT 12
+
+/* Device Model domain, defaults to 0. */
+#define HVM_PARAM_DM_DOMAIN 13
+
+/* ACPI S state: currently support S0 and S3 on x86. */
+#define HVM_PARAM_ACPI_S_STATE 14
+
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS 15
+
+/* Boolean: Enable aligning all periodic vpts to reduce interrupts */
+#define HVM_PARAM_VPT_ALIGN 16
+
+#define HVM_NR_PARAMS 17
+
+#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
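The hvm_get_parameter() wrapper from xen/hvm.h is the intended way to read these parameters. A minimal sketch, for example to locate the xenstore ring of an HVM guest:

#include <xen/hvm.h>

static int discover_xenstore(unsigned long *pfn, int *evtchn)
{
	uint64_t v;
	int err;

	err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
	if (err)
		return err;
	*pfn = (unsigned long)v;

	err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
	if (err)
		return err;
	*evtchn = (int)v;
	return 0;
}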
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4..68dd2b4 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -91,4 +91,25 @@
#define VDISK_REMOVABLE 0x2
#define VDISK_READONLY 0x4
+/* Xen-defined major numbers for virtual disks; they look strangely
+ * familiar */
+#define XEN_IDE0_MAJOR 3
+#define XEN_IDE1_MAJOR 22
+#define XEN_SCSI_DISK0_MAJOR 8
+#define XEN_SCSI_DISK1_MAJOR 65
+#define XEN_SCSI_DISK2_MAJOR 66
+#define XEN_SCSI_DISK3_MAJOR 67
+#define XEN_SCSI_DISK4_MAJOR 68
+#define XEN_SCSI_DISK5_MAJOR 69
+#define XEN_SCSI_DISK6_MAJOR 70
+#define XEN_SCSI_DISK7_MAJOR 71
+#define XEN_SCSI_DISK8_MAJOR 128
+#define XEN_SCSI_DISK9_MAJOR 129
+#define XEN_SCSI_DISK10_MAJOR 130
+#define XEN_SCSI_DISK11_MAJOR 131
+#define XEN_SCSI_DISK12_MAJOR 132
+#define XEN_SCSI_DISK13_MAJOR 133
+#define XEN_SCSI_DISK14_MAJOR 134
+#define XEN_SCSI_DISK15_MAJOR 135
+
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
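The majors above deliberately mirror Linux's native IDE and SCSI disk majors, which lets a PV-aware guest match an emulated disk against its paravirtual twin. A small illustrative helper (not part of the patch):

static int xen_major_is_ide(int major)
{
	return major == XEN_IDE0_MAJOR || major == XEN_IDE1_MAJOR;
}

static int xen_major_is_scsi(int major)
{
	/* SCSI disk majors come in two contiguous runs plus sd0. */
	return major == XEN_SCSI_DISK0_MAJOR ||
	       (major >= XEN_SCSI_DISK1_MAJOR &&
		major <= XEN_SCSI_DISK7_MAJOR) ||
	       (major >= XEN_SCSI_DISK8_MAJOR &&
		major <= XEN_SCSI_DISK15_MAJOR);
}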
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
index 518481c..8309344 100644
--- a/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@ -131,6 +131,10 @@
#define _NETRXF_extra_info (3)
#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
+/* GSO Prefix descriptor. */
+#define _NETRXF_gso_prefix (4)
+#define NETRXF_gso_prefix (1U<<_NETRXF_gso_prefix)
+
struct xen_netif_rx_response {
uint16_t id;
uint16_t offset; /* Offset in page of start of received packet */
diff --git a/include/xen/interface/io/pciif.h b/include/xen/interface/io/pciif.h
new file mode 100644
index 0000000..c4177f3
--- /dev/null
+++ b/include/xen/interface/io/pciif.h
@@ -0,0 +1,124 @@
+/*
+ * PCI Backend/Frontend Common Data Structures & Macros
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCI_COMMON_H__
+#define __XEN_PCI_COMMON_H__
+
+/* Be sure to bump this number if you change this file */
+#define XEN_PCI_MAGIC "7"
+
+/* xen_pci_sharedinfo flags */
+#define _XEN_PCIF_active (0)
+#define XEN_PCIF_active (1<<_XEN_PCIF_active)
+#define _XEN_PCIB_AERHANDLER (1)
+#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER)
+#define _XEN_PCIB_active (2)
+#define XEN_PCIB_active (1<<_XEN_PCIB_active)
+
+/* xen_pci_op commands */
+#define XEN_PCI_OP_conf_read (0)
+#define XEN_PCI_OP_conf_write (1)
+#define XEN_PCI_OP_enable_msi (2)
+#define XEN_PCI_OP_disable_msi (3)
+#define XEN_PCI_OP_enable_msix (4)
+#define XEN_PCI_OP_disable_msix (5)
+#define XEN_PCI_OP_aer_detected (6)
+#define XEN_PCI_OP_aer_resume (7)
+#define XEN_PCI_OP_aer_mmio (8)
+#define XEN_PCI_OP_aer_slotreset (9)
+
+/* xen_pci_op error numbers */
+#define XEN_PCI_ERR_success (0)
+#define XEN_PCI_ERR_dev_not_found (-1)
+#define XEN_PCI_ERR_invalid_offset (-2)
+#define XEN_PCI_ERR_access_denied (-3)
+#define XEN_PCI_ERR_not_implemented (-4)
+/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
+#define XEN_PCI_ERR_op_failed (-5)
+
+/*
+ * This should be (PAGE_SIZE - sizeof(struct xen_pci_op)) / sizeof(struct
+ * xen_msix_entry) and should not exceed 128.
+ */
+#define SH_INFO_MAX_VEC 128
+
+struct xen_msix_entry {
+ uint16_t vector;
+ uint16_t entry;
+};
+struct xen_pci_op {
+ /* IN: what action to perform: XEN_PCI_OP_* */
+ uint32_t cmd;
+
+ /* OUT: will contain an error number (if any) from errno.h */
+ int32_t err;
+
+ /* IN: which device to touch */
+ uint32_t domain; /* PCI Domain/Segment */
+ uint32_t bus;
+ uint32_t devfn;
+
+ /* IN: which configuration registers to touch */
+ int32_t offset;
+ int32_t size;
+
+ /* IN/OUT: Contains the result after a READ or the value to WRITE */
+ uint32_t value;
+ /* IN: Contains extra info for this operation */
+ uint32_t info;
+ /* IN: param for MSI-X */
+ struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC];
+};
+
+/* used for PCIe AER handling */
+struct xen_pcie_aer_op {
+ /* IN: what action to perform: XEN_PCI_OP_* */
+ uint32_t cmd;
+ /* IN/OUT: return aer_op result or carry error_detected state as input */
+ int32_t err;
+
+ /* IN: which device to touch */
+ uint32_t domain; /* PCI Domain/Segment*/
+ uint32_t bus;
+ uint32_t devfn;
+};
+struct xen_pci_sharedinfo {
+ /* flags - XEN_PCIF_* */
+ uint32_t flags;
+ struct xen_pci_op op;
+ struct xen_pcie_aer_op aer_op;
+};
+
+#endif /* __XEN_PCI_COMMON_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
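A config-space access travels through the shared xen_pci_sharedinfo page: the frontend fills op, kicks the backend over an event channel, and waits for err/value to come back. A hedged sketch; do_pci_op() is a hypothetical kick-and-wait helper, not part of this header:

extern int do_pci_op(struct xen_pci_sharedinfo *sh, struct xen_pci_op *op);

static int pcifront_conf_read(struct xen_pci_sharedinfo *sh,
			      uint32_t domain, uint32_t bus, uint32_t devfn,
			      int offset, int size, uint32_t *val)
{
	struct xen_pci_op op = {
		.cmd    = XEN_PCI_OP_conf_read,
		.domain = domain,
		.bus    = bus,
		.devfn  = devfn,
		.offset = offset,
		.size   = size,
	};
	int err = do_pci_op(sh, &op);	/* hypothetical notify-and-wait */

	if (err)
		return err;
	if (op.err)
		return op.err;		/* XEN_PCI_ERR_* */
	*val = op.value;
	return 0;
}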
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index e8cbf43..c9ba846 100644
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -24,8 +24,15 @@
* A ring contains as many entries as will fit, rounded down to the nearest
* power of two (so we can mask with (size-1) to loop around).
*/
-#define __RING_SIZE(_s, _sz) \
- (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
+#define __CONST_RING_SIZE(_s, _sz) \
+ (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \
+ sizeof(((struct _s##_sring *)0)->ring[0])))
+
+/*
+ * The same for passing in an actual pointer instead of a name tag.
+ */
+#define __RING_SIZE(_s, _sz) \
+ (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
/*
* Macros to make the correct C datatypes for a new kind of ring.
@@ -73,7 +80,16 @@
struct __name##_sring { \
RING_IDX req_prod, req_event; \
RING_IDX rsp_prod, rsp_event; \
- uint8_t pad[48]; \
+ union { \
+ struct { \
+ uint8_t smartpoll_active; \
+ } netif; \
+ struct { \
+ uint8_t msg; \
+ } tapif_user; \
+ uint8_t pvt_pad[4]; \
+ } private; \
+ uint8_t pad[44]; \
union __name##_sring_entry ring[1]; /* variable-length */ \
}; \
\
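The motivation for __CONST_RING_SIZE() is that it takes a name tag rather than a pointer, so the result is an integer constant expression usable as an array bound; the pointer-based __RING_SIZE() can only be evaluated at run time. A small usage sketch, assuming the xen_netif_rx ring types declared in netif.h:

#include <asm/page.h>
#include <xen/interface/io/ring.h>
#include <xen/interface/io/netif.h>

/* Compile-time constant: expands via struct xen_netif_rx_sring. */
#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)

struct netfront_rx_shadow {
	struct xen_netif_rx_response rsp[NET_RX_RING_SIZE];
};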
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
index 46508c7..9fda532 100644
--- a/include/xen/interface/io/xenbus.h
+++ b/include/xen/interface/io/xenbus.h
@@ -27,8 +27,14 @@
XenbusStateClosing = 5, /* The device is being closed
due to an error or an unplug
event. */
- XenbusStateClosed = 6
+ XenbusStateClosed = 6,
+ /*
+ * Reconfiguring: The device is being reconfigured.
+ */
+ XenbusStateReconfiguring = 7,
+
+ XenbusStateReconfigured = 8
};
#endif /* _XEN_PUBLIC_IO_XENBUS_H */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index af36ead..aa4e368 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -9,6 +9,8 @@
#ifndef __XEN_PUBLIC_MEMORY_H__
#define __XEN_PUBLIC_MEMORY_H__
+#include <linux/spinlock.h>
+
/*
* Increase or decrease the specified domain's memory reservation. Returns a
* -ve errcode on failure, or the # extents successfully allocated or freed.
@@ -53,6 +55,48 @@
DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
/*
+ * An atomic exchange of memory pages. If return code is zero then
+ * @out.extent_list provides GMFNs of the newly-allocated memory.
+ * Returns zero on complete success, otherwise a negative error code.
+ * On complete success then always @nr_exchanged == @in.nr_extents.
+ * On partial success @nr_exchanged indicates how much work was done.
+ */
+#define XENMEM_exchange 11
+struct xen_memory_exchange {
+ /*
+ * [IN] Details of memory extents to be exchanged (GMFN bases).
+ * Note that @in.address_bits is ignored and unused.
+ */
+ struct xen_memory_reservation in;
+
+ /*
+ * [IN/OUT] Details of new memory extents.
+ * We require that:
+ * 1. @in.domid == @out.domid
+ * 2. @in.nr_extents << @in.extent_order ==
+ * @out.nr_extents << @out.extent_order
+ * 3. @in.extent_start and @out.extent_start lists must not overlap
+ * 4. @out.extent_start lists GPFN bases to be populated
+ * 5. @out.extent_start is overwritten with allocated GMFN bases
+ */
+ struct xen_memory_reservation out;
+
+ /*
+ * [OUT] Number of input extents that were successfully exchanged:
+ * 1. The first @nr_exchanged input extents were successfully
+ * deallocated.
+ * 2. The corresponding first entries in the output extent list correctly
+ * indicate the GMFNs that were successfully exchanged.
+ * 3. All other input and output extents are untouched.
+ * 4. If not all input extents are exchanged then the return code of this
+ * command will be non-zero.
+ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
+ */
+ unsigned long nr_exchanged;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
+/*
* Returns the maximum machine frame number of mapped RAM in this system.
* This command always succeeds (it never returns an error code).
* arg == NULL.
@@ -97,6 +141,19 @@
DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
/*
+ * Returns the location in virtual address space of the machine_to_phys
+ * mapping table. Architectures which do not have a m2p table, or which do not
+ * map it by default into guest address space, do not implement this command.
+ * arg == addr of xen_machphys_mapping_t.
+ */
+#define XENMEM_machphys_mapping 12
+struct xen_machphys_mapping {
+ unsigned long v_start, v_end; /* Start and end virtual addresses. */
+ unsigned long max_mfn; /* Maximum MFN that can be looked up. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t);
+
+/*
* Sets the GPFN at which a particular page appears in the specified guest's
* pseudophysical address space.
* arg == addr of xen_add_to_physmap_t.
@@ -142,4 +199,38 @@
};
DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
+/*
+ * Returns the pseudo-physical memory map as it was when the domain
+ * was started (specified by XENMEM_set_memory_map).
+ * arg == addr of struct xen_memory_map.
+ */
+#define XENMEM_memory_map 9
+struct xen_memory_map {
+ /*
+ * On call the number of entries which can be stored in buffer. On
+ * return the number of entries which have been stored in
+ * buffer.
+ */
+ unsigned int nr_entries;
+
+ /*
+ * Entries in the buffer are in the same format as returned by the
+ * BIOS INT 0x15 EAX=0xE820 call.
+ */
+ GUEST_HANDLE(void) buffer;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
+
+/*
+ * Returns the real physical memory map. Passes the same structure as
+ * XENMEM_memory_map.
+ * arg == addr of struct xen_memory_map.
+ */
+#define XENMEM_machine_memory_map 10
+
+/*
+ * Prevent the balloon driver from changing the memory reservation
+ * during a driver critical region.
+ */
+extern spinlock_t xen_reservation_lock;
#endif /* __XEN_PUBLIC_MEMORY_H__ */
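XENMEM_machphys_mapping is issued through the generic memory_op hypercall; architectures without a mapped M2P table simply fail it. A minimal sketch:

#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>

static int get_machphys_mapping(struct xen_machphys_mapping *m)
{
	int rc = HYPERVISOR_memory_op(XENMEM_machphys_mapping, m);

	if (rc)
		return rc;	/* command not implemented here */
	/* m->v_start..m->v_end bound the table; m->max_mfn caps lookups. */
	return 0;
}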
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
index cd69391..0703ef6 100644
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -39,6 +39,19 @@
};
/*
+ * Register a shared page for the hypervisor to indicate whether the guest
+ * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly
+ * once the guest has used this function, in that the associated event channel
+ * is automatically unmasked. The registered page is used as a bit
+ * array indexed by Xen's PIRQ value.
+ */
+#define PHYSDEVOP_pirq_eoi_gmfn 17
+struct physdev_pirq_eoi_gmfn {
+ /* IN */
+ unsigned long gmfn;
+};
+
+/*
* Query the status of an IRQ line.
* @arg == pointer to physdev_irq_status_query structure.
*/
@@ -106,6 +119,64 @@
uint32_t vector;
};
+#define MAP_PIRQ_TYPE_MSI 0x0
+#define MAP_PIRQ_TYPE_GSI 0x1
+#define MAP_PIRQ_TYPE_UNKNOWN 0x2
+
+#define PHYSDEVOP_map_pirq 13
+struct physdev_map_pirq {
+ domid_t domid;
+ /* IN */
+ int type;
+ /* IN */
+ int index;
+ /* IN or OUT */
+ int pirq;
+ /* IN */
+ int bus;
+ /* IN */
+ int devfn;
+ /* IN */
+ int entry_nr;
+ /* IN */
+ uint64_t table_base;
+};
+
+#define PHYSDEVOP_unmap_pirq 14
+struct physdev_unmap_pirq {
+ domid_t domid;
+ /* IN */
+ int pirq;
+};
+
+#define PHYSDEVOP_manage_pci_add 15
+#define PHYSDEVOP_manage_pci_remove 16
+struct physdev_manage_pci {
+ /* IN */
+ uint8_t bus;
+ uint8_t devfn;
+};
+
+#define PHYSDEVOP_restore_msi 19
+struct physdev_restore_msi {
+ /* IN */
+ uint8_t bus;
+ uint8_t devfn;
+};
+
+#define PHYSDEVOP_manage_pci_add_ext 20
+struct physdev_manage_pci_ext {
+ /* IN */
+ uint8_t bus;
+ uint8_t devfn;
+ unsigned is_extfn;
+ unsigned is_virtfn;
+ struct {
+ uint8_t bus;
+ uint8_t devfn;
+ } physfn;
+};
+
/*
* Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
* hypercall since 0x00030202.
@@ -121,6 +192,16 @@
} u;
};
+#define PHYSDEVOP_setup_gsi 21
+struct physdev_setup_gsi {
+ int gsi;
+ /* IN */
+ uint8_t triggering;
+ /* IN */
+ uint8_t polarity;
+ /* IN */
+};
+
/*
* Notify that some PIRQ-bound event channels have been unmasked.
* ** This command is obsolete since interface version 0x00030202 and is **
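PHYSDEVOP_map_pirq is the dom0-side entry point for binding a GSI (or MSI) to a pirq. A hedged sketch of the GSI case, letting Xen choose the pirq number:

#include <xen/interface/physdev.h>
#include <asm/xen/hypercall.h>

static int map_gsi_to_pirq(int gsi, int *pirq)
{
	struct physdev_map_pirq map = {
		.domid = DOMID_SELF,
		.type  = MAP_PIRQ_TYPE_GSI,
		.index = gsi,
		.pirq  = -1,	/* ask Xen to allocate one */
	};
	int rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map);

	if (rc)
		return rc;
	*pirq = map.pirq;
	return 0;
}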
diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h
new file mode 100644
index 0000000..17ae622
--- /dev/null
+++ b/include/xen/interface/platform.h
@@ -0,0 +1,381 @@
+/******************************************************************************
+ * platform.h
+ *
+ * Hardware platform operations. Intended for use by domain-0 kernel.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_PLATFORM_H__
+#define __XEN_PUBLIC_PLATFORM_H__
+
+#include "xen.h"
+
+#define XENPF_INTERFACE_VERSION 0x03000001
+
+/*
+ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
+ * 1 January, 1970 if the current system time was <system_time>.
+ */
+#define XENPF_settime 17
+struct xenpf_settime {
+ /* IN variables. */
+ uint32_t secs;
+ uint32_t nsecs;
+ uint64_t system_time;
+};
+typedef struct xenpf_settime xenpf_settime_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t);
+
+/*
+ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
+ * On x86, @type is an architecture-defined MTRR memory type.
+ * On success, returns the MTRR that was used (@reg) and a handle that can
+ * be passed to XENPF_del_memtype to accurately tear down the new setting.
+ * (x86-specific).
+ */
+#define XENPF_add_memtype 31
+struct xenpf_add_memtype {
+ /* IN variables. */
+ unsigned long mfn;
+ uint64_t nr_mfns;
+ uint32_t type;
+ /* OUT variables. */
+ uint32_t handle;
+ uint32_t reg;
+};
+typedef struct xenpf_add_memtype xenpf_add_memtype_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_add_memtype_t);
+
+/*
+ * Tear down an existing memory-range type. If @handle is remembered then it
+ * should be passed in to accurately tear down the correct setting (in case
+ * of overlapping memory regions with differing types). If it is not known
+ * then @handle should be set to zero. In all cases @reg must be set.
+ * (x86-specific).
+ */
+#define XENPF_del_memtype 32
+struct xenpf_del_memtype {
+ /* IN variables. */
+ uint32_t handle;
+ uint32_t reg;
+};
+typedef struct xenpf_del_memtype xenpf_del_memtype_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_del_memtype_t);
+
+/* Read current type of an MTRR (x86-specific). */
+#define XENPF_read_memtype 33
+struct xenpf_read_memtype {
+ /* IN variables. */
+ uint32_t reg;
+ /* OUT variables. */
+ unsigned long mfn;
+ uint64_t nr_mfns;
+ uint32_t type;
+};
+typedef struct xenpf_read_memtype xenpf_read_memtype_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_read_memtype_t);
+
+#define XENPF_microcode_update 35
+struct xenpf_microcode_update {
+ /* IN variables. */
+ GUEST_HANDLE(void) data; /* Pointer to microcode data */
+ uint32_t length; /* Length of microcode data. */
+};
+typedef struct xenpf_microcode_update xenpf_microcode_update_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_microcode_update_t);
+
+#define XENPF_platform_quirk 39
+#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */
+#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */
+#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */
+struct xenpf_platform_quirk {
+ /* IN variables. */
+ uint32_t quirk_id;
+};
+typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t);
+
+#define XENPF_firmware_info 50
+#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */
+#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */
+#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */
+struct xenpf_firmware_info {
+ /* IN variables. */
+ uint32_t type;
+ uint32_t index;
+ /* OUT variables. */
+ union {
+ struct {
+ /* Int13, Fn48: Check Extensions Present. */
+ uint8_t device; /* %dl: bios device number */
+ uint8_t version; /* %ah: major version */
+ uint16_t interface_support; /* %cx: support bitmap */
+ /* Int13, Fn08: Legacy Get Device Parameters. */
+ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */
+ uint8_t legacy_max_head; /* %dh: max head # */
+ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */
+ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */
+ /* NB. First uint16_t of buffer must be set to buffer size. */
+ GUEST_HANDLE(void) edd_params;
+ } disk_info; /* XEN_FW_DISK_INFO */
+ struct {
+ uint8_t device; /* bios device number */
+ uint32_t mbr_signature; /* offset 0x1b8 in mbr */
+ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */
+ struct {
+ /* Int10, AX=4F15: Get EDID info. */
+ uint8_t capabilities;
+ uint8_t edid_transfer_time;
+ /* must refer to 128-byte buffer */
+ GUEST_HANDLE(uchar) edid;
+ } vbeddc_info; /* XEN_FW_VBEDDC_INFO */
+ } u;
+};
+typedef struct xenpf_firmware_info xenpf_firmware_info_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t);
+
+#define XENPF_enter_acpi_sleep 51
+struct xenpf_enter_acpi_sleep {
+ /* IN variables */
+ uint16_t pm1a_cnt_val; /* PM1a control value. */
+ uint16_t pm1b_cnt_val; /* PM1b control value. */
+ uint32_t sleep_state; /* Which state to enter (Sn). */
+ uint32_t flags; /* Must be zero. */
+};
+typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_enter_acpi_sleep_t);
+
+#define XENPF_change_freq 52
+struct xenpf_change_freq {
+ /* IN variables */
+ uint32_t flags; /* Must be zero. */
+ uint32_t cpu; /* Physical cpu. */
+ uint64_t freq; /* New frequency (Hz). */
+};
+typedef struct xenpf_change_freq xenpf_change_freq_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t);
+
+/*
+ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
+ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
+ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
+ * bit set are written to. On return, @cpumap_bitmap is modified so that any
+ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
+ * cleared.
+ */
+#define XENPF_getidletime 53
+struct xenpf_getidletime {
+ /* IN/OUT variables */
+ /* IN: CPUs to interrogate; OUT: subset of IN which are present */
+ GUEST_HANDLE(uchar) cpumap_bitmap;
+ /* IN variables */
+ /* Size of cpumap bitmap. */
+ uint32_t cpumap_nr_cpus;
+ /* Must be indexable for every cpu in cpumap_bitmap. */
+ GUEST_HANDLE(uint64_t) idletime;
+ /* OUT variables */
+ /* System time when the idletime snapshots were taken. */
+ uint64_t now;
+};
+typedef struct xenpf_getidletime xenpf_getidletime_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
+
+#define XENPF_set_processor_pminfo 54
+
+/* ability bits */
+#define XEN_PROCESSOR_PM_CX 1
+#define XEN_PROCESSOR_PM_PX 2
+#define XEN_PROCESSOR_PM_TX 4
+
+/* cmd type */
+#define XEN_PM_CX 0
+#define XEN_PM_PX 1
+#define XEN_PM_TX 2
+
+/* Px sub info type */
+#define XEN_PX_PCT 1
+#define XEN_PX_PSS 2
+#define XEN_PX_PPC 4
+#define XEN_PX_PSD 8
+
+struct xen_power_register {
+ uint32_t space_id;
+ uint32_t bit_width;
+ uint32_t bit_offset;
+ uint32_t access_size;
+ uint64_t address;
+};
+
+struct xen_processor_csd {
+ uint32_t domain; /* domain number of one dependent group */
+ uint32_t coord_type; /* coordination type */
+ uint32_t num; /* number of processors in same domain */
+};
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_processor_csd);
+
+struct xen_processor_cx {
+ struct xen_power_register reg; /* GAS for Cx trigger register */
+ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
+ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
+ uint32_t power; /* average power consumption(mW) */
+ uint32_t dpcnt; /* number of dependency entries */
+ GUEST_HANDLE(xen_processor_csd) dp; /* NULL if no dependency */
+};
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_processor_cx);
+
+struct xen_processor_flags {
+ uint32_t bm_control:1;
+ uint32_t bm_check:1;
+ uint32_t has_cst:1;
+ uint32_t power_setup_done:1;
+ uint32_t bm_rld_set:1;
+};
+
+struct xen_processor_power {
+ uint32_t count; /* number of C state entries in array below */
+ struct xen_processor_flags flags; /* global flags of this processor */
+ GUEST_HANDLE(xen_processor_cx) states; /* supported c states */
+};
+
+struct xen_pct_register {
+ uint8_t descriptor;
+ uint16_t length;
+ uint8_t space_id;
+ uint8_t bit_width;
+ uint8_t bit_offset;
+ uint8_t reserved;
+ uint64_t address;
+};
+
+struct xen_processor_px {
+ uint64_t core_frequency; /* megahertz */
+ uint64_t power; /* milliWatts */
+ uint64_t transition_latency; /* microseconds */
+ uint64_t bus_master_latency; /* microseconds */
+ uint64_t control; /* control value */
+ uint64_t status; /* success indicator */
+};
+typedef struct xen_processor_px xen_processor_px_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_processor_px);
+
+struct xen_psd_package {
+ uint64_t num_entries;
+ uint64_t revision;
+ uint64_t domain;
+ uint64_t coord_type;
+ uint64_t num_processors;
+};
+
+struct xen_processor_performance {
+ uint32_t flags; /* flag for Px sub info type */
+ uint32_t platform_limit; /* Platform limitation on freq usage */
+ struct xen_pct_register control_register;
+ struct xen_pct_register status_register;
+ uint32_t state_count; /* total available performance states */
+ GUEST_HANDLE(xen_processor_px) states;
+ struct xen_psd_package domain_info;
+ uint32_t shared_type; /* coordination type of this processor */
+};
+typedef struct xen_processor_performance xen_processor_performance_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_processor_performance);
+
+struct xenpf_set_processor_pminfo {
+ /* IN variables */
+ uint32_t id; /* ACPI CPU ID */
+ uint32_t type; /* {XEN_PM_CX, XEN_PM_PX} */
+ union {
+ struct xen_processor_power power;/* Cx: _CST/_CSD */
+ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */
+ };
+};
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo);
+
+#define XENPF_get_cpuinfo 55
+struct xenpf_pcpuinfo {
+ /* IN */
+ uint32_t xen_cpuid;
+ /* OUT */
+ /* The maximum cpu_id that is present */
+ uint32_t max_present;
+#define XEN_PCPU_FLAGS_ONLINE 1
+ /* Corresponding xen_cpuid is not present */
+#define XEN_PCPU_FLAGS_INVALID 2
+ uint32_t flags;
+ uint32_t apic_id;
+ uint32_t acpi_id;
+};
+typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo_t);
+
+#define XENPF_cpu_online 56
+#define XENPF_cpu_offline 57
+struct xenpf_cpu_ol {
+ uint32_t cpuid;
+};
+typedef struct xenpf_cpu_ol xenpf_cpu_ol_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol_t);
+
+#define XENPF_cpu_hotadd 58
+struct xenpf_cpu_hotadd {
+ uint32_t apic_id;
+ uint32_t acpi_id;
+ uint32_t pxm;
+};
+
+
+#define XENPF_mem_hotadd 59
+struct xenpf_mem_hotadd {
+ uint64_t spfn;
+ uint64_t epfn;
+ uint32_t pxm;
+ uint32_t flags;
+};
+
+struct xen_platform_op {
+ uint32_t cmd;
+ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
+ union {
+ struct xenpf_settime settime;
+ struct xenpf_add_memtype add_memtype;
+ struct xenpf_del_memtype del_memtype;
+ struct xenpf_read_memtype read_memtype;
+ struct xenpf_microcode_update microcode;
+ struct xenpf_platform_quirk platform_quirk;
+ struct xenpf_firmware_info firmware_info;
+ struct xenpf_enter_acpi_sleep enter_acpi_sleep;
+ struct xenpf_change_freq change_freq;
+ struct xenpf_getidletime getidletime;
+ struct xenpf_set_processor_pminfo set_pminfo;
+ struct xenpf_pcpuinfo pcpu_info;
+ struct xenpf_cpu_ol cpu_ol;
+ struct xenpf_cpu_hotadd cpu_add;
+ struct xenpf_mem_hotadd mem_add;
+ uint8_t pad[128];
+ } u;
+};
+typedef struct xen_platform_op xen_platform_op_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_platform_op_t);
+
+#endif /* __XEN_PUBLIC_PLATFORM_H__ */
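Every platform op goes through the same envelope: set cmd and interface_version, fill the matching union member, issue the hypercall. A hedged sketch for XENPF_settime; the wrapper name HYPERVISOR_dom0_op is an assumption and varies between trees:

#include <xen/interface/platform.h>
#include <asm/xen/hypercall.h>

static int xen_settime_sketch(uint32_t secs, uint32_t nsecs,
			      uint64_t system_time)
{
	struct xen_platform_op op = {
		.cmd = XENPF_settime,
		.interface_version = XENPF_INTERFACE_VERSION,
		.u.settime = {
			.secs        = secs,
			.nsecs       = nsecs,
			.system_time = system_time,
		},
	};

	return HYPERVISOR_dom0_op(&op);	/* assumed hypercall wrapper */
}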
diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h
index 5fec575..dd55dac 100644
--- a/include/xen/interface/sched.h
+++ b/include/xen/interface/sched.h
@@ -65,6 +65,39 @@
DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
/*
+ * Declare a shutdown for another domain. The main use of this function is
+ * in interpreting shutdown requests and reasons for fully-virtualized
+ * domains. A para-virtualized domain may use SCHEDOP_shutdown directly.
+ * @arg == pointer to sched_remote_shutdown structure.
+ */
+#define SCHEDOP_remote_shutdown 4
+struct sched_remote_shutdown {
+ domid_t domain_id; /* Remote domain ID */
+ unsigned int reason; /* SHUTDOWN_xxx reason */
+};
+
+/*
+ * Latch a shutdown code, so that when the domain later shuts down it
+ * reports this code to the control tools.
+ * @arg == as for SCHEDOP_shutdown.
+ */
+#define SCHEDOP_shutdown_code 5
+
+/*
+ * Setup, poke and destroy a domain watchdog timer.
+ * @arg == pointer to sched_watchdog structure.
+ * With id == 0, setup a domain watchdog timer to cause domain shutdown
+ * after timeout, returns watchdog id.
+ * With id != 0 and timeout == 0, destroy domain watchdog timer.
+ * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
+ */
+#define SCHEDOP_watchdog 6
+struct sched_watchdog {
+ uint32_t id; /* watchdog ID */
+ uint32_t timeout; /* timeout */
+};
+
+/*
* Reason codes for SCHEDOP_shutdown. These may be interpreted by control
* software to determine the appropriate action. For the most part, Xen does
* not care about the shutdown code.
@@ -73,5 +106,6 @@
#define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */
#define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
+#define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */
#endif /* __XEN_PUBLIC_SCHED_H__ */
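The watchdog comment above encodes a small state machine: id == 0 creates a watchdog (the call returns the new id), id != 0 with a timeout pokes it, and id != 0 with timeout == 0 destroys it. A hedged sketch, assuming HYPERVISOR_sched_op returns the new id on setup as described:

#include <linux/errno.h>
#include <xen/interface/sched.h>
#include <asm/xen/hypercall.h>

static uint32_t wd_id;

static int watchdog_setup(uint32_t timeout)
{
	struct sched_watchdog wd = { .id = 0, .timeout = timeout };
	int rc = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);

	if (rc <= 0)
		return rc ? rc : -ENODEV;
	wd_id = rc;	/* Xen handed back the new watchdog id */
	return 0;
}

static void watchdog_poke(uint32_t timeout)
{
	/* timeout != 0 pokes and rearms; timeout == 0 destroys. */
	struct sched_watchdog wd = { .id = wd_id, .timeout = timeout };

	HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);
}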
diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h
new file mode 100644
index 0000000..f31fdab
--- /dev/null
+++ b/include/xen/interface/xen-mca.h
@@ -0,0 +1,429 @@
+/******************************************************************************
+ * arch-x86/mca.h
+ *
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@amd.com>
+ *
+ * Guest OS machine check interface to x86 Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Full MCA functionality has the following Usecases from the guest side:
+ *
+ * Must have's:
+ * 1. Dom0 and DomU register machine check trap callback handlers
+ * (already done via "set_trap_table" hypercall)
+ * 2. Dom0 registers machine check event callback handler
+ * (doable via EVTCHNOP_bind_virq)
+ * 3. Dom0 and DomU fetches machine check data
+ * 4. Dom0 wants Xen to notify a DomU
+ * 5. Dom0 gets DomU ID from physical address
+ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
+ *
+ * Nice to have's:
+ * 7. Dom0 wants Xen to deactivate a physical CPU
+ * This is better done as a separate task, physical CPU hotplugging,
+ * and hypercall(s) should be sysctl's
+ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
+ * move a DomU (or Dom0 itself) away from a malicious page
+ * producing correctable errors.
+ * 9. offlining physical page:
+ * Xen free's and never re-uses a certain physical page.
+ * 10. Testfacility: Allow Dom0 to write values into machine check MSR's
+ * and tell Xen to trigger a machine check
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+/*
+ * The xen-unstable repo has interface version 0x03000001; our interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc003
+
+/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */
+#define XEN_MC_NONURGENT 0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent error log entry */
+#define XEN_MC_URGENT 0x0002
+/* IN: Dom0 acknowledges previously-fetched error log entry */
+#define XEN_MC_ACK 0x0004
+
+/* OUT: All is ok */
+#define XEN_MC_OK 0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED 0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA 0x2
+/* OUT: Between notification time and this hypercall another
+ * (most likely correctable) error happened. The fetched data
+ * does not match the original machine check data. */
+#define XEN_MC_NOMATCH 0x4
+
+/* OUT: DomU did not register MC NMI handler. Try something else. */
+#define XEN_MC_CANNOTHANDLE 0x8
+/* OUT: Notifying DomU failed. Retry later or try something else. */
+#define XEN_MC_NOTDELIVERED 0x10
+/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
+
+
+#ifndef __ASSEMBLY__
+
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
+/*
+ * Machine Check Architecture:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: Correctable errors are reported via VIRQ_MCA
+ */
+#define MC_TYPE_GLOBAL 0
+#define MC_TYPE_BANK 1
+#define MC_TYPE_EXTENDED 2
+#define MC_TYPE_RECOVERY 3
+
+struct mcinfo_common {
+ uint16_t type; /* structure type */
+ uint16_t size; /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE (1 << 0)
+#define MC_FLAG_UNCORRECTABLE (1 << 1)
+#define MC_FLAG_RECOVERABLE (1 << 2)
+#define MC_FLAG_POLLED (1 << 3)
+#define MC_FLAG_RESET (1 << 4)
+#define MC_FLAG_CMCI (1 << 5)
+#define MC_FLAG_MCE (1 << 6)
+/* contains global x86 mc information */
+struct mcinfo_global {
+ struct mcinfo_common common;
+
+ /* running domain at the time in error (most likely
+ * the impacted one) */
+ uint16_t mc_domid;
+ uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
+ uint32_t mc_socketid; /* physical socket of the physical core */
+ uint16_t mc_coreid; /* physical impacted core */
+ uint16_t mc_core_threadid; /* core thread of physical core */
+ uint32_t mc_apicid;
+ uint32_t mc_flags;
+ uint64_t mc_gstatus; /* global status */
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+ struct mcinfo_common common;
+
+ uint16_t mc_bank; /* bank nr */
+ uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on
+ * privileged pv-ops dom and if mc_addr is valid.
+ * Never valid on DomU. */
+ uint64_t mc_status; /* bank status */
+ uint64_t mc_addr; /* bank address, only valid
+ * if addr bit is set in mc_status */
+ uint64_t mc_misc;
+ uint64_t mc_ctrl2;
+ uint64_t mc_tsc;
+};
+
+
+struct mcinfo_msr {
+ uint64_t reg; /* MSR */
+ uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other
+ * or additional mc MSRs */
+struct mcinfo_extended {
+ struct mcinfo_common common;
+
+ /* You can fill up to five registers.
+ * If you need more, then use this structure
+ * multiple times. */
+
+ uint32_t mc_msrs; /* Number of msr with valid values. */
+ /*
+ * Currently Intel extended MSR (32/64) include all gp registers
+ * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be
+ * useful at present. So expand this array to 16/32 to leave room.
+ */
+ struct mcinfo_msr mc_msr[sizeof(void *) * 4];
+};
+
+/* Recovery Action flags. Giving recovery result information to DOM0 */
+
+/* Xen takes successful recovery action, the error is recovered */
+#define REC_ACTION_RECOVERED (0x1 << 0)
+/* No action is performed by XEN */
+#define REC_ACTION_NONE (0x1 << 1)
+/* It's possible DOM0 might take action ownership in some case */
+#define REC_ACTION_NEED_RESET (0x1 << 2)
+
+/* Different Recovery Action types, if the action is performed successfully,
+ * REC_ACTION_RECOVERED flag will be returned.
+ */
+
+/* Page Offline Action */
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+/* CPU offline Action */
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+/* L3 cache disable Action */
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+
+/* Below interface used between XEN/DOM0 for passing XEN's recovery action
+ * information to DOM0.
+ * Usage scenario: after offlining a broken page, XEN might pass its page
+ * offline recovery action result to DOM0. DOM0 will save the information in
+ * non-volatile memory for further proactive actions, such as offlining the
+ * easily-broken page earlier at the next reboot.
+*/
+struct page_offline_action {
+ /* Params for passing the offlined page number to DOM0 */
+ uint64_t mfn;
+ uint64_t status;
+};
+
+struct cpu_offline_action {
+ /* Params for passing the identity of the offlined CPU to DOM0 */
+ uint32_t mc_socketid;
+ uint16_t mc_coreid;
+ uint16_t mc_core_threadid;
+};
+
+#define MAX_UNION_SIZE 16
+struct mcinfo_recovery {
+ struct mcinfo_common common;
+ uint16_t mc_bank; /* bank nr */
+ /* Recovery Action Flags defined above such as REC_ACTION_DONE */
+ uint8_t action_flags;
+ /* Recovery Action types defined above such as MC_ACTION_PAGE_OFFLINE */
+ uint8_t action_types;
+ /* In future if more than one recovery action permitted per error bank,
+ * a mcinfo_recovery data array will be returned
+ */
+ union {
+ struct page_offline_action page_retire;
+ struct cpu_offline_action cpu_offline;
+ uint8_t pad[MAX_UNION_SIZE];
+ } action_info;
+};
+
+
+#define MCINFO_HYPERCALLSIZE 1024
+#define MCINFO_MAXSIZE 768
+
+struct mc_info {
+ /* Number of mcinfo_* entries in mi_data */
+ uint32_t mi_nentries;
+ uint32_t _pad0;
+ uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
+};
+typedef struct mc_info mc_info_t;
+DEFINE_GUEST_HANDLE_STRUCT(mc_info);
+
+#define __MC_MSR_ARRAYSIZE 8
+#define __MC_NMSRS 1
+#define MC_NCAPS 7 /* 7 CPU feature flag words */
+#define MC_CAPS_STD_EDX 0 /* cpuid level 0x00000001 (%edx) */
+#define MC_CAPS_AMD_EDX 1 /* cpuid level 0x80000001 (%edx) */
+#define MC_CAPS_TM 2 /* cpuid level 0x80860001 (TransMeta) */
+#define MC_CAPS_LINUX 3 /* Linux-defined */
+#define MC_CAPS_STD_ECX 4 /* cpuid level 0x00000001 (%ecx) */
+#define MC_CAPS_VIA 5 /* cpuid level 0xc0000001 */
+#define MC_CAPS_AMD_ECX 6 /* cpuid level 0x80000001 (%ecx) */
+
+struct mcinfo_logical_cpu {
+ uint32_t mc_cpunr;
+ uint32_t mc_chipid;
+ uint16_t mc_coreid;
+ uint16_t mc_threadid;
+ uint32_t mc_apicid;
+ uint32_t mc_clusterid;
+ uint32_t mc_ncores;
+ uint32_t mc_ncores_active;
+ uint32_t mc_nthreads;
+ int32_t mc_cpuid_level;
+ uint32_t mc_family;
+ uint32_t mc_vendor;
+ uint32_t mc_model;
+ uint32_t mc_step;
+ char mc_vendorid[16];
+ char mc_brandid[64];
+ uint32_t mc_cpu_caps[MC_NCAPS];
+ uint32_t mc_cache_size;
+ uint32_t mc_cache_alignment;
+ int32_t mc_nmsrvals;
+ struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
+};
+typedef struct mcinfo_logical_cpu mcinfo_logical_cpu_t;
+DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu);
+
+
+/*
+ * OS's should use these instead of writing their own lookup function
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ * uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi) \
+ ((_mi)->mi_nentries)
+/* Prototype:
+ * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi) \
+ ((struct mcinfo_common *)(_mi)->mi_data)
+/* Prototype:
+ * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic) \
+ ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
+
+/* Prototype:
+ * void x86_mcinfo_lookup(struct mcinfo_common **ret, struct mc_info *mi,
+ *                        uint16_t type);
+ */
+
+static inline void x86_mcinfo_lookup
+ (struct mcinfo_common **ret, struct mc_info *mi, uint16_t type)
+{
+ uint32_t found = 0, i;
+ struct mcinfo_common *mic;
+
+ *ret = NULL;
+ if (!mi)
+ return;
+ mic = x86_mcinfo_first(mi);
+
+ for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+ if (mic->type == type) {
+ found = 1;
+ break;
+ }
+ mic = x86_mcinfo_next(mic);
+ }
+
+ *ret = found ? mic : NULL;
+}
+/* Usecase 1
+ * Register machine check trap callback handler
+ * (already done via "set_trap_table" hypercall)
+ */
+
+/* Usecase 2
+ * Dom0 registers machine check event callback handler
+ * done by EVTCHNOP_bind_virq
+ */
+
+/* Usecase 3
+ * Fetch machine check data from hypervisor.
+ * Note, this hypercall is special, because both Dom0 and DomU must use this.
+ */
+#define XEN_MC_fetch 1
+struct xen_mc_fetch {
+ /* IN/OUT variables.
+ * IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+ * XEN_MC_ACK if ack'king an earlier fetch
+ * OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+ * XEN_MC_NODATA, XEN_MC_NOMATCH
+ */
+ uint32_t flags;
+ uint32_t _pad0;
+ /* OUT: id for ack, IN: id we are ack'ing */
+ uint64_t fetch_id;
+
+ /* OUT variables. */
+ GUEST_HANDLE(mc_info) data;
+};
+typedef struct xen_mc_fetch xen_mc_fetch_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch);
+
+
+/* Usecase 4
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain 2
+struct xen_mc_notifydomain {
+ /* IN variables. */
+ uint16_t mc_domid;/* The unprivileged domain to notify. */
+ uint16_t mc_vcpuid;/* The vcpu in mc_domid to notify.
+ * Usually echo'd value from the fetch hypercall. */
+
+ /* IN/OUT variables. */
+ uint32_t flags;
+
+/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
+};
+typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain);
+
+#define XEN_MC_physcpuinfo 3
+struct xen_mc_physcpuinfo {
+ /* IN/OUT */
+ uint32_t ncpus;
+ uint32_t _pad0;
+ /* OUT */
+ GUEST_HANDLE(mcinfo_logical_cpu) info;
+};
+
+#define XEN_MC_msrinject 4
+#define MC_MSRINJ_MAXMSRS 8
+struct xen_mc_msrinject {
+ /* IN */
+ uint32_t mcinj_cpunr;/* target processor id */
+ uint32_t mcinj_flags;/* see MC_MSRINJ_F_* below */
+ uint32_t mcinj_count;/* 0 .. count-1 in array are valid */
+ uint32_t _pad0;
+ struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
+};
+
+/* Flags for mcinj_flags above; bits 16-31 are reserved */
+#define MC_MSRINJ_F_INTERPOSE 0x1
+
+#define XEN_MC_mceinject 5
+struct xen_mc_mceinject {
+ unsigned int mceinj_cpunr; /* target processor id */
+};
+
+struct xen_mc {
+ uint32_t cmd;
+ uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+ union {
+ struct xen_mc_fetch mc_fetch;
+ struct xen_mc_notifydomain mc_notifydomain;
+ struct xen_mc_physcpuinfo mc_physcpuinfo;
+ struct xen_mc_msrinject mc_msrinject;
+ struct xen_mc_mceinject mc_mceinject;
+ } u;
+};
+typedef struct xen_mc xen_mc_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
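Consumers of a fetched mc_info walk it with the accessors above rather than hand-rolling offsets. A short sketch of the usual pattern: pull out the global record first, then any bank record:

static void inspect_mc_info(struct mc_info *mi)
{
	struct mcinfo_common *mic;
	struct mcinfo_global *g;
	struct mcinfo_bank *b;

	x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
	if (!mic)
		return;
	g = (struct mcinfo_global *)mic;
	/* g->mc_flags distinguishes MCE/CMCI/polled events. */

	x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
	if (mic) {
		b = (struct mcinfo_bank *)mic;
		/* b->mc_status/mc_addr mirror the native MCA bank MSRs. */
	}
}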
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 2befa3e..391e0b7 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -30,7 +30,7 @@
#define __HYPERVISOR_stack_switch 3
#define __HYPERVISOR_set_callbacks 4
#define __HYPERVISOR_fpu_taskswitch 5
-#define __HYPERVISOR_sched_op 6
+#define __HYPERVISOR_sched_op_compat 6
#define __HYPERVISOR_dom0_op 7
#define __HYPERVISOR_set_debugreg 8
#define __HYPERVISOR_get_debugreg 9
@@ -52,7 +52,7 @@
#define __HYPERVISOR_mmuext_op 26
#define __HYPERVISOR_acm_op 27
#define __HYPERVISOR_nmi_op 28
-#define __HYPERVISOR_sched_op_new 29
+#define __HYPERVISOR_sched_op 29
#define __HYPERVISOR_callback_op 30
#define __HYPERVISOR_xenoprof_op 31
#define __HYPERVISOR_event_channel_op 32
@@ -79,6 +79,7 @@
#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
+#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
@@ -184,6 +185,8 @@
#define MMUEXT_NEW_USER_BASEPTR 15
#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
struct mmuext_op {
unsigned int cmd;
union {
@@ -449,9 +452,49 @@
int8_t cmd_line[MAX_GUEST_CMDLINE];
};
+struct dom0_vga_console_info {
+ uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
+#define XEN_VGATYPE_TEXT_MODE_3 0x03
+#define XEN_VGATYPE_VESA_LFB 0x23
+
+ union {
+ struct {
+ /* Font height, in pixels. */
+ uint16_t font_height;
+ /* Cursor location (column, row). */
+ uint16_t cursor_x, cursor_y;
+ /* Number of rows and columns (dimensions in characters). */
+ uint16_t rows, columns;
+ } text_mode_3;
+
+ struct {
+ /* Width and height, in pixels. */
+ uint16_t width, height;
+ /* Bytes per scan line. */
+ uint16_t bytes_per_line;
+ /* Bits per pixel. */
+ uint16_t bits_per_pixel;
+ /* LFB physical address, and size (in units of 64kB). */
+ uint32_t lfb_base;
+ uint32_t lfb_size;
+ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
+ uint8_t red_pos, red_size;
+ uint8_t green_pos, green_size;
+ uint8_t blue_pos, blue_size;
+ uint8_t rsvd_pos, rsvd_size;
+
+ /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
+ uint32_t gbl_caps;
+ /* Mode attributes (offset 0x0, VESA command 0x4f01). */
+ uint16_t mode_attrs;
+ } vesa_lfb;
+ } u;
+};
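+
+/*
+ * Example (sketch): consumers of this structure must switch on
+ * video_type before touching the union:
+ *
+ *     if (info->video_type == XEN_VGATYPE_TEXT_MODE_3)
+ *             rows = info->u.text_mode_3.rows;
+ *     else if (info->video_type == XEN_VGATYPE_VESA_LFB)
+ *             lfb_base = info->u.vesa_lfb.lfb_base;
+ */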
+
/* These flags are passed in the 'flags' field of start_info_t. */
#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
+#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
typedef uint64_t cpumap_t;
@@ -461,6 +504,8 @@
#define __mk_unsigned_long(x) x ## UL
#define mk_unsigned_long(x) __mk_unsigned_long(x)
+DEFINE_GUEST_HANDLE(uint64_t);
+
#else /* __ASSEMBLY__ */
/* In assembly code we cannot use C numeric constant suffixes. */
diff --git a/include/xen/page.h b/include/xen/page.h
index eaf85fa..0be36b9 100644
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -1 +1,8 @@
+#ifndef _XEN_PAGE_H
+#define _XEN_PAGE_H
+
#include <asm/xen/page.h>
+
+extern phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+
+#endif /* _XEN_PAGE_H */
diff --git a/include/xen/pcpu.h b/include/xen/pcpu.h
new file mode 100644
index 0000000..7e8f9d1
--- /dev/null
+++ b/include/xen/pcpu.h
@@ -0,0 +1,32 @@
+#ifndef _XEN_PCPU_H
+#define _XEN_PCPU_H
+
+#include <xen/interface/platform.h>
+#include <linux/sysdev.h>
+
+extern int xen_pcpu_hotplug(int type, uint32_t apic_id);
+#define XEN_PCPU_ONLINE 0x01
+#define XEN_PCPU_OFFLINE 0x02
+#define XEN_PCPU_ADD 0x04
+#define XEN_PCPU_REMOVE 0x08
+
+struct pcpu {
+ struct list_head pcpu_list;
+ struct sys_device sysdev;
+ uint32_t xen_id;
+ uint32_t apic_id;
+ uint32_t acpi_id;
+ uint32_t flags;
+};
+
+static inline int xen_pcpu_online(uint32_t flags)
+{
+ return !!(flags & XEN_PCPU_FLAGS_ONLINE);
+}
+
+extern int register_xen_pcpu_notifier(struct notifier_block *nb);
+
+extern void unregister_xen_pcpu_notifier(struct notifier_block *nb);
+
+extern int xen_pcpu_index(uint32_t acpi_id, int is_acpiid);
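+
+/*
+ * Example (sketch; assumes the notifier chain passes one of the
+ * XEN_PCPU_* events as the action argument):
+ *
+ *     static int my_pcpu_cb(struct notifier_block *nb,
+ *                           unsigned long action, void *data)
+ *     {
+ *             if (action == XEN_PCPU_ONLINE)
+ *                     pr_info("a physical cpu came online\n");
+ *             return NOTIFY_OK;
+ *     }
+ *     static struct notifier_block my_pcpu_nb = {
+ *             .notifier_call = my_pcpu_cb,
+ *     };
+ *
+ *     register_xen_pcpu_notifier(&my_pcpu_nb);
+ */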
+#endif
diff --git a/include/xen/platform_pci.h b/include/xen/platform_pci.h
new file mode 100644
index 0000000..a785a3b
--- /dev/null
+++ b/include/xen/platform_pci.h
@@ -0,0 +1,53 @@
+#ifndef _XEN_PLATFORM_PCI_H
+#define _XEN_PLATFORM_PCI_H
+
+#define XEN_IOPORT_MAGIC_VAL 0x49d2
+#define XEN_IOPORT_LINUX_PRODNUM 0x0003
+#define XEN_IOPORT_LINUX_DRVVER 0x0001
+
+#define XEN_IOPORT_BASE 0x10
+
+#define XEN_IOPORT_PLATFLAGS (XEN_IOPORT_BASE + 0) /* 1 byte access (R/W) */
+#define XEN_IOPORT_MAGIC (XEN_IOPORT_BASE + 0) /* 2 byte access (R) */
+#define XEN_IOPORT_UNPLUG (XEN_IOPORT_BASE + 0) /* 2 byte access (W) */
+#define XEN_IOPORT_DRVVER (XEN_IOPORT_BASE + 0) /* 4 byte access (W) */
+
+#define XEN_IOPORT_SYSLOG (XEN_IOPORT_BASE + 2) /* 1 byte access (W) */
+#define XEN_IOPORT_PROTOVER (XEN_IOPORT_BASE + 2) /* 1 byte access (R) */
+#define XEN_IOPORT_PRODNUM (XEN_IOPORT_BASE + 2) /* 2 byte access (W) */
+
+#define XEN_UNPLUG_ALL_IDE_DISKS (1<<0)
+#define XEN_UNPLUG_ALL_NICS (1<<1)
+#define XEN_UNPLUG_AUX_IDE_DISKS (1<<2)
+#define XEN_UNPLUG_ALL (XEN_UNPLUG_ALL_IDE_DISKS|\
+ XEN_UNPLUG_ALL_NICS|\
+ XEN_UNPLUG_AUX_IDE_DISKS)
+
+#define XEN_UNPLUG_UNNECESSARY (1<<16)
+#define XEN_UNPLUG_NEVER (1<<17)
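+
+/*
+ * Sketch of the unplug handshake these flags drive (assumes the usual
+ * inw/outw/outl port accessors; see the platform-pci driver for the
+ * real sequence):
+ *
+ *     if (inw(XEN_IOPORT_MAGIC) == XEN_IOPORT_MAGIC_VAL) {
+ *             outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ *             outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ *             outw(XEN_UNPLUG_ALL, XEN_IOPORT_UNPLUG);
+ *     }
+ */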
+
+static inline int xen_must_unplug_nics(void) {
+#if (defined(CONFIG_XEN_NETDEV_FRONTEND) || \
+ defined(CONFIG_XEN_NETDEV_FRONTEND_MODULE)) && \
+ (defined(CONFIG_XEN_PLATFORM_PCI) || \
+ defined(CONFIG_XEN_PLATFORM_PCI_MODULE))
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+static inline int xen_must_unplug_disks(void) {
+#if (defined(CONFIG_XEN_BLKDEV_FRONTEND) || \
+ defined(CONFIG_XEN_BLKDEV_FRONTEND_MODULE)) && \
+ (defined(CONFIG_XEN_PLATFORM_PCI) || \
+ defined(CONFIG_XEN_PLATFORM_PCI_MODULE))
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+extern int xen_platform_pci_unplug;
+
+#endif /* _XEN_PLATFORM_PCI_H */
diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h
new file mode 100644
index 0000000..b42cdfd
--- /dev/null
+++ b/include/xen/privcmd.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * privcmd.h
+ *
+ * Interface to /proc/xen/privcmd.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_PRIVCMD_H__
+#define __LINUX_PUBLIC_PRIVCMD_H__
+
+#include <linux/types.h>
+
+typedef unsigned long xen_pfn_t;
+
+#ifndef __user
+#define __user
+#endif
+
+struct privcmd_hypercall {
+ __u64 op;
+ __u64 arg[5];
+};
+
+struct privcmd_mmap_entry {
+ __u64 va;
+ __u64 mfn;
+ __u64 npages;
+};
+
+struct privcmd_mmap {
+ int num;
+ domid_t dom; /* target domain */
+ struct privcmd_mmap_entry __user *entry;
+};
+
+struct privcmd_mmapbatch {
+ int num; /* number of pages to populate */
+ domid_t dom; /* target domain */
+ __u64 addr; /* virtual address */
+ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
+};
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
+ * @arg: &privcmd_hypercall_t
+ * Return: Value returned from execution of the specified hypercall.
+ */
+#define IOCTL_PRIVCMD_HYPERCALL \
+ _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall))
+#define IOCTL_PRIVCMD_MMAP \
+ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap))
+#define IOCTL_PRIVCMD_MMAPBATCH \
+ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch))
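+
+/*
+ * Example (sketch): issuing a hypercall from user space in a privileged
+ * domain; assumes <fcntl.h> and <sys/ioctl.h>. Leaving arg[] zeroed
+ * selects XENVER_version (0) for the version hypercall:
+ *
+ *     struct privcmd_hypercall call = { .op = __HYPERVISOR_xen_version };
+ *     int fd = open("/proc/xen/privcmd", O_RDWR);
+ *
+ *     if (fd >= 0)
+ *             version = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
+ */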
+
+#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 883a21b..4349e89 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -5,8 +5,9 @@
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
-void xen_pre_suspend(void);
-void xen_post_suspend(int suspend_cancelled);
+void xen_arch_pre_suspend(void);
+void xen_arch_post_suspend(int suspend_cancelled);
+void xen_arch_hvm_post_suspend(int suspend_cancelled);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
@@ -14,4 +15,16 @@
void xen_timer_resume(void);
void xen_arch_resume(void);
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long mfn, int nr,
+ pgprot_t prot, unsigned domid);
+
+extern unsigned long *xen_contiguous_bitmap;
+int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
+ unsigned int address_bits);
+
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
+int xen_setup_shutdown_event(void);
+
#endif /* INCLUDE_XEN_OPS_H */
diff --git a/include/xen/xen.h b/include/xen/xen.h
new file mode 100644
index 0000000..77604ed
--- /dev/null
+++ b/include/xen/xen.h
@@ -0,0 +1,34 @@
+#ifndef _XEN_XEN_H
+#define _XEN_XEN_H
+
+enum xen_domain_type {
+ XEN_NATIVE, /* running on bare hardware */
+ XEN_PV_DOMAIN, /* running in a PV domain */
+ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
+};
+
+#ifdef CONFIG_XEN
+extern enum xen_domain_type xen_domain_type;
+extern void xen_hvm_guest_init(void);
+#else
+#define xen_domain_type XEN_NATIVE
+#define xen_hvm_guest_init() do { } while (0)
+#endif
+
+#define xen_domain() (xen_domain_type != XEN_NATIVE)
+#define xen_pv_domain() (xen_domain() && \
+ xen_domain_type == XEN_PV_DOMAIN)
+#define xen_hvm_domain() (xen_domain() && \
+ xen_domain_type == XEN_HVM_DOMAIN)
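+
+/*
+ * Example (sketch; setup_pv_features() is a placeholder name):
+ *
+ *     if (xen_pv_domain())
+ *             setup_pv_features();
+ *     else if (xen_hvm_domain())
+ *             xen_hvm_guest_init();
+ */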
+
+#ifdef CONFIG_XEN_DOM0
+#include <xen/interface/xen.h>
+#include <asm/xen/hypervisor.h>
+
+#define xen_initial_domain() (xen_pv_domain() && \
+ xen_start_info->flags & SIF_INITDOMAIN)
+#else /* !CONFIG_XEN_DOM0 */
+#define xen_initial_domain() (0)
+#endif /* CONFIG_XEN_DOM0 */
+
+#endif /* _XEN_XEN_H */
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index b9763ba..23e7f25 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -91,9 +91,9 @@
void (*otherend_changed)(struct xenbus_device *dev,
enum xenbus_state backend_state);
int (*remove)(struct xenbus_device *dev);
- int (*suspend)(struct xenbus_device *dev, pm_message_t state);
+ int (*suspend)(struct xenbus_device *dev);
int (*resume)(struct xenbus_device *dev);
- int (*uevent)(struct xenbus_device *, char **, int, char *, int);
+ int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
struct device_driver driver;
int (*read_otherend_details)(struct xenbus_device *dev);
int (*is_ready)(struct xenbus_device *dev);
diff --git a/lib/Makefile b/lib/Makefile
index 5ecf2ba..644d408 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -77,7 +77,8 @@
obj-$(CONFIG_SMP) += percpu_counter.o
obj-$(CONFIG_AUDIT_GENERIC) += audit.o
-obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_SWIOTLB) += swiotlb-core.o swiotlb.o
+obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
diff --git a/lib/swiotlb-core.c b/lib/swiotlb-core.c
new file mode 100644
index 0000000..a17c89e
--- /dev/null
+++ b/lib/swiotlb-core.c
@@ -0,0 +1,572 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is a fallback for platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
+ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
+ * unnecessary i-cache flushing.
+ * 04/07/.. ak Better overflow handling. Assorted fixes.
+ * 05/09/10 linville Add support for syncing ranges, support syncing for
+ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+ * 08/12/11 beckyb Add highmem support
+ */
+
+#include <linux/cache.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/swiotlb.h>
+#include <linux/pfn.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+
+#include <linux/io.h>
+#include <asm/dma.h>
+#include <linux/scatterlist.h>
+
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/iommu-helper.h>
+
+#define OFFSET(val, align) ((unsigned long) ((val) & ((align) - 1)))
+
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+
+/*
+ * Minimum IO TLB size to bother booting with. Systems with mainly
+ * 64bit capable cards will only lightly use the swiotlb. If we can't
+ * allocate a contiguous 1MB, we're probably in trouble anyway.
+ */
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+
+int swiotlb_force;
+
+/*
+ * Used to do a quick range check in do_unmap_single and
+ * do_sync_single, to see if the memory was in fact allocated by this
+ * API.
+ */
+char *io_tlb_start, *io_tlb_end;
+
+/*
+ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
+ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
+ */
+unsigned long io_tlb_nslabs;
+
+/*
+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ */
+unsigned long io_tlb_overflow = 32*1024;
+
+void *io_tlb_overflow_buffer;
+
+/*
+ * This is a free list describing the number of free entries available from
+ * each index
+ */
+static unsigned int *io_tlb_list;
+static unsigned int io_tlb_index;
+
+/*
+ * We need to save away the original address corresponding to a mapped entry
+ * for the sync operations.
+ */
+static phys_addr_t *io_tlb_orig_addr;
+
+/*
+ * Protect the above data structures in the map and unmap calls
+ */
+static DEFINE_SPINLOCK(io_tlb_lock);
+
+static int late_alloc;
+
+static int __init
+setup_io_tlb_npages(char *str)
+{
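+ /*
+ * GCC nested helper: if str begins with "<token>=", return the
+ * numeric value that follows; *endp is left pointing at the value
+ * text, and the caller skips ahead to the next ',' via strpbrk().
+ */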
+ int get_value(const char *token, char *str, char **endp)
+ {
+ ssize_t len;
+ int val = 0;
+
+ len = strlen(token);
+ if (!strncmp(str, token, len)) {
+ str += len;
+ if (*str == '=')
+ ++str;
+ if (*str != '\0')
+ val = simple_strtoul(str, endp, 0);
+ }
+ *endp = str;
+ return val;
+ }
+
+ int val;
+
+ while (*str) {
+ /* The old syntax */
+ if (isdigit(*str)) {
+ io_tlb_nslabs = simple_strtoul(str, &str, 0);
+ /* avoid tail segment of size < IO_TLB_SEGSIZE */
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+ if (!strncmp(str, "force", 5))
+ swiotlb_force = 1;
+ /* The new syntax: swiotlb=nslabs=16384,overflow=32768,force */
+ val = get_value("nslabs", str, &str);
+ if (val)
+ io_tlb_nslabs = ALIGN(val, IO_TLB_SEGSIZE);
+
+ val = get_value("overflow", str, &str);
+ if (val)
+ io_tlb_overflow = val;
+ str = strpbrk(str, ",");
+ if (!str)
+ break;
+ str++; /* skip ',' */
+ }
+ return 1;
+}
+__setup("swiotlb=", setup_io_tlb_npages);
+
+void swiotlb_print_info(void)
+{
+ unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ phys_addr_t pstart, pend;
+
+ pstart = virt_to_phys(io_tlb_start);
+ pend = virt_to_phys(io_tlb_end);
+
+ printk(KERN_INFO "DMA: Placing %luMB software IO TLB between %p - %p\n",
+ bytes >> 20, io_tlb_start, io_tlb_end);
+ printk(KERN_INFO "DMA: software IO TLB at phys %#llx - %#llx\n",
+ (unsigned long long)pstart,
+ (unsigned long long)pend);
+}
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void __init
+swiotlb_init_early(size_t default_size, int verbose)
+{
+ unsigned long i, bytes;
+
+ if (!io_tlb_nslabs) {
+ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+ /*
+ * Get IO TLB memory from the low pages
+ */
+ io_tlb_start = alloc_bootmem_low_pages(bytes);
+ if (!io_tlb_start)
+ panic("DMA: Cannot allocate SWIOTLB buffer");
+ io_tlb_end = io_tlb_start + bytes;
+
+ /*
+ * Allocate and initialize the free list array. This array is used
+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+ * between io_tlb_start and io_tlb_end.
+ */
+ io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+ for (i = 0; i < io_tlb_nslabs; i++)
+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_index = 0;
+ io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
+
+ /*
+ * Get the overflow emergency buffer
+ */
+ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+ if (!io_tlb_overflow_buffer)
+ panic("DMA: Cannot allocate SWIOTLB overflow buffer!\n");
+ if (verbose)
+ swiotlb_print_info();
+}
+
+void __init
+swiotlb_init(int verbose)
+{
+ swiotlb_init_early(64 * (1<<20), verbose); /* default to 64MB */
+}
+
+/*
+ * Systems with larger DMA zones (those that don't support ISA) can
+ * initialize the swiotlb later using the slab allocator if needed.
+ * This should be just like above, but with some error catching.
+ */
+int
+swiotlb_init_late(size_t default_size)
+{
+ unsigned long i, bytes, req_nslabs = io_tlb_nslabs;
+ unsigned int order;
+
+ if (!io_tlb_nslabs) {
+ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+
+ /*
+ * Get IO TLB memory from the low pages
+ */
+ order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
+ io_tlb_nslabs = SLABS_PER_PAGE << order;
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+ io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ order);
+ if (io_tlb_start)
+ break;
+ order--;
+ }
+
+ if (!io_tlb_start)
+ goto cleanup1;
+
+ if (order != get_order(bytes)) {
+ printk(KERN_WARNING "DMA: Warning: only able to allocate %ld MB"
+ " for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+ io_tlb_nslabs = SLABS_PER_PAGE << order;
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ }
+ io_tlb_end = io_tlb_start + bytes;
+ memset(io_tlb_start, 0, bytes);
+
+ /*
+ * Allocate and initialize the free list array. This array is used
+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+ * between io_tlb_start and io_tlb_end.
+ */
+ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs * sizeof(int)));
+ if (!io_tlb_list)
+ goto cleanup2;
+
+ for (i = 0; i < io_tlb_nslabs; i++)
+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_index = 0;
+
+ io_tlb_orig_addr = (phys_addr_t *) __get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+ if (!io_tlb_orig_addr)
+ goto cleanup3;
+
+ memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
+
+ /*
+ * Get the overflow emergency buffer
+ */
+ io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+ get_order(io_tlb_overflow));
+ if (!io_tlb_overflow_buffer)
+ goto cleanup4;
+
+ swiotlb_print_info();
+
+ late_alloc = 1;
+
+ return 0;
+
+cleanup4:
+ free_pages((unsigned long)io_tlb_orig_addr,
+ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+ io_tlb_orig_addr = NULL;
+cleanup3:
+ free_pages((unsigned long)io_tlb_list,
+ get_order(io_tlb_nslabs * sizeof(int)));
+ io_tlb_list = NULL;
+cleanup2:
+ io_tlb_end = NULL;
+ free_pages((unsigned long)io_tlb_start, order);
+ io_tlb_start = NULL;
+cleanup1:
+ io_tlb_nslabs = req_nslabs;
+ return -ENOMEM;
+}
+
+void __init swiotlb_free(void)
+{
+ if (!io_tlb_overflow_buffer)
+ return;
+
+ if (late_alloc) {
+ free_pages((unsigned long)io_tlb_overflow_buffer,
+ get_order(io_tlb_overflow));
+ free_pages((unsigned long)io_tlb_orig_addr,
+ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+ sizeof(int)));
+ free_pages((unsigned long)io_tlb_start,
+ get_order(io_tlb_nslabs << IO_TLB_SHIFT));
+ } else {
+ free_bootmem_late(__pa(io_tlb_overflow_buffer),
+ io_tlb_overflow);
+ free_bootmem_late(__pa(io_tlb_orig_addr),
+ io_tlb_nslabs * sizeof(phys_addr_t));
+ free_bootmem_late(__pa(io_tlb_list),
+ io_tlb_nslabs * sizeof(int));
+ free_bootmem_late(__pa(io_tlb_start),
+ io_tlb_nslabs << IO_TLB_SHIFT);
+ }
+}
+
+int is_swiotlb_buffer(phys_addr_t paddr)
+{
+ return paddr >= virt_to_phys(io_tlb_start) &&
+ paddr < virt_to_phys(io_tlb_end);
+}
+
+/*
+ * Bounce: copy the swiotlb buffer back to the original dma location
+ */
+void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+ enum dma_data_direction dir)
+{
+ unsigned long pfn = PFN_DOWN(phys);
+
+ if (PageHighMem(pfn_to_page(pfn))) {
+ /* The buffer does not have a mapping. Map it in and copy */
+ unsigned int offset = phys & ~PAGE_MASK;
+ char *buffer;
+ unsigned int sz = 0;
+ unsigned long flags;
+
+ while (size) {
+ sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+ local_irq_save(flags);
+ buffer = kmap_atomic(pfn_to_page(pfn),
+ KM_BOUNCE_READ);
+ if (dir == DMA_TO_DEVICE)
+ memcpy(dma_addr, buffer + offset, sz);
+ else
+ memcpy(buffer + offset, dma_addr, sz);
+ kunmap_atomic(buffer, KM_BOUNCE_READ);
+ local_irq_restore(flags);
+
+ size -= sz;
+ pfn++;
+ dma_addr += sz;
+ offset = 0;
+ }
+ } else {
+ if (dir == DMA_TO_DEVICE)
+ memcpy(dma_addr, phys_to_virt(phys), size);
+ else
+ memcpy(phys_to_virt(phys), dma_addr, size);
+ }
+}
+
+/*
+ * Allocates bounce buffer and returns its kernel virtual address.
+ */
+void *
+do_map_single(struct device *hwdev, phys_addr_t phys,
+ unsigned long start_dma_addr, size_t size, int dir)
+{
+ unsigned long flags;
+ char *dma_addr;
+ unsigned int nslots, stride, index, wrap;
+ int i;
+ unsigned long mask;
+ unsigned long offset_slots;
+ unsigned long max_slots;
+
+ mask = dma_get_seg_boundary(hwdev);
+ start_dma_addr = start_dma_addr & mask;
+ offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+
+ /*
+ * Carefully handle integer overflow which can occur when mask == ~0UL.
+ */
+ max_slots = mask + 1
+ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+
+ /*
+ * For mappings greater than a page, we limit the stride (and
+ * hence alignment) to a page size.
+ */
+ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ if (size > PAGE_SIZE)
+ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+ else
+ stride = 1;
+
+ BUG_ON(!nslots);
+
+ /*
+ * Find a suitable number of IO TLB entries that will fit this
+ * request and allocate a buffer from that IO TLB pool.
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ index = ALIGN(io_tlb_index, stride);
+ if (index >= io_tlb_nslabs)
+ index = 0;
+ wrap = index;
+
+ do {
+ while (iommu_is_span_boundary(index, nslots, offset_slots,
+ max_slots)) {
+ index += stride;
+ if (index >= io_tlb_nslabs)
+ index = 0;
+ if (index == wrap)
+ goto not_found;
+ }
+
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot
+ * and mark the entries as '0' indicating unavailable.
+ */
+ if (io_tlb_list[index] >= nslots) {
+ int count = 0;
+
+ for (i = index; i < (int) (index + nslots); i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE)
+ != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+
+ /*
+ * Update the indices to avoid searching in the next
+ * round.
+ */
+ io_tlb_index = ((index + nslots) < io_tlb_nslabs
+ ? (index + nslots) : 0);
+
+ goto found;
+ }
+ index += stride;
+ if (index >= io_tlb_nslabs)
+ index = 0;
+ } while (index != wrap);
+
+not_found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return NULL;
+found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+ /*
+ * Save away the mapping from the original address to the DMA address.
+ * This is needed when we sync the memory. Then we sync the buffer if
+ * needed.
+ */
+ for (i = 0; i < nslots; i++)
+ io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+
+ return dma_addr;
+}
+
+/*
+ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ */
+void
+do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+{
+ unsigned long flags;
+ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t phys = io_tlb_orig_addr[index];
+
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+
+ /*
+ * Return the buffer to the free list by setting the corresponding
+ * entries to indicate the number of contiguous entries available.
+ * While returning the entries to the free list, we merge the entries
+ * with slots below and above the pool being returned.
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ {
+ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+ io_tlb_list[index + nslots] : 0);
+ /*
+ * Step 1: return the slots to the free list, merging the
+ * slots with superseding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--)
+ io_tlb_list[i] = ++count;
+ /*
+ * Step 2: merge the returned slots with the preceding slots,
+ * if available (non-zero)
+ */
+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) !=
+ IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+ }
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+}
+
+void
+do_sync_single(struct device *hwdev, char *dma_addr, size_t size,
+ int dir, int target)
+{
+ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t phys = io_tlb_orig_addr[index];
+
+ phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+
+ switch (target) {
+ case SYNC_FOR_CPU:
+ if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+ else
+ BUG_ON(dir != DMA_TO_DEVICE);
+ break;
+ case SYNC_FOR_DEVICE:
+ if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+ else
+ BUG_ON(dir != DMA_FROM_DEVICE);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void
+swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+{
+ /*
+ * Ran out of IOMMU space for this operation. This is very bad.
+ * Unfortunately the drivers cannot handle this operation properly
+ * unless they check for dma_mapping_error (most don't).
+ * When the mapping is small enough return a static buffer to limit
+ * the damage, or panic when the transfer is too big.
+ */
+ dev_err(dev, "DMA: Out of SW-IOMMU space for %zu bytes.\n", size);
+
+ if (size <= io_tlb_overflow || !do_panic)
+ return;
+
+ if (dir == DMA_BIDIRECTIONAL)
+ panic("DMA: Random memory could be DMA accessed\n");
+ if (dir == DMA_FROM_DEVICE)
+ panic("DMA: Random memory could be DMA written\n");
+ if (dir == DMA_TO_DEVICE)
+ panic("DMA: Random memory could be DMA read\n");
+}
diff --git a/lib/swiotlb-xen.c b/lib/swiotlb-xen.c
new file mode 100644
index 0000000..bee577f
--- /dev/null
+++ b/lib/swiotlb-xen.c
@@ -0,0 +1,504 @@
+/* A software-based IOMMU that utilizes the swiotlb-core functionality.
+ * It can function on Xen when there are PCI devices present. */
+
+#include <linux/dma-mapping.h>
+#include <linux/io.h>
+#include <asm/dma.h>
+#include <linux/scatterlist.h>
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+
+#include <asm/xen/page.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+{
+ return phys_to_machine(XPADDR(paddr)).maddr;
+}
+
+static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+{
+ return machine_to_phys(XMADDR(baddr)).paddr;
+}
+
+static dma_addr_t xen_virt_to_bus(void *address)
+{
+ return xen_phys_to_bus(virt_to_phys(address));
+}
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
+
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
+ }
+ return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+ unsigned long pfn = PFN_DOWN(p);
+ unsigned int offset = p & ~PAGE_MASK;
+
+ if (offset + size <= PAGE_SIZE)
+ return 0;
+ if (check_pages_physically_contiguous(pfn, offset, size))
+ return 0;
+ return 1;
+}
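+
+/*
+ * Example: a 6k buffer starting 2k into a page covers two pfns; it only
+ * "straddles" for DMA purposes when the mfns returned by pfn_to_mfn()
+ * for those pfns are not consecutive, which the helper above detects.
+ */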
+
+bool xen_dma_capable(struct device *dev, dma_addr_t dev_addr,
+ phys_addr_t phys, size_t size)
+{
+ int rc = 0;
+
+ rc = dma_capable(dev, dev_addr, size) &&
+ !range_straddles_page_boundary(phys, size);
+ return rc;
+}
+
+static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
+{
+ unsigned long mfn = PFN_DOWN(dma_addr);
+ unsigned long pfn = mfn_to_local_pfn(mfn);
+
+ /* If the address is outside our domain, it CAN have the same virtual
+ * address as another address in our domain. Hence only check addresses
+ * within our domain. */
+ if (pfn_valid(pfn))
+ return is_swiotlb_buffer(PFN_PHYS(pfn));
+
+ return 0;
+}
+
+void *
+xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+{
+ void *ret;
+ int order = get_order(size);
+ u64 dma_mask = DMA_BIT_MASK(32);
+ unsigned long vstart;
+
+ /*
+ * Ignore region specifiers - the kernel's idea of
+ * pseudo-phys memory layout has nothing to do with the
+ * machine physical layout. We can't allocate highmem
+ * because we can't return a pointer to it.
+ */
+ flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
+ return ret;
+
+ vstart = __get_free_pages(flags, order);
+ ret = (void *)vstart;
+
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = dma_alloc_coherent_mask(hwdev, flags);
+
+ if (ret) {
+ if (xen_create_contiguous_region(vstart, order,
+ fls64(dma_mask)) != 0) {
+ free_pages(vstart, order);
+ return NULL;
+ }
+ memset(ret, 0, size);
+ *dma_handle = virt_to_machine(ret).maddr;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(xen_swiotlb_alloc_coherent);
+
+void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+ dma_addr_t dev_addr)
+{
+ int order = get_order(size);
+
+ if (dma_release_from_coherent(hwdev, order, vaddr))
+ return;
+
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL(xen_swiotlb_free_coherent);
+
+static int max_dma_bits = 32;
+
+static int
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+{
+ int i, rc;
+ int dma_bits;
+
+ printk(KERN_INFO "xen_swiotlb_fixup: buf=%p size=%zu\n",
+ buf, size);
+
+ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+
+ i = 0;
+ do {
+ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
+
+ do {
+ rc = xen_create_contiguous_region(
+ (unsigned long)buf + (i << IO_TLB_SHIFT),
+ get_order(slabs << IO_TLB_SHIFT),
+ dma_bits);
+ } while (rc && dma_bits++ < max_dma_bits);
+ if (rc)
+ return rc;
+
+ i += slabs;
+ } while (i < nslabs);
+ return 0;
+}
+
+void __init xen_swiotlb_init(int verbose)
+{
+ int rc = 0;
+
+ swiotlb_init_early(64 * (1<<20), verbose);
+
+ if ((rc = xen_swiotlb_fixup(io_tlb_start,
+ io_tlb_nslabs << IO_TLB_SHIFT,
+ io_tlb_nslabs)))
+ goto error;
+
+ if ((rc = xen_swiotlb_fixup(io_tlb_overflow_buffer,
+ io_tlb_overflow,
+ io_tlb_overflow >> IO_TLB_SHIFT)))
+ goto error;
+
+ return;
+error:
+ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
+ "We either don't have the permission or you do not have enough"\
+ "free memory under 4GB!\n", rc);
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ unsigned long start_dma_addr;
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = xen_phys_to_bus(phys);
+ void *map;
+
+ BUG_ON(dir == DMA_NONE);
+ /*
+ * If the address happens to be in the device's DMA window,
+ * we can safely return the device addr and not worry about bounce
+ * buffering it.
+ */
+ if (dma_capable(dev, dev_addr, size) &&
+ !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+ return dev_addr;
+
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+ start_dma_addr = xen_virt_to_bus(io_tlb_start);
+ map = do_map_single(dev, phys, start_dma_addr, size, dir);
+ if (!map) {
+ swiotlb_full(dev, size, dir, 1);
+ map = io_tlb_overflow_buffer;
+ }
+
+ dev_addr = xen_virt_to_bus(map);
+
+ /*
+ * Ensure that the address returned is DMA'ble
+ */
+ if (!dma_capable(dev, dev_addr, size))
+ panic("DMA: xen_swiotlb_map_single: bounce buffer is not " \
+ "DMA'ble\n");
+ return dev_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
+
+/*
+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
+ * match what was provided for in a previous xen_swiotlb_map_page call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ /* NOTE: We use dev_addr here, not paddr! */
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ do_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ /*
+ * phys_to_virt doesn't work with highmem pages but we could
+ * call dma_mark_clean() with a highmem page here. However, we
+ * are fine since dma_mark_clean() is null on POWERPC. We can
+ * make dma_mark_clean() take a physical address if necessary.
+ */
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ unmap_single(hwdev, dev_addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so. At the next point you give the dma
+ * address back to the card, you must first perform a
+ * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+static void
+xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir, int target)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL(xen_swiotlb_sync_single_for_cpu);
+
+void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL(xen_swiotlb_sync_single_for_device);
+
+/*
+ * Same as above, but for a sub-range of the mapping.
+ */
+static void
+xen_swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size,
+ int dir, int target)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
+}
+
+void
+xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+ SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_cpu);
+
+void
+xen_swiotlb_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+ SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_device);
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above xen_swiotlb_map_page
+ * interface. Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length. They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
+ * same here.
+ */
+int
+xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ unsigned long start_dma_addr;
+ struct scatterlist *sg;
+ int i;
+ BUG_ON(dir == DMA_NONE);
+
+ start_dma_addr = xen_virt_to_bus(io_tlb_start);
+ for_each_sg(sgl, sg, nelems, i) {
+ phys_addr_t paddr = sg_phys(sg);
+ dma_addr_t dev_addr = xen_phys_to_bus(paddr);
+
+ if (swiotlb_force ||
+ !dma_capable(hwdev, dev_addr, sg->length) ||
+ range_straddles_page_boundary(paddr, sg->length)) {
+ void *map = do_map_single(hwdev, sg_phys(sg),
+ start_dma_addr,
+ sg->length, dir);
+ if (!map) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+ swiotlb_full(hwdev, sg->length, dir, 0);
+ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+ attrs);
+ sgl[0].dma_length = 0;
+ return 0;
+ }
+ sg->dma_address = xen_virt_to_bus(map);
+ } else
+ sg->dma_address = dev_addr;
+ sg->dma_length = sg->length;
+ }
+ return nelems;
+}
+EXPORT_SYMBOL(xen_swiotlb_map_sg_attrs);
+
+int
+xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir)
+{
+ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL(xen_swiotlb_map_sg);
+
+/*
+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
+ * concerning calls here are the same as for xen_swiotlb_unmap_page() above.
+ */
+void
+xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i)
+ unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
+}
+EXPORT_SYMBOL(xen_swiotlb_unmap_sg_attrs);
+
+void
+xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir)
+{
+ return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL(xen_swiotlb_unmap_sg);
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as xen_swiotlb_sync_single_* but for a scatter-gather list,
+ * same rules and usage.
+ */
+static void
+xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, int dir, int target)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nelems, i)
+ xen_swiotlb_sync_single(hwdev, sg->dma_address,
+ sg->dma_length, dir, target);
+}
+
+void
+xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_cpu);
+
+void
+xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_device);
+
+int
+xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+ return (dma_addr == xen_virt_to_bus(io_tlb_overflow_buffer));
+}
+EXPORT_SYMBOL(xen_swiotlb_dma_mapping_error);
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly. For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
+{
+ return xen_virt_to_bus(io_tlb_end - 1) <= mask;
+}
+EXPORT_SYMBOL(xen_swiotlb_dma_supported);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index ac25cd2..9ed5e19 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -1,118 +1,11 @@
-/*
- * Dynamic DMA mapping support.
- *
- * This implementation is a fallback for platforms that do not support
- * I/O TLBs (aka DMA address translation hardware).
- * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
- * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
- * Copyright (C) 2000, 2003 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
- * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
- * unnecessary i-cache flushing.
- * 04/07/.. ak Better overflow handling. Assorted fixes.
- * 05/09/10 linville Add support for syncing ranges, support syncing for
- * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
- * 08/12/11 beckyb Add highmem support
- */
-#include <linux/cache.h>
#include <linux/dma-mapping.h>
-#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
-#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <asm/io.h>
-#include <asm/dma.h>
#include <asm/scatterlist.h>
-
-#include <linux/init.h>
-#include <linux/bootmem.h>
#include <linux/iommu-helper.h>
-#define OFFSET(val,align) ((unsigned long) \
- ( (val) & ( (align) - 1)))
-
-#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
-
-/*
- * Minimum IO TLB size to bother booting with. Systems with mainly
- * 64bit capable cards will only lightly use the swiotlb. If we can't
- * allocate a contiguous 1MB, we're probably in trouble anyway.
- */
-#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
-
-/*
- * Enumeration for sync targets
- */
-enum dma_sync_target {
- SYNC_FOR_CPU = 0,
- SYNC_FOR_DEVICE = 1,
-};
-
-int swiotlb_force;
-
-/*
- * Used to do a quick range check in unmap_single and
- * sync_single_*, to see if the memory was in fact allocated by this
- * API.
- */
-static char *io_tlb_start, *io_tlb_end;
-
-/*
- * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and
- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
- */
-static unsigned long io_tlb_nslabs;
-
-/*
- * When the IOMMU overflows we return a fallback buffer. This sets the size.
- */
-static unsigned long io_tlb_overflow = 32*1024;
-
-void *io_tlb_overflow_buffer;
-
-/*
- * This is a free list describing the number of free entries available from
- * each index
- */
-static unsigned int *io_tlb_list;
-static unsigned int io_tlb_index;
-
-/*
- * We need to save away the original address corresponding to a mapped entry
- * for the sync operations.
- */
-static phys_addr_t *io_tlb_orig_addr;
-
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
-
-static int __init
-setup_io_tlb_npages(char *str)
-{
- if (isdigit(*str)) {
- io_tlb_nslabs = simple_strtoul(str, &str, 0);
- /* avoid tail segment of size < IO_TLB_SEGSIZE */
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
- }
- if (*str == ',')
- ++str;
- if (!strcmp(str, "force"))
- swiotlb_force = 1;
- return 1;
-}
-__setup("swiotlb=", setup_io_tlb_npages);
-/* make io_tlb_overflow tunable too? */
/* Note that this doesn't work with highmem page */
static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
@@ -120,390 +13,6 @@
{
return phys_to_dma(hwdev, virt_to_phys(address));
}
-
-static void swiotlb_print_info(unsigned long bytes)
-{
- phys_addr_t pstart, pend;
-
- pstart = virt_to_phys(io_tlb_start);
- pend = virt_to_phys(io_tlb_end);
-
- printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
- bytes >> 20, io_tlb_start, io_tlb_end);
- printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
- (unsigned long long)pstart,
- (unsigned long long)pend);
-}
-
-/*
- * Statically reserve bounce buffer space and initialize bounce buffer data
- * structures for the software IO TLB used to implement the DMA API.
- */
-void __init
-swiotlb_init_with_default_size(size_t default_size)
-{
- unsigned long i, bytes;
-
- if (!io_tlb_nslabs) {
- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
- }
-
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
-
- /*
- * Get IO TLB memory from the low pages
- */
- io_tlb_start = alloc_bootmem_low_pages(bytes);
- if (!io_tlb_start)
- panic("Cannot allocate SWIOTLB buffer");
- io_tlb_end = io_tlb_start + bytes;
-
- /*
- * Allocate and initialize the free list array. This array is used
- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
- * between io_tlb_start and io_tlb_end.
- */
- io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
- for (i = 0; i < io_tlb_nslabs; i++)
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
- io_tlb_index = 0;
- io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
-
- /*
- * Get the overflow emergency buffer
- */
- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
- if (!io_tlb_overflow_buffer)
- panic("Cannot allocate SWIOTLB overflow buffer!\n");
-
- swiotlb_print_info(bytes);
-}
-
-void __init
-swiotlb_init(void)
-{
- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
-}
-
-/*
- * Systems with larger DMA zones (those that don't support ISA) can
- * initialize the swiotlb later using the slab allocator if needed.
- * This should be just like above, but with some error catching.
- */
-int
-swiotlb_late_init_with_default_size(size_t default_size)
-{
- unsigned long i, bytes, req_nslabs = io_tlb_nslabs;
- unsigned int order;
-
- if (!io_tlb_nslabs) {
- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
- }
-
- /*
- * Get IO TLB memory from the low pages
- */
- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
- io_tlb_nslabs = SLABS_PER_PAGE << order;
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
-
- while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
- io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
- order);
- if (io_tlb_start)
- break;
- order--;
- }
-
- if (!io_tlb_start)
- goto cleanup1;
-
- if (order != get_order(bytes)) {
- printk(KERN_WARNING "Warning: only able to allocate %ld MB "
- "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
- io_tlb_nslabs = SLABS_PER_PAGE << order;
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
- }
- io_tlb_end = io_tlb_start + bytes;
- memset(io_tlb_start, 0, bytes);
-
- /*
- * Allocate and initialize the free list array. This array is used
- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
- * between io_tlb_start and io_tlb_end.
- */
- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs * sizeof(int)));
- if (!io_tlb_list)
- goto cleanup2;
-
- for (i = 0; i < io_tlb_nslabs; i++)
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
- io_tlb_index = 0;
-
- io_tlb_orig_addr = (phys_addr_t *)
- __get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs *
- sizeof(phys_addr_t)));
- if (!io_tlb_orig_addr)
- goto cleanup3;
-
- memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
-
- /*
- * Get the overflow emergency buffer
- */
- io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
- get_order(io_tlb_overflow));
- if (!io_tlb_overflow_buffer)
- goto cleanup4;
-
- swiotlb_print_info(bytes);
-
- return 0;
-
-cleanup4:
- free_pages((unsigned long)io_tlb_orig_addr,
- get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
- io_tlb_orig_addr = NULL;
-cleanup3:
- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
- sizeof(int)));
- io_tlb_list = NULL;
-cleanup2:
- io_tlb_end = NULL;
- free_pages((unsigned long)io_tlb_start, order);
- io_tlb_start = NULL;
-cleanup1:
- io_tlb_nslabs = req_nslabs;
- return -ENOMEM;
-}
-
-static int is_swiotlb_buffer(phys_addr_t paddr)
-{
- return paddr >= virt_to_phys(io_tlb_start) &&
- paddr < virt_to_phys(io_tlb_end);
-}
-
-/*
- * Bounce: copy the swiotlb buffer back to the original dma location
- */
-static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
- enum dma_data_direction dir)
-{
- unsigned long pfn = PFN_DOWN(phys);
-
- if (PageHighMem(pfn_to_page(pfn))) {
- /* The buffer does not have a mapping. Map it in and copy */
- unsigned int offset = phys & ~PAGE_MASK;
- char *buffer;
- unsigned int sz = 0;
- unsigned long flags;
-
- while (size) {
- sz = min_t(size_t, PAGE_SIZE - offset, size);
-
- local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn),
- KM_BOUNCE_READ);
- if (dir == DMA_TO_DEVICE)
- memcpy(dma_addr, buffer + offset, sz);
- else
- memcpy(buffer + offset, dma_addr, sz);
- kunmap_atomic(buffer, KM_BOUNCE_READ);
- local_irq_restore(flags);
-
- size -= sz;
- pfn++;
- dma_addr += sz;
- offset = 0;
- }
- } else {
- if (dir == DMA_TO_DEVICE)
- memcpy(dma_addr, phys_to_virt(phys), size);
- else
- memcpy(phys_to_virt(phys), dma_addr, size);
- }
-}
-
-/*
- * Allocates bounce buffer and returns its kernel virtual address.
- */
-static void *
-map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
-{
- unsigned long flags;
- char *dma_addr;
- unsigned int nslots, stride, index, wrap;
- int i;
- unsigned long start_dma_addr;
- unsigned long mask;
- unsigned long offset_slots;
- unsigned long max_slots;
-
- mask = dma_get_seg_boundary(hwdev);
- start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask;
-
- offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
-
- /*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
- max_slots = mask + 1
- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
-
- /*
- * For mappings greater than a page, we limit the stride (and
- * hence alignment) to a page size.
- */
- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (size > PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
- BUG_ON(!nslots);
-
- /*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
- index = ALIGN(io_tlb_index, stride);
- if (index >= io_tlb_nslabs)
- index = 0;
- wrap = index;
-
- do {
- while (iommu_is_span_boundary(index, nslots, offset_slots,
- max_slots)) {
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- if (index == wrap)
- goto not_found;
- }
-
- /*
- * If we find a slot that indicates we have 'nslots' number of
- * contiguous buffers, we allocate the buffers from that slot
- * and mark the entries as '0' indicating unavailable.
- */
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in the next
- * round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
-
- goto found;
- }
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- } while (index != wrap);
-
-not_found:
- spin_unlock_irqrestore(&io_tlb_lock, flags);
- return NULL;
-found:
- spin_unlock_irqrestore(&io_tlb_lock, flags);
-
- /*
- * Save away the mapping from the original address to the DMA address.
- * This is needed when we sync the memory. Then we sync the buffer if
- * needed.
- */
- for (i = 0; i < nslots; i++)
- io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
-
- return dma_addr;
-}
-
-/*
- * dma_addr is the kernel virtual address of the bounce buffer to unmap.
- */
-static void
-do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
-{
- unsigned long flags;
- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t phys = io_tlb_orig_addr[index];
-
- /*
- * First, sync the memory before unmapping the entry
- */
- if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
-
- /*
- * Return the buffer to the free list by setting the corresponding
- * entries to indicate the number of contigous entries available.
- * While returning the entries to the free list, we merge the entries
- * with slots below and above the pool being returned.
- */
- spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--)
- io_tlb_list[i] = ++count;
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- }
- spin_unlock_irqrestore(&io_tlb_lock, flags);
-}
-
-static void
-sync_single(struct device *hwdev, char *dma_addr, size_t size,
- int dir, int target)
-{
- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t phys = io_tlb_orig_addr[index];
-
- phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
-
- switch (target) {
- case SYNC_FOR_CPU:
- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
- else
- BUG_ON(dir != DMA_TO_DEVICE);
- break;
- case SYNC_FOR_DEVICE:
- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
- else
- BUG_ON(dir != DMA_FROM_DEVICE);
- break;
- default:
- BUG();
- }
-}
-
void *
swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags)
@@ -512,12 +21,13 @@
void *ret;
int order = get_order(size);
u64 dma_mask = DMA_BIT_MASK(32);
+ unsigned long start_dma_addr;
if (hwdev && hwdev->coherent_dma_mask)
dma_mask = hwdev->coherent_dma_mask;
ret = (void *)__get_free_pages(flags, order);
- if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) {
+ if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) {
/*
* The allocated memory isn't reachable by the device.
*/
@@ -527,10 +37,12 @@
if (!ret) {
/*
* We are either out of memory or the device can't DMA
- * to GFP_DMA memory; fall back on map_single(), which
+ * to GFP_DMA memory; fall back on do_map_single(), which
* will grab memory from the lowest available address range.
*/
- ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
+ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
+ ret = do_map_single(hwdev, 0, start_dma_addr, size,
+ DMA_FROM_DEVICE);
if (!ret)
return NULL;
}
@@ -539,12 +51,13 @@
dev_addr = swiotlb_virt_to_bus(hwdev, ret);
/* Confirm address can be DMA'd by device */
- if (dev_addr + size > dma_mask) {
- printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
+ if (dev_addr + size - 1 > dma_mask) {
+ dev_err(hwdev, "DMA: hwdev DMA mask = 0x%016Lx, " \
+ "dev_addr = 0x%016Lx\n",
(unsigned long long)dma_mask,
(unsigned long long)dev_addr);
- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */
do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
return NULL;
}
@@ -563,35 +76,11 @@
if (!is_swiotlb_buffer(paddr))
free_pages((unsigned long)vaddr, get_order(size));
else
- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */
do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
}
EXPORT_SYMBOL(swiotlb_free_coherent);
-static void
-swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
-{
- /*
- * Ran out of IOMMU space for this operation. This is very bad.
- * Unfortunately the drivers cannot handle this operation properly.
- * unless they check for dma_mapping_error (most don't)
- * When the mapping is small enough return a static buffer to limit
- * the damage, or panic when the transfer is too big.
- */
- printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
- "device %s\n", size, dev ? dev_name(dev) : "?");
-
- if (size <= io_tlb_overflow || !do_panic)
- return;
-
- if (dir == DMA_BIDIRECTIONAL)
- panic("DMA: Random memory could be DMA accessed\n");
- if (dir == DMA_FROM_DEVICE)
- panic("DMA: Random memory could be DMA written\n");
- if (dir == DMA_TO_DEVICE)
- panic("DMA: Random memory could be DMA read\n");
-}
-
/*
* Map a single buffer of the indicated size for DMA in streaming mode. The
* physical address to use is returned.
@@ -604,6 +93,7 @@
enum dma_data_direction dir,
struct dma_attrs *attrs)
{
+ unsigned long start_dma_addr;
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dev_addr = phys_to_dma(dev, phys);
void *map;
@@ -620,7 +110,8 @@
/*
* Oh well, have to allocate and map a bounce buffer.
*/
- map = map_single(dev, phys, size, dir);
+ start_dma_addr = swiotlb_virt_to_bus(dev, io_tlb_start);
+ map = do_map_single(dev, phys, start_dma_addr, size, dir);
if (!map) {
swiotlb_full(dev, size, dir, 1);
map = io_tlb_overflow_buffer;
@@ -697,7 +188,7 @@
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(paddr)) {
- sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
+ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
return;
}
@@ -774,19 +265,22 @@
swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
enum dma_data_direction dir, struct dma_attrs *attrs)
{
+ unsigned long start_dma_addr;
struct scatterlist *sg;
int i;
BUG_ON(dir == DMA_NONE);
+ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
for_each_sg(sgl, sg, nelems, i) {
phys_addr_t paddr = sg_phys(sg);
dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
if (swiotlb_force ||
!dma_capable(hwdev, dev_addr, sg->length)) {
- void *map = map_single(hwdev, sg_phys(sg),
- sg->length, dir);
+ void *map = do_map_single(hwdev, sg_phys(sg),
+ start_dma_addr,
+ sg->length, dir);
if (!map) {
/* Don't panic here, we expect map_sg users
to do proper error handling. */
@@ -819,7 +313,8 @@
*/
void
swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
- int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
struct scatterlist *sg;
int i;
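
Several hunks above switch reachability checks from addr + size to addr + size - 1. A minimal standalone sketch (userspace model, not part of the patch; the dma_reachable helper is hypothetical) of why comparing the last addressed byte is the correct test:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* A buffer is reachable iff its *last* byte is under the DMA mask.
 * Comparing addr + size rejects a buffer that ends exactly at the
 * mask boundary, and can even wrap to 0 when addr + size == 2^64. */
static bool dma_reachable(uint64_t addr, uint64_t size, uint64_t mask)
{
	return addr + size - 1 <= mask;
}

int main(void)
{
	/* 4 GiB buffer at 0 against a 32-bit mask: the old check failed. */
	assert(dma_reachable(0, 1ULL << 32, 0xffffffffULL));
	/* One byte past the mask is still rejected. */
	assert(!dma_reachable(0xffffffffULL, 2, 0xffffffffULL));
	return 0;
}
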
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2..d1dc23c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
+/**
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+ unsigned long cursor, end;
+
+ kmemleak_free_part(__va(addr), size);
+
+ cursor = PFN_UP(addr);
+ end = PFN_DOWN(addr + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
+
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
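
As a worked model of the PFN_UP/PFN_DOWN clamping in free_bootmem_late() above (standalone sketch, not kernel code; a PAGE_SHIFT of 12 is assumed), only pages lying entirely inside [addr, addr + size) reach the page allocator:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x1800, size = 0x2800;

	/* cursor rounds up to the first whole page, end rounds down past
	 * the last whole page: here pfns 2 and 3 are freed, the partial
	 * pages at either end of the range are skipped. */
	for (unsigned long pfn = PFN_UP(addr); pfn < PFN_DOWN(addr + size); pfn++)
		printf("freeing pfn %lu\n", pfn);
	return 0;
}
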
diff --git a/mm/memory.c b/mm/memory.c
index 6c836d3..faa40d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -553,6 +553,13 @@
if (is_zero_pfn(pfn))
return NULL;
check_pfn:
+
+#if defined(CONFIG_XEN) && defined(CONFIG_X86)
+ /* XEN: Covers user-space grant mappings (even of local pages). */
+ if (unlikely(vma->vm_flags & VM_FOREIGN))
+ return NULL;
+#endif
+
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
@@ -839,8 +846,12 @@
page->index > details->last_index))
continue;
}
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
+ if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
+ ptent = vma->vm_ops->zap_pte(vma, addr, pte,
+ tlb->fullmm);
+ else
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
@@ -1100,6 +1111,7 @@
tlb_finish_mmu(tlb, address, end);
return end;
}
+EXPORT_SYMBOL_GPL(zap_page_range);
/**
* zap_vma_ptes - remove ptes mapping the vma
@@ -1306,6 +1318,29 @@
continue;
}
+#ifdef CONFIG_XEN
+ if (vma && (vma->vm_flags & VM_FOREIGN)) {
+ struct vm_foreign_map *foreign_map =
+ vma->vm_private_data;
+ struct page **map = foreign_map->map;
+ int offset = (start - vma->vm_start) >> PAGE_SHIFT;
+ if (map[offset] != NULL) {
+ if (pages) {
+ struct page *page = map[offset];
+
+ pages[i] = page;
+ get_page(page);
+ }
+ if (vmas)
+ vmas[i] = vma;
+ i++;
+ start += PAGE_SIZE;
+ nr_pages--;
+ continue;
+ }
+ }
+#endif
+
if (!vma ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
!(vm_flags & vma->vm_flags))
@@ -1781,6 +1816,10 @@
vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+#ifdef CONFIG_XEN
+ vma->vm_mm->context.has_foreign_mappings = 1;
+#endif
+
err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
if (err) {
/*
@@ -1896,11 +1935,10 @@
{
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr, end = addr + size;
+ unsigned long end = addr + size;
int err;
BUG_ON(addr >= end);
- mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1908,7 +1946,7 @@
if (err)
break;
} while (pgd++, addr = next, addr != end);
- mmu_notifier_invalidate_range_end(mm, start, end);
+
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
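
The zap_pte branch added to the unmap path above introduces a per-VMA hook. A hedged sketch of a hypothetical user (foreign_zap_pte and its body are illustrative only; the signature follows the call site above, which expects the cleared pte value back in place of ptep_get_and_clear_full()):

/* Hypothetical callback: tear the mapping down itself (e.g. after
 * telling the hypervisor to revoke a grant mapping) and hand the old
 * pte value back to zap_pte_range(). */
static pte_t foreign_zap_pte(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, int is_fullmm)
{
	pte_t pte = *ptep;

	/* hypervisor-specific teardown of the mapping would go here */
	pte_clear(vma->vm_mm, addr, ptep);
	return pte;
}

static const struct vm_operations_struct foreign_vm_ops = {
	.zap_pte = foreign_zap_pte,
};
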
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3ecab7e..8ab5033 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -594,6 +594,13 @@
if (bad)
return;
+#ifdef CONFIG_XEN
+ if (PageForeign(page)) {
+ PageForeignDestructor(page, order);
+ return;
+ }
+#endif
+
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
debug_check_no_obj_freed(page_address(page),
@@ -1088,6 +1095,13 @@
kmemcheck_free_shadow(page, 0);
+#ifdef CONFIG_XEN
+ if (PageForeign(page)) {
+ PageForeignDestructor(page, 0);
+ return;
+ }
+#endif
+
if (PageAnon(page))
page->mapping = NULL;
if (free_pages_check(page))
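
The PageForeign checks above divert frees of foreign-owned pages to a destructor instead of the buddy allocator. A hypothetical destructor shaped after the PageForeignDestructor(page, order) call sites (the registration side is not part of these hunks):

/* Hypothetical destructor: the page belongs to another domain, so
 * instead of returning it to the buddy allocator we hand it back to
 * its real owner; it never reaches the normal free path. */
static void foreign_page_dtor(struct page *page, unsigned int order)
{
	/* e.g. end a grant mapping or return the page to the balloon
	 * driver here */
}
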
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f34ffd0..bd8e1ae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,7 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
-
/*** Page table manipulation functions ***/
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -562,7 +561,6 @@
if (va->va_end > *end)
*end = va->va_end;
nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- unmap_vmap_area(va);
list_add_tail(&va->purge_list, &valist);
va->flags |= VM_LAZY_FREEING;
va->flags &= ~VM_LAZY_FREE;
@@ -570,8 +568,9 @@
}
rcu_read_unlock();
- if (nr)
+ if (nr) {
atomic_sub(nr, &vmap_lazy_nr);
+ }
if (nr || force_flush)
flush_tlb_kernel_range(*start, *end);
@@ -607,10 +606,11 @@
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
*/
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+static void free_vmap_area_noflush(struct vmap_area *va)
{
va->flags |= VM_LAZY_FREE;
atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -619,6 +619,16 @@
}
/*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
+}
+
+/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
@@ -795,7 +805,7 @@
spin_unlock(&vmap_block_tree_lock);
BUG_ON(tmp != vb);
- free_unmap_vmap_area_noflush(vb->va);
+ free_vmap_area_noflush(vb->va);
call_rcu(&vb->rcu_head, rcu_free_vb);
}
@@ -932,6 +942,8 @@
rcu_read_unlock();
BUG_ON(!vb);
+ vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
spin_lock(&vb->lock);
BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
@@ -984,7 +996,6 @@
s = vb->va->va_start + (i << PAGE_SHIFT);
e = vb->va->va_start + (j << PAGE_SHIFT);
- vunmap_page_range(s, e);
flush = 1;
if (s < start)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index abbe8fa..e661dd7 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -179,14 +179,24 @@
struct ethtool_drvinfo info;
const struct ethtool_ops *ops = dev->ethtool_ops;
- if (!ops->get_drvinfo)
- return -EOPNOTSUPP;
-
memset(&info, 0, sizeof(info));
info.cmd = ETHTOOL_GDRVINFO;
- ops->get_drvinfo(dev, &info);
+ if (ops && ops->get_drvinfo) {
+ ops->get_drvinfo(dev, &info);
+ } else if (dev->dev.parent && dev->dev.parent->driver) {
+ strlcpy(info.bus_info, dev_name(dev->dev.parent),
+ sizeof(info.bus_info));
+ strlcpy(info.driver, dev->dev.parent->driver->name,
+ sizeof(info.driver));
+ } else {
+ return -EOPNOTSUPP;
+ }
- if (ops->get_sset_count) {
+ /*
+ * This method of obtaining string set info is deprecated;
+ * use ETHTOOL_GSSET_INFO instead.
+ */
+ if (ops && ops->get_sset_count) {
int rc;
rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -201,14 +211,14 @@
} else {
/* code path for obsolete hooks */
- if (ops->self_test_count)
+ if (ops && ops->self_test_count)
info.testinfo_len = ops->self_test_count(dev);
- if (ops->get_stats_count)
+ if (ops && ops->get_stats_count)
info.n_stats = ops->get_stats_count(dev);
}
- if (ops->get_regs_len)
+ if (ops && ops->get_regs_len)
info.regdump_len = ops->get_regs_len(dev);
- if (ops->get_eeprom_len)
+ if (ops && ops->get_eeprom_len)
info.eedump_len = ops->get_eeprom_len(dev);
if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -945,12 +955,19 @@
if (!dev || !netif_device_present(dev))
return -ENODEV;
- if (!dev->ethtool_ops)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd)))
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
+ if (!dev->ethtool_ops) {
+ /* ETHTOOL_GDRVINFO does not require any driver support.
+ * It is also unprivileged and does not change anything,
+ * so we can take a shortcut to it. */
+ if (ethcmd == ETHTOOL_GDRVINFO)
+ return ethtool_get_drvinfo(dev, useraddr);
+ else
+ return -EOPNOTSUPP;
+ }
+
/* Allow some commands to be done by anyone */
switch(ethcmd) {
case ETHTOOL_GDRVINFO:
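
Since ETHTOOL_GDRVINFO now succeeds even without ethtool_ops, any unprivileged program can query it. A small userspace sketch (interface name eth0 assumed; error handling abbreviated):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_drvinfo info = { .cmd = ETHTOOL_GDRVINFO };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&info;

	if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("driver=%s bus=%s\n", info.driver, info.bus_info);
	close(fd);
	return 0;
}
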
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d4fd895..4ab8c97 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -35,6 +35,7 @@
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/if_addr.h>
+#include <linux/pci.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -582,6 +583,22 @@
a->tx_compressed = b->tx_compressed;
};
+/* All VF info */
+static inline int rtnl_vfinfo_size(const struct net_device *dev)
+{
+ if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
+ int num_vfs = dev_num_vf(dev->dev.parent);
+ size_t size = nlmsg_total_size(sizeof(struct nlattr));
+
+ size += nlmsg_total_size(num_vfs * sizeof(struct nlattr));
+ size += num_vfs * (sizeof(struct ifla_vf_mac) +
+ sizeof(struct ifla_vf_vlan) +
+ sizeof(struct ifla_vf_tx_rate));
+ return size;
+ }
+ return 0;
+}
+
static inline size_t if_nlmsg_size(const struct net_device *dev)
{
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
@@ -599,6 +616,8 @@
+ nla_total_size(4) /* IFLA_MASTER */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_NUM_VF */
+ + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
+ rtnl_link_get_size(dev); /* IFLA_LINKINFO */
}
@@ -667,6 +686,40 @@
stats = dev_get_stats(dev);
copy_rtnl_link_stats(nla_data(attr), stats);
+ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
+ int i;
+
+ struct nlattr *vfinfo, *vf;
+ int num_vfs = dev_num_vf(dev->dev.parent);
+
+ NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs);
+ vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+ if (!vfinfo)
+ goto nla_put_failure;
+ for (i = 0; i < num_vfs; i++) {
+ struct ifla_vf_info ivi;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
+ break;
+ vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_tx_rate.rate = ivi.tx_rate;
+ vf = nla_nest_start(skb, IFLA_VF_INFO);
+ if (!vf) {
+ nla_nest_cancel(skb, vfinfo);
+ goto nla_put_failure;
+ }
+ NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
+ NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
+ NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
+ nla_nest_end(skb, vf);
+ }
+ nla_nest_end(skb, vfinfo);
+ }
if (dev->rtnl_link_ops) {
if (rtnl_link_fill(skb, dev) < 0)
goto nla_put_failure;
@@ -716,6 +769,7 @@
[IFLA_LINKINFO] = { .type = NLA_NESTED },
[IFLA_NET_NS_PID] = { .type = NLA_U32 },
[IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
+ [IFLA_VFINFO_LIST] = { .type = NLA_NESTED },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -723,6 +777,33 @@
[IFLA_INFO_DATA] = { .type = NLA_NESTED },
};
+static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
+ [IFLA_VF_INFO] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+ [IFLA_VF_MAC] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_VLAN] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_TX_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_tx_rate) },
+};
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net;
+ /* Examine the link attributes and figure out which
+ * network namespace we are talking about.
+ */
+ if (tb[IFLA_NET_NS_PID])
+ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+ else
+ net = get_net(src_net);
+ return net;
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
+
static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
{
if (dev) {
@@ -738,6 +819,52 @@
return 0;
}
+static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+{
+ int rem, err = -EINVAL;
+ struct nlattr *vf;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ nla_for_each_nested(vf, attr, rem) {
+ switch (nla_type(vf)) {
+ case IFLA_VF_MAC: {
+ struct ifla_vf_mac *ivm;
+ ivm = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_mac)
+ err = ops->ndo_set_vf_mac(dev, ivm->vf,
+ ivm->mac);
+ break;
+ }
+ case IFLA_VF_VLAN: {
+ struct ifla_vf_vlan *ivv;
+ ivv = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_vlan)
+ err = ops->ndo_set_vf_vlan(dev, ivv->vf,
+ ivv->vlan,
+ ivv->qos);
+ break;
+ }
+ case IFLA_VF_TX_RATE: {
+ struct ifla_vf_tx_rate *ivt;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_tx_rate)
+ err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
+ ivt->rate);
+ break;
+ }
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ break;
+ }
+ return err;
+}
+
static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
struct nlattr **tb, char *ifname, int modified)
{
@@ -875,6 +1002,18 @@
write_unlock_bh(&dev_base_lock);
}
+ if (tb[IFLA_VFINFO_LIST]) {
+ struct nlattr *attr;
+ int rem;
+ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_INFO)
+ goto errout;
+ err = do_setvfinfo(dev, attr);
+ if (err < 0)
+ goto errout;
+ modified = 1;
+ }
+ }
err = 0;
errout:
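
For reference, a sketch of how userspace would build the nested VF attributes that do_setlink()/do_setvfinfo() above consume, using libnl-3 (assumed available; only the IFLA_VF_MAC leaf is shown, and put_vf_mac is a hypothetical helper appended to an RTM_SETLINK message):

#include <string.h>
#include <netlink/netlink.h>
#include <netlink/msg.h>
#include <netlink/attr.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>

/* Build the two-level nesting the kernel side above expects:
 * IFLA_VFINFO_LIST > IFLA_VF_INFO > IFLA_VF_MAC. */
static int put_vf_mac(struct nl_msg *msg, int vf, const unsigned char *mac)
{
	struct nlattr *list, *info;
	struct ifla_vf_mac ivm = { .vf = vf };

	memcpy(ivm.mac, mac, 6);

	list = nla_nest_start(msg, IFLA_VFINFO_LIST);
	info = nla_nest_start(msg, IFLA_VF_INFO);
	nla_put(msg, IFLA_VF_MAC, sizeof(ivm), &ivm);
	nla_nest_end(msg, info);
	nla_nest_end(msg, list);
	return 0;
}
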
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 929218a..956cd0a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -215,6 +215,26 @@
To compile this code as a module, choose M here: the
module will be called sch_ingress.
+config NET_SCH_PLUG
+ tristate "Plug network traffic until release"
+ ---help---
+ Say Y here if you are using this kernel for Xen dom0 and
+ want to protect Xen guests with Remus.
+
+ This queueing discipline is controlled via netlink. A plug command
+ inserts a plug into the outbound queue, holding back all subsequent
+ packets until an unplug command arrives, which releases the packets
+ queued up to the plug for delivery.
+
+ It is intended to support speculative execution by allowing generated
+ network traffic to be rolled back. It is used to provide network
+ protection for the Remus high availability project.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_plug.
+
comment "Classification"
config NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f14e71b..61ef5f7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -31,6 +31,7 @@
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 0000000..86c3ee1
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,156 @@
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The buffer operates as follows:
+ * When a checkpoint begins, a plug is inserted into the
+ * network queue by a netlink request: the qdisc stores a
+ * pointer to the next packet that arrives and blocks dequeue
+ * once that packet reaches the head of the queue.
+ * When a checkpoint completes (the backup acknowledges receipt),
+ * the currently queued packets are released.
+ * The qdisc thus supports two operations: plug and unplug.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+#define FIFO_BUF (10*1024*1024)
+
+#define TCQ_PLUG 0
+#define TCQ_UNPLUG 1
+
+struct plug_sched_data {
+ /*
+ * This packet is the first packet which should not be
+ * delivered. If it is NULL, plug_enqueue will set it to the
+ * next packet it sees.
+ */
+ struct sk_buff *stop;
+};
+
+struct tc_plug_qopt {
+ /* 0: reset stop packet pointer
+ * 1: dequeue to stop pointer */
+ int action;
+};
+
+static int skb_remove_foreign_references(struct sk_buff *skb)
+{
+ return !skb_linearize(skb);
+}
+
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (likely(sch->qstats.backlog + skb->len <= FIFO_BUF)) {
+ if (!q->stop)
+ q->stop = skb;
+
+ if (!skb_remove_foreign_references(skb)) {
+ printk(KERN_DEBUG "error removing foreign ref\n");
+ return qdisc_reshape_fail(skb, sch);
+ }
+
+ return qdisc_enqueue_tail(skb, sch);
+ }
+ printk(KERN_WARNING "queue reported full: %d,%d\n",
+ sch->qstats.backlog, skb->len);
+
+ return qdisc_reshape_fail(skb, sch);
+}
+
+/* dequeue doesn't actually dequeue until the release command is
+ * received. */
+static struct sk_buff *plug_dequeue(struct Qdisc* sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *peek;
+
+ if (sch->flags & TCQ_F_THROTTLED)
+ return NULL;
+
+ peek = (struct sk_buff *)((sch->q).next);
+
+ /* this pointer comparison may be shady */
+ if (peek == q->stop) {
+ /*
+ * This is the tail of the last round. Release it and
+ * block the queue
+ */
+ sch->flags |= TCQ_F_THROTTLED;
+ return NULL;
+ }
+
+ return qdisc_dequeue_head(sch);
+}
+
+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ sch->flags |= TCQ_F_THROTTLED;
+
+ return 0;
+}
+
+/*
+ * receives two messages:
+ * 0: checkpoint queue (set stop to next packet)
+ * 1: dequeue until stop
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+ struct tc_plug_qopt *msg;
+
+ if (!opt || nla_len(opt) < sizeof(*msg))
+ return -EINVAL;
+
+ msg = nla_data(opt);
+
+ if (msg->action == TCQ_PLUG) {
+ /* reset stop */
+ q->stop = NULL;
+ } else if (msg->action == TCQ_UNPLUG) {
+ /* dequeue */
+ sch->flags &= ~TCQ_F_THROTTLED;
+ netif_schedule_queue(sch->dev_queue);
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+struct Qdisc_ops plug_qdisc_ops = {
+ .id = "plug",
+ .priv_size = sizeof(struct plug_sched_data),
+ .enqueue = plug_enqueue,
+ .dequeue = plug_dequeue,
+ .peek = qdisc_peek_head,
+ .init = plug_init,
+ .change = plug_change,
+ .owner = THIS_MODULE,
+};
+
+static int __init plug_module_init(void)
+{
+ return register_qdisc(&plug_qdisc_ops);
+}
+
+static void __exit plug_module_exit(void)
+{
+ unregister_qdisc(&plug_qdisc_ops);
+}
module_init(plug_module_init);
module_exit(plug_module_exit);
+MODULE_LICENSE("GPL");