ARM: Compile the kernel into VMALLOC
This makes it possible to compile the kernel into the
VMALLOC area (kernel virtual memory allocation pool).
We hammer down the virtual location of the kernel to
0xf1000000 and augment the virtual-to-physical and
physical-to-virtual translation functions to take a special
roundtrip for any kernel addresses.
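As an illustration, the translation roundtrip amounts to the
following (a simplified sketch mirroring the non-patched helpers
added to asm/memory.h; the addresses and the example_* names are
made up for illustration only):

  /* Example values only; the real ones come from Kconfig and head.S */
  #define KERNEL_OFFSET	0xf1000000UL	/* virtual base of the kernel image */
  #define PAGE_OFFSET	0xc0000000UL	/* start of the lowmem linear map */
  #define PHYS_OFFSET	0x40000000UL	/* physical start of RAM */

  unsigned long kernel_sec_start = 0x40100000UL;	/* example */
  unsigned long kernel_sec_end   = 0x40900000UL;	/* example */

  static unsigned long example_virt_to_phys(unsigned long va)
  {
  	/* Kernel image: translate relative to KERNEL_OFFSET */
  	if (va >= KERNEL_OFFSET &&
  	    va < KERNEL_OFFSET + (kernel_sec_end - kernel_sec_start))
  		return va - KERNEL_OFFSET + kernel_sec_start;
  	/* Everything else: ordinary lowmem 1:1 translation */
  	return va - PAGE_OFFSET + PHYS_OFFSET;
  }

  static unsigned long example_phys_to_virt(unsigned long pa)
  {
  	/* Kernel physical range: translate back to the vmalloc location */
  	if (pa >= kernel_sec_start && pa < kernel_sec_end)
  		return pa - kernel_sec_start + KERNEL_OFFSET;
  	return pa - PHYS_OFFSET + PAGE_OFFSET;
  }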
We make sure all modules are loaded into the VMALLOC area
when using this configuration as well.
We make sure not to clear the PMDs covering the kernel
when initializing the PGD.
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0af6709..c31c283 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -281,6 +281,17 @@
definitions for this platform. The need for mach/memory.h should
be avoided when possible.
+config ARM_KERNEL_IN_VMALLOC
+ bool "Compile the kernel into the VMALLOC area"
+ depends on MMU
+ select KASAN_VMALLOC if KASAN
+ select ARM_MODULE_PLTS if MODULES
+ help
+ This augments the build process so that the kernel TEXT, DATA
+ and BSS are placed in the VMALLOC area. This avoids a dedicated
+ mapping of the kernel into the userspace address space for
+ system calls and task switching.
+
config PHYS_OFFSET
hex "Physical address of main memory" if MMU
depends on !ARM_PATCH_PHYS_VIRT || !AUTO_ZRELADDR
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index ef2aa79..8e98c4b 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -32,7 +32,11 @@
* we may further offset this with TEXT_OFFSET in practice.
*/
#define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET)
+#ifdef CONFIG_ARM_KERNEL_IN_VMALLOC
+#define KERNEL_OFFSET (0xF1000000)
+#else
#define KERNEL_OFFSET (PAGE_OFFSET)
+#endif
#ifdef CONFIG_MMU
@@ -63,7 +67,7 @@
#define MODULES_VADDR (PAGE_OFFSET - SZ_8M)
#endif
-#if TASK_SIZE > MODULES_VADDR
+#if (TASK_SIZE > MODULES_VADDR) && !defined(CONFIG_ARM_KERNEL_IN_VMALLOC)
#error Top of user space clashes with start of module space
#endif
@@ -170,6 +174,9 @@ extern unsigned long vectors_base;
*/
extern u64 kernel_sec_start;
extern u64 kernel_sec_end;
+#define KERNEL_SECTION_SIZE (kernel_sec_end - kernel_sec_start)
+/* Page frame number for the first address of the kernel memory */
+#define KERNEL_PFN_OFFSET ((unsigned long)(kernel_sec_start >> PAGE_SHIFT))
/*
* Physical vs virtual RAM address space conversion. These are
@@ -254,6 +261,16 @@ static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
{
phys_addr_t t;
+ /*
+ * For kernel in vmalloc, move the address to the offset it
+ * would have had with a 1:1 mapping, so that this virt-to-phys
+ * patching will work all the same.
+ */
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC) &&
+ (x >= KERNEL_OFFSET) &&
+ (x < (KERNEL_OFFSET + KERNEL_SECTION_SIZE)))
+ x = x - KERNEL_OFFSET + PAGE_OFFSET;
+
if (sizeof(phys_addr_t) == 4) {
__pv_stub(x, t, "add");
} else {
@@ -273,6 +290,17 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
* in place where 'r' 32 bit operand is expected.
*/
__pv_stub((unsigned long) x, t, "sub");
+
+ /*
+ * For kernel in vmalloc, move the address from where it would
+ * have been with a 1:1 mapping to the vmalloc location, so that
+ * this phys-to-virt patching will work all the same.
+ */
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC) &&
+ (x >= kernel_sec_start) &&
+ (x < kernel_sec_end))
+ t = t - PAGE_OFFSET + KERNEL_OFFSET;
+
return t;
}
@@ -283,22 +311,58 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
{
- return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
+ if (!IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC)) {
+ return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
+ } else {
+ phys_addr_t addr = (phys_addr_t)x;
+
+ if ((addr >= KERNEL_OFFSET) &&
+ (addr < (KERNEL_OFFSET + KERNEL_SECTION_SIZE)))
+ return addr - KERNEL_OFFSET + kernel_sec_start;
+ else
+ return addr - PAGE_OFFSET + PHYS_OFFSET;
+ }
}
static inline unsigned long __phys_to_virt(phys_addr_t x)
{
- return x - PHYS_OFFSET + PAGE_OFFSET;
+ /* Normally just use the 1-to-1 mapping */
+ if (!IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC)) {
+ return x - PHYS_OFFSET + PAGE_OFFSET;
+ } else {
+ /*
+ * We need very specific handling of the kernel physical memory
+ * 1-to-1 map: memory allocations for the kernel will be made
+ * both below it (e.g. the page tables) and above it (any other
+ * allocations) in physical memory.
+ */
+ if (x >= kernel_sec_start && x < kernel_sec_end)
+ return x - kernel_sec_start + KERNEL_OFFSET;
+ else
+ return x - PHYS_OFFSET + PAGE_OFFSET;
+ }
}
-
#endif
static inline unsigned long virt_to_pfn(const void *p)
{
unsigned long kaddr = (unsigned long)p;
- return (((kaddr - PAGE_OFFSET) >> PAGE_SHIFT) +
- PHYS_PFN_OFFSET);
+
+ if (!IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC)) {
+ return (((kaddr - PAGE_OFFSET) >> PAGE_SHIFT) +
+ PHYS_PFN_OFFSET);
+ } else {
+ if ((kaddr >= KERNEL_OFFSET) &&
+ (kaddr < (KERNEL_OFFSET + KERNEL_SECTION_SIZE))) {
+ return (((kaddr - KERNEL_OFFSET) >> PAGE_SHIFT) +
+ KERNEL_PFN_OFFSET);
+ } else {
+ return (((kaddr - PAGE_OFFSET) >> PAGE_SHIFT) +
+ PHYS_PFN_OFFSET);
+ }
+ }
}
+
#define __pa_symbol_nodebug(x) __virt_to_phys_nodebug((x))
#ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h
index 2f35b4e..f879da6 100644
--- a/arch/arm/include/asm/pgtable-3level-hwdef.h
+++ b/arch/arm/include/asm/pgtable-3level-hwdef.h
@@ -83,15 +83,29 @@
* Only use this feature if PHYS_OFFSET <= PAGE_OFFSET, otherwise
* booting secondary CPUs would end up using TTBR1 for the identity
* mapping set up in TTBR0.
+ *
+ * For kernel-in-vmalloc the kernel is at KERNEL_OFFSET (0xf1000000)
+ * and we simply disable the use of split TTBRs.
*/
-#if defined CONFIG_VMSPLIT_2G
+#if defined CONFIG_ARM_KERNEL_IN_VMALLOC
+#define TTBR1_OFFSET 0
+#elif defined CONFIG_VMSPLIT_2G /* PAGE_OFFSET = 0x80000000 */
#define TTBR1_OFFSET 16 /* skip two L1 entries */
-#elif defined CONFIG_VMSPLIT_3G
+#elif defined CONFIG_VMSPLIT_3G /* PAGE_OFFSET = 0xc0000000 */
#define TTBR1_OFFSET (4096 * (1 + 3)) /* only L2, skip pgd + 3*pmd */
-#else
+#else /* PAGE_OFFSET = 0x40000000 or other */
#define TTBR1_OFFSET 0
#endif
+#if defined CONFIG_ARM_KERNEL_IN_VMALLOC
+#define TTBR1_SIZE 0
+#else
+/*
+ * (0x40000000 >> 30) - 1 = 0 => T1SZ 0 => TTBR1 covers 2^(32-0) = 0x100000000
+ * (0x80000000 >> 30) - 1 = 1 => T1SZ 1 => TTBR1 covers 2^(32-1) = 0x80000000
+ * (0xc0000000 >> 30) - 1 = 2 => T1SZ 2 => TTBR1 covers 2^(32-2) = 0x40000000
+ */
#define TTBR1_SIZE (((PAGE_OFFSET >> 30) - 1) << 16)
+#endif
#endif
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e74d84f..d1913e2 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -36,18 +36,26 @@
#ifdef CONFIG_MMU
void *module_alloc(unsigned long size)
{
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
+ /*
+ * If we're compiling the kernel into the VMALLOC area, make sure
+ * to only use that area for kernel modules as well. Otherwise the
+ * dedicated MODULES_VADDR area takes precedence.
+ */
+ if (!IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC)) {
+ gfp_t gfp_mask = GFP_KERNEL;
+ void *p;
- /* Silence the initial allocation */
- if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
- gfp_mask |= __GFP_NOWARN;
+ /* Silence the initial allocation */
+ if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
+ gfp_mask |= __GFP_NOWARN;
- p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
- gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
- return p;
+ p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ gfp_mask, PAGE_KERNEL_EXEC, 0,
+ NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
+ return p;
+ }
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
__builtin_return_address(0));
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index a42e4cd..eef63ed 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -182,8 +182,19 @@ void check_cpu_icache_size(int cpuid)
void __init arm_memblock_init(const struct machine_desc *mdesc)
{
- /* Register the kernel text, kernel data and initrd with memblock. */
- memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START);
+ /*
+ * Register the kernel text, kernel data and initrd with memblock.
+ *
+ * When using kernel in vmalloc, we have to reserve up to the next
+ * section boundary, or the temporary section mapping of the tail of
+ * the kernel will be overwritten by memblock allocations. This is not
+ * a problem with the linear kernel map, since the allocations can
+ * use the 1:1 map in that case.
+ */
+ if (!IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC))
+ memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START);
+ else
+ memblock_reserve(kernel_sec_start, KERNEL_SECTION_SIZE);
reserve_initrd_mem();
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 674ed71..8b86ddd 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -962,6 +962,9 @@ static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md,
pgd = pgd_offset(mm, addr);
end = addr + length;
+ pr_debug("map physical memory 0x%08llx-0x%08llx to virtual memory 0x%08lx-0x%08lx length: 0x%08lx\n",
+ (long long)phys, (long long)(phys + length - 1), addr, end - 1, length);
+
do {
unsigned long next = pgd_addr_end(addr, end);
@@ -1296,6 +1299,11 @@ static __init void prepare_page_table(void)
/*
* Clear out all the mappings below the kernel image.
*/
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC))
+ end = VMALLOC_START;
+ else
+ end = MODULES_VADDR;
+
#ifdef CONFIG_KASAN
/*
* KASan's shadow memory inserts itself between the TASK_SIZE
@@ -1309,13 +1317,21 @@ static __init void prepare_page_table(void)
* are using a thumb-compiled kernel, there there will be 8MB more
* to clear as KASan always offset to 16 MB below MODULES_VADDR.
*/
- for (addr = KASAN_SHADOW_END; addr < MODULES_VADDR; addr += PMD_SIZE)
+ for (addr = KASAN_SHADOW_END; addr < end; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
#else
- for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
+ for (addr = 0; addr < end; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
#endif
+ /*
+ * If we use kernel in VMALLOC, we cleared all the way up to the
+ * VMALLOC area except for maybe KASAN shadow memory and we are done.
+ * Else we go on with the more complex lowmem handling.
+ */
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC))
+ return;
+
#ifdef CONFIG_XIP_KERNEL
/* The XIP kernel is mapped in the module area -- skip over it */
addr = ((unsigned long)_exiprom + PMD_SIZE - 1) & PMD_MASK;
@@ -1390,8 +1406,21 @@ static void __init devicemaps_init(const struct machine_desc *mdesc)
/*
* Clear page table except top pmd used by early fixmaps
*/
- for (addr = VMALLOC_START; addr < (FIXADDR_TOP & PMD_MASK); addr += PMD_SIZE)
+ for (addr = VMALLOC_START; addr < (FIXADDR_TOP & PMD_MASK); addr += PMD_SIZE) {
+ /*
+ * When putting the kernel into the VMALLOC area, we need to
+ * make sure we don't wipe out the VM mappings for the kernel.
+ * This would pull out the ground under our feet. This gets
+ * compiled out if we're not using kernel in VMALLOC.
+ */
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC)) {
+ if ((addr >= KERNEL_OFFSET) &&
+ (addr < (KERNEL_OFFSET + KERNEL_SECTION_SIZE)))
+ continue;
+ }
+ pr_debug("clear PMD at 0x%08llx\n", (unsigned long long)addr);
pmd_clear(pmd_off_k(addr));
+ }
if (__atags_pointer) {
/* create a read-only mapping of the device tree */
@@ -1580,6 +1609,24 @@ static void __init map_lowmem(void)
}
}
+/* Reserve memory used by the kernel when placing the kernel inside VMALLOC */
+static void __init vm_reserve_kernel(struct map_desc *md)
+{
+ struct vm_struct *vm;
+ struct static_vm *svm;
+
+ svm = early_alloc(sizeof(*svm));
+
+ vm = &svm->vm;
+ vm->addr = (void *)(md->virtual & PAGE_MASK);
+ vm->size = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
+ vm->phys_addr = __pfn_to_phys(md->pfn);
+ vm->flags = VM_MAP | VM_ARM_STATIC_MAPPING;
+ vm->flags |= VM_ARM_MTYPE(md->type);
+ vm->caller = vm_reserve_kernel;
+ add_static_vm_early(svm);
+}
+
static void __init map_kernel(void)
{
/*
@@ -1609,11 +1656,24 @@ static void __init map_kernel(void)
phys_addr_t kernel_nx_end = kernel_sec_end;
struct map_desc map;
+ /*
+ * We cannot really remap the kernel properly when using kernel in VMALLOC:
+ * the PMDs covering it have not been wiped in devicemaps_init() as it would
+ * pull out the ground under our feet. By rounding up to the closest
+ * PMD_SIZE instead of SECTION_SIZE, we make sure that the next free
+ * VMALLOC segment does not end up on a section ending inside half a
+ * PMD: sections are 1 MB but PMDs are 2 MB.
+ */
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC))
+ kernel_nx_end = round_up(kernel_sec_end, PMD_SIZE);
+
map.pfn = __phys_to_pfn(kernel_x_start);
map.virtual = __phys_to_virt(kernel_x_start);
map.length = kernel_x_end - kernel_x_start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC))
+ vm_reserve_kernel(&map);
/* If the nx part is small it may end up covered by the tail of the RWX section */
if (kernel_x_end == kernel_nx_end)
@@ -1624,6 +1684,8 @@ static void __init map_kernel(void)
map.length = kernel_nx_end - kernel_nx_start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
+ if (IS_ENABLED(CONFIG_ARM_KERNEL_IN_VMALLOC))
+ vm_reserve_kernel(&map);
}
#ifdef CONFIG_ARM_PV_FIXUP
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index 1319844..65864a9 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -115,7 +115,7 @@
* - \ttbr1 updated.
*/
.macro v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp
- ldr \tmp, =swapper_pg_dir @ swapper_pg_dir virtual address
+ ldr \tmp, =KERNEL_OFFSET @ kernel virtual base (near swapper_pg_dir)
cmp \ttbr1, \tmp, lsr #12 @ PHYS_OFFSET > PAGE_OFFSET?
mov \tmp, #TTB_EAE @ for TTB control egister
ALT_SMP(orr \tmp, \tmp, #TTB_FLAGS_SMP)