| From: Alexander Graf <graf@amazon.com> |
| Subject: kexec: enable CMA based contiguous allocation |
| Date: Tue, 10 Jun 2025 08:53:27 +0000 |
| |
| When booting a new kernel with kexec_file, the kernel picks a target |
| location that the new kernel should live at, then allocates random pages |
| and checks whether any of those pages happens to coincide with a target |
| address range; if so, it uses them for that range. |
| |
| For every page allocated this way, it then creates a page list that the |
| relocation code - code that executes while all CPUs are off and we are |
| just about to jump into the new kernel - uses to copy each page to its |
| final memory location. We cannot put the pages there earlier, because |
| chances are pretty good that at least some page in the target range is |
| already in use by the currently running Linux environment. The copying |
| happens from a single CPU at RAM rate, which takes around 4-50 ms per |
| 100 MiB. |
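| |
| For illustration, given a struct kimage *image, the relocation pass |
| conceptually walks the page list like the simplified sketch below |
| (patterned after the kernel's for_each_kimage_entry() helper and the |
| IND_* flags in include/linux/kexec.h; the real copy runs in the |
| arch-specific relocate_kernel code, typically assembly, with MMU and |
| caches under tight control): |
| |
| 	kimage_entry_t *ptr, entry; |
| 	unsigned long dest = 0; |
| |
| 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); |
| 	     ptr = (entry & IND_INDIRECTION) ? |
| 		   phys_to_virt(entry & PAGE_MASK) : ptr + 1) { |
| 		if (entry & IND_DESTINATION) { |
| 			dest = entry & PAGE_MASK; |
| 		} else if (entry & IND_SOURCE) { |
| 			/* copy one page to its final resting place */ |
| 			copy_page(phys_to_virt(dest), |
| 				  phys_to_virt(entry & PAGE_MASK)); |
| 			dest += PAGE_SIZE; |
| 		} |
| 	} |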
| |
| All of this is inefficient and error-prone. |
| |
| To successfully kexec, we need to quiesce all devices of the outgoing |
| kernel so they don't scribble over the new kernel's memory. We have seen |
| cases where that does not happen properly (*cough* GIC *cough*) and hence |
| the new kernel was corrupted. This once started a month-long journey to |
| root cause failing kexecs that eventually turned out to be memory |
| corruption: the new kernel was corrupted severely enough that it could |
| not even emit output telling us that it was corrupted. By allocating |
| memory for the next kernel from a memory range that is guaranteed to be |
| free of such scribbling, we can boot the next kernel up to a point where |
| it is at least able to detect corruption and maybe even stop it before |
| it becomes severe. This increases the chance for successful kexecs. |
| |
| Since kexec was introduced, Linux has gained the CMA framework, which |
| can provide physically contiguous memory allocations while keeping that |
| memory available for movable pages whenever it is not needed for |
| contiguous allocations. The default (global) CMA area backs DMA |
| allocations. |
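| |
| As a minimal sketch of what allocating from that default area looks |
| like (assuming CONFIG_DMA_CMA and a global area set up e.g. via the |
| cma= command line option; a NULL device selects |
| dma_contiguous_default_area, and size/align stand in for the caller's |
| requirements): |
| |
| 	#include <linux/dma-map-ops.h> |
| |
| 	size_t nr_pages = size >> PAGE_SHIFT; |
| 	struct page *p; |
| |
| 	/* grab nr_pages physically contiguous pages from the default area */ |
| 	p = dma_alloc_from_contiguous(NULL, nr_pages, |
| 				      get_order(align), true); |
| 	if (p) |
| 		/* hand them back when no longer needed */ |
| 		dma_release_from_contiguous(NULL, p, nr_pages); |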
| |
| This patch adds logic to the kexec file loader to attempt to place the |
| target payload at a location allocated from CMA. If successful, it uses |
| that memory range directly instead of creating copy instructions during |
| the hot phase. To ensure that there is a safety net in case anything |
| goes wrong with the CMA allocation, it also adds a flag |
| (KEXEC_FILE_NO_CMA) that lets user space force-disable CMA allocations. |
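| |
| A hypothetical user space caller could opt out of CMA placement like |
| this (illustrative only: glibc ships no kexec_file_load() wrapper, so |
| the raw syscall is used, and kernel_fd, initrd_fd and cmdline stand in |
| for the usual load arguments): |
| |
| 	#include <string.h> |
| 	#include <sys/syscall.h> |
| 	#include <unistd.h> |
| 	#include <linux/kexec.h> |
| |
| 	long ret = syscall(SYS_kexec_file_load, kernel_fd, initrd_fd, |
| 			   strlen(cmdline) + 1, cmdline, |
| 			   (unsigned long)KEXEC_FILE_NO_CMA); |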
| |
| Using CMA allocations has two advantages: |
| |
| 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the |
|    hot phase. |
| 2) More robust. Even if some page is accidentally still in use for DMA, |
|    the new kernel image will be safe from that access because it resides |
|    in a memory region that the old kernel considers allocated, and the |
|    new kernel gets a chance to reinitialize the offending component. |
| |
| Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com |
| Signed-off-by: Alexander Graf <graf@amazon.com> |
| Acked-by: Baoquan He <bhe@redhat.com> |
| Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com> |
| Cc: Zhongkun He <hezhongkun.hzk@bytedance.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| arch/riscv/kernel/kexec_elf.c | 1 |
| include/linux/kexec.h | 10 +++ |
| include/uapi/linux/kexec.h | 1 |
| kernel/kexec.c | 2 |
| kernel/kexec_core.c | 101 +++++++++++++++++++++++++++++--- |
| kernel/kexec_file.c | 51 ++++++++++++++++ |
| kernel/kexec_internal.h | 2 |
| 7 files changed, 157 insertions(+), 11 deletions(-) |
| |
| --- a/arch/riscv/kernel/kexec_elf.c~kexec-enable-cma-based-contiguous-allocation |
| +++ a/arch/riscv/kernel/kexec_elf.c |
| @@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage |
| kbuf.buf_align = PMD_SIZE; |
| kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; |
| kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); |
| + kbuf.cma = NULL; |
| kbuf.top_down = false; |
| ret = arch_kexec_locate_mem_hole(&kbuf); |
| if (!ret) { |
| --- a/include/linux/kexec.h~kexec-enable-cma-based-contiguous-allocation |
| +++ a/include/linux/kexec.h |
| @@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes; |
| |
| typedef unsigned long kimage_entry_t; |
| |
| +/* |
| + * This is a copy of the UAPI struct kexec_segment and must be identical |
| + * to it because it gets copied straight from user space into kernel |
| + * memory. Do not modify this structure unless you change the way segments |
| + * get ingested from user space. |
| + */ |
| struct kexec_segment { |
| /* |
| * This pointer can point to user memory if kexec_load() system |
| @@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_defaul |
| * @buf_align: Minimum alignment needed. |
| * @buf_min: The buffer can't be placed below this address. |
| * @buf_max: The buffer can't be placed above this address. |
| + * @cma: CMA page if the buffer is backed by CMA. |
| * @top_down: Allocate from top of memory. |
| * @random: Place the buffer at a random position. |
| */ |
| @@ -184,6 +191,7 @@ struct kexec_buf { |
| unsigned long buf_align; |
| unsigned long buf_min; |
| unsigned long buf_max; |
| + struct page *cma; |
| bool top_down; |
| #ifdef CONFIG_CRASH_DUMP |
| bool random; |
| @@ -340,6 +348,7 @@ struct kimage { |
| |
| unsigned long nr_segments; |
| struct kexec_segment segment[KEXEC_SEGMENT_MAX]; |
| + struct page *segment_cma[KEXEC_SEGMENT_MAX]; |
| |
| struct list_head control_pages; |
| struct list_head dest_pages; |
| @@ -361,6 +370,7 @@ struct kimage { |
| */ |
| unsigned int hotplug_support:1; |
| #endif |
| + unsigned int no_cma:1; |
| |
| #ifdef ARCH_HAS_KIMAGE_ARCH |
| struct kimage_arch arch; |
| --- a/include/uapi/linux/kexec.h~kexec-enable-cma-based-contiguous-allocation |
| +++ a/include/uapi/linux/kexec.h |
| @@ -27,6 +27,7 @@ |
| #define KEXEC_FILE_ON_CRASH 0x00000002 |
| #define KEXEC_FILE_NO_INITRAMFS 0x00000004 |
| #define KEXEC_FILE_DEBUG 0x00000008 |
| +#define KEXEC_FILE_NO_CMA 0x00000010 |
| |
| /* These values match the ELF architecture values. |
| * Unless there is a good reason that should continue to be the case. |
| --- a/kernel/kexec.c~kexec-enable-cma-based-contiguous-allocation |
| +++ a/kernel/kexec.c |
| @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long e |
| goto out; |
| |
| for (i = 0; i < nr_segments; i++) { |
| - ret = kimage_load_segment(image, &image->segment[i]); |
| + ret = kimage_load_segment(image, i); |
| if (ret) |
| goto out; |
| } |
| --- a/kernel/kexec_core.c~kexec-enable-cma-based-contiguous-allocation |
| +++ a/kernel/kexec_core.c |
| @@ -40,6 +40,7 @@ |
| #include <linux/hugetlb.h> |
| #include <linux/objtool.h> |
| #include <linux/kmsg_dump.h> |
| +#include <linux/dma-map-ops.h> |
| |
| #include <asm/page.h> |
| #include <asm/sections.h> |
| @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_ent |
| kimage_free_pages(page); |
| } |
| |
| +static void kimage_free_cma(struct kimage *image) |
| +{ |
| + unsigned long i; |
| + |
| + for (i = 0; i < image->nr_segments; i++) { |
| + struct page *cma = image->segment_cma[i]; |
| + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; |
| + |
| + if (!cma) |
| + continue; |
| + |
| + arch_kexec_pre_free_pages(page_address(cma), nr_pages); |
| + dma_release_from_contiguous(NULL, cma, nr_pages); |
| + image->segment_cma[i] = NULL; |
| + } |
| + |
| +} |
| + |
| void kimage_free(struct kimage *image) |
| { |
| kimage_entry_t *ptr, entry; |
| @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) |
| /* Free the kexec control pages... */ |
| kimage_free_page_list(&image->control_pages); |
| |
| + /* Free CMA allocations */ |
| + kimage_free_cma(image); |
| + |
| /* |
| * Free up any temporary buffers allocated. This might hit if |
| * error occurred much later after buffer allocation. |
| @@ -716,9 +738,70 @@ static int kimage_alloc_page(st |
| return page; |
| } |
| |
| -static int kimage_load_normal_segment(struct kimage *image, |
| - struct kexec_segment *segment) |
| +static int kimage_load_cma_segment(struct kimage *image, int idx) |
| +{ |
| + struct kexec_segment *segment = &image->segment[idx]; |
| + struct page *cma = image->segment_cma[idx]; |
| + char *ptr = page_address(cma); |
| + unsigned long maddr; |
| + size_t ubytes, mbytes; |
| + int result = 0; |
| + unsigned char __user *buf = NULL; |
| + unsigned char *kbuf = NULL; |
| + |
| + if (image->file_mode) |
| + kbuf = segment->kbuf; |
| + else |
| + buf = segment->buf; |
| + ubytes = segment->bufsz; |
| + mbytes = segment->memsz; |
| + maddr = segment->mem; |
| + |
| + /* Then copy from source buffer to the CMA one */ |
| + while (mbytes) { |
| + size_t uchunk, mchunk; |
| + |
| + ptr += maddr & ~PAGE_MASK; |
| + mchunk = min_t(size_t, mbytes, |
| + PAGE_SIZE - (maddr & ~PAGE_MASK)); |
| + uchunk = min(ubytes, mchunk); |
| + |
| + if (uchunk) { |
| + /* For file based kexec, source pages are in kernel memory */ |
| + if (image->file_mode) |
| + memcpy(ptr, kbuf, uchunk); |
| + else |
| + result = copy_from_user(ptr, buf, uchunk); |
| + ubytes -= uchunk; |
| + if (image->file_mode) |
| + kbuf += uchunk; |
| + else |
| + buf += uchunk; |
| + } |
| + |
| + if (result) { |
| + result = -EFAULT; |
| + goto out; |
| +		} |
| + |
| +		/* Zero any part of the chunk not covered by the source buffer */ |
| +		if (mchunk > uchunk) |
| +			memset(ptr + uchunk, 0, mchunk - uchunk); |
| + |
| + ptr += mchunk; |
| + maddr += mchunk; |
| + mbytes -= mchunk; |
| + |
| + cond_resched(); |
| + } |
| + |
| +out: |
| + return result; |
| +} |
| + |
| +static int kimage_load_normal_segment(struct kimage *image, int idx) |
| { |
| + struct kexec_segment *segment = &image->segment[idx]; |
| unsigned long maddr; |
| size_t ubytes, mbytes; |
| int result; |
| @@ -733,6 +816,9 @@ static int kimage_load_normal_segment(st |
| mbytes = segment->memsz; |
| maddr = segment->mem; |
| |
| + if (image->segment_cma[idx]) |
| + return kimage_load_cma_segment(image, idx); |
| + |
| result = kimage_set_destination(image, maddr); |
| if (result < 0) |
| goto out; |
| @@ -787,13 +873,13 @@ out: |
| } |
| |
| #ifdef CONFIG_CRASH_DUMP |
| -static int kimage_load_crash_segment(struct kimage *image, |
| - struct kexec_segment *segment) |
| +static int kimage_load_crash_segment(struct kimage *image, int idx) |
| { |
| /* For crash dumps kernels we simply copy the data from |
| * user space to it's destination. |
| * We do things a page at a time for the sake of kmap. |
| */ |
| + struct kexec_segment *segment = &image->segment[idx]; |
| unsigned long maddr; |
| size_t ubytes, mbytes; |
| int result; |
| @@ -858,18 +944,17 @@ out: |
| } |
| #endif |
| |
| -int kimage_load_segment(struct kimage *image, |
| - struct kexec_segment *segment) |
| +int kimage_load_segment(struct kimage *image, int idx) |
| { |
| int result = -ENOMEM; |
| |
| switch (image->type) { |
| case KEXEC_TYPE_DEFAULT: |
| - result = kimage_load_normal_segment(image, segment); |
| + result = kimage_load_normal_segment(image, idx); |
| break; |
| #ifdef CONFIG_CRASH_DUMP |
| case KEXEC_TYPE_CRASH: |
| - result = kimage_load_crash_segment(image, segment); |
| + result = kimage_load_crash_segment(image, idx); |
| break; |
| #endif |
| } |
| --- a/kernel/kexec_file.c~kexec-enable-cma-based-contiguous-allocation |
| +++ a/kernel/kexec_file.c |
| @@ -26,6 +26,7 @@ |
| #include <linux/kernel_read_file.h> |
| #include <linux/syscalls.h> |
| #include <linux/vmalloc.h> |
| +#include <linux/dma-map-ops.h> |
| #include "kexec_internal.h" |
| |
| #ifdef CONFIG_KEXEC_SIG |
| @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kima |
| ret = 0; |
| } |
| |
| + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); |
| + |
| if (cmdline_len) { |
| image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); |
| if (IS_ERR(image->cmdline_buf)) { |
| @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, ke |
| i, ksegment->buf, ksegment->bufsz, ksegment->mem, |
| ksegment->memsz); |
| |
| - ret = kimage_load_segment(image, &image->segment[i]); |
| + ret = kimage_load_segment(image, i); |
| if (ret) |
| goto out; |
| } |
| @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct k |
| return walk_system_ram_res(0, ULONG_MAX, kbuf, func); |
| } |
| |
| +static int kexec_alloc_contig(struct kexec_buf *kbuf) |
| +{ |
| + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; |
| + unsigned long mem; |
| + struct page *p; |
| + |
| + /* User space disabled CMA allocations, bail out. */ |
| + if (kbuf->image->no_cma) |
| + return -EPERM; |
| + |
| + /* Skip CMA logic for crash kernel */ |
| + if (kbuf->image->type == KEXEC_TYPE_CRASH) |
| + return -EPERM; |
| + |
| + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); |
| + if (!p) |
| + return -ENOMEM; |
| + |
| +	pr_debug("allocated %zu DMA pages at pfn 0x%lx\n", nr_pages, page_to_boot_pfn(p)); |
| + |
| + mem = page_to_boot_pfn(p) << PAGE_SHIFT; |
| + |
| + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { |
| + /* Our region is already in use by a statically defined one. Bail out. */ |
| + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); |
| + dma_release_from_contiguous(NULL, p, nr_pages); |
| + return -EBUSY; |
| + } |
| + |
| +	kbuf->mem = mem; |
| + kbuf->cma = p; |
| + |
| + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); |
| + |
| + return 0; |
| +} |
| + |
| /** |
| * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel |
| * @kbuf: Parameters for the memory search. |
| @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_b |
| if (ret <= 0) |
| return ret; |
| |
| + /* |
| + * Try to find a free physically contiguous block of memory first. With that, we |
| + * can avoid any copying at kexec time. |
| + */ |
| + if (!kexec_alloc_contig(kbuf)) |
| + return 0; |
| + |
| if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) |
| ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); |
| else |
| @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *k |
| /* Ensure minimum alignment needed for segments. */ |
| kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); |
| kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); |
| + kbuf->cma = NULL; |
| |
| /* Walk the RAM ranges and allocate a suitable range for the buffer */ |
| ret = arch_kexec_locate_mem_hole(kbuf); |
| @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *k |
| ksegment->bufsz = kbuf->bufsz; |
| ksegment->mem = kbuf->mem; |
| ksegment->memsz = kbuf->memsz; |
| + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; |
| kbuf->image->nr_segments++; |
| return 0; |
| } |
| --- a/kernel/kexec_internal.h~kexec-enable-cma-based-contiguous-allocation |
| +++ a/kernel/kexec_internal.h |
| @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void |
| int sanity_check_segment_list(struct kimage *image); |
| void kimage_free_page_list(struct list_head *list); |
| void kimage_free(struct kimage *image); |
| -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); |
| +int kimage_load_segment(struct kimage *image, int idx); |
| void kimage_terminate(struct kimage *image); |
| int kimage_is_destination_range(struct kimage *image, |
| unsigned long start, unsigned long end); |
| _ |