| From: Frank van der Linden <fvdl@google.com> |
| Subject: mm/sparse: allow for alternate vmemmap section init at boot |
| Date: Fri, 28 Feb 2025 18:29:11 +0000 |
| |
| Add two functions that are called just before the per-section memmap |
| is initialized and just before the memmap page structures are |
| initialized: sparse_vmemmap_init_nid_early() and |
| sparse_vmemmap_init_nid_late(), respectively. |
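| |
| Condensed from the diff below, the hooks sit in the boot path roughly |
| like this (sketch only; arguments and error handling elided): |
| |
| 	/* mm/sparse.c */ |
| 	static void __init sparse_init_nid(int nid, ...) |
| 	{ |
| 		sparse_usage_init(nid, map_count); |
| 		sparse_buffer_init(map_count * section_map_size(), nid); |
| 		sparse_vmemmap_init_nid_early(nid);	/* new hook */ |
| |
| 		for_each_present_section_nr(pnum_begin, pnum) { |
| 			ms = __nr_to_section(pnum); |
| 			/* sections set up by the hook are skipped */ |
| 			if (!preinited_vmemmap_section(ms)) { |
| 				/* generic __populate_section_memmap() path */ |
| 			} |
| 		} |
| 	} |
| |
| 	/* mm/mm_init.c: free_area_init(), after zones are set up */ |
| 	for_each_node_state(nid, N_MEMORY) |
| 		sparse_vmemmap_init_nid_late(nid);	/* new hook */ |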
| |
| This allows mm subsystems to initialize the memmap and page structures |
| in a specific way when SPARSEMEM_VMEMMAP is used. Specifically, |
| hugetlb can pre-HVO bootmem-allocated pages that way, so that no time |
| and resources are wasted on allocating vmemmap pages, only to free |
| them again later (possibly running the system out of memory in the |
| process). |
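| |
| The hugetlb side arrives in a later commit; purely as a sketch (the |
| iterator and memmap builder below are hypothetical, not part of this |
| patch), a hook could register pre-built per-section memmaps like this: |
| |
| 	void __init sparse_vmemmap_init_nid_early(int nid) |
| 	{ |
| 		unsigned long pnum; |
| 		struct page *map; |
| |
| 		/* hypothetical: walk sections this subsystem pre-built */ |
| 		for_each_my_preinit_section(nid, pnum) { |
| 			map = my_prebuilt_memmap(nid, pnum);	/* hypothetical */ |
| 			if (!map) |
| 				continue; |
| 			/* mark it so sparse_init_nid() skips this section */ |
| 			sparse_init_early_section(nid, map, pnum, |
| 						  SECTION_IS_VMEMMAP_PREINIT); |
| 		} |
| 	} |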
| |
| Refactor some code and export a few convenience functions for external |
| use. |
| |
| In sparse_init_nid, skip any sections that have already been |
| initialized, e.g. by sparse_vmemmap_init_nid_early. |
| |
| The hugetlb code to use these functions will be added in a later commit. |
| |
| Export section_map_size, as any alternate memmap init code will want to |
| use it. |
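| |
| For instance, a pre-init path that wants its per-section memmap sized |
| and aligned like the generic boot allocation might do something along |
| these lines (hypothetical call site; the memblock call mirrors the one |
| used by sparse_buffer_init()): |
| |
| 	map = memblock_alloc_try_nid_raw(section_map_size(), |
| 					 section_map_size(), |
| 					 __pa(MAX_DMA_ADDRESS), |
| 					 MEMBLOCK_ALLOC_ACCESSIBLE, nid); |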
| |
| The internal config option to enable this is SPARSEMEM_VMEMMAP_PREINIT, |
| which is selected if an architecture-specific option, |
| ARCH_WANT_HUGETLB_VMEMMAP_PREINIT, is set. In the future, if other |
| subsystems want to do preinit too, they can do it in a similar fashion. |
| |
| The internal config option is there because a section flag is used, and |
| the number of flags available is architecture-dependent (see mmzone.h). |
| Architectures can decide if there is room for the flag when enabling |
| options that select SPARSEMEM_VMEMMAP_PREINIT. |
| |
| Fortunately, as of right now, all architectures that use sparse vmemmap |
| do have room. |
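| |
| For context, the flags are stolen from the low-order bits of the |
| encoded memmap pointer stored in section_mem_map; a simplified view of |
| the scheme described in mmzone.h: |
| |
| 	/* |
| 	 * section_mem_map layout, simplified: |
| 	 * |
| 	 *   [ encoded memmap pointer | flags 0..SECTION_MAP_LAST_BIT-1 ] |
| 	 * |
| 	 * SECTION_MAP_MASK == ~(BIT(SECTION_MAP_LAST_BIT) - 1) |
| 	 * |
| 	 * Per the comment in mmzone.h, the encoded pointer has at least |
| 	 * 6 zero low bits on all architectures (powerpc with 256k pages |
| 	 * is the worst case), so SECTION_IS_VMEMMAP_PREINIT_BIT fits |
| 	 * wherever the option can currently be selected. |
| 	 */ |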
| |
| Link: https://lkml.kernel.org/r/20250228182928.2645936-11-fvdl@google.com |
| Signed-off-by: Frank van der Linden <fvdl@google.com> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Alexander Gordeev <agordeev@linux.ibm.com> |
| Cc: Andy Lutomirski <luto@kernel.org> |
| Cc: Arnd Bergmann <arnd@arndb.de> |
| Cc: Dan Carpenter <dan.carpenter@linaro.org> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Joao Martins <joao.m.martins@oracle.com> |
| Cc: Madhavan Srinivasan <maddy@linux.ibm.com> |
| Cc: Michael Ellerman <mpe@ellerman.id.au> |
| Cc: Muchun Song <muchun.song@linux.dev> |
| Cc: Oscar Salvador <osalvador@suse.de> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Roman Gushchin (Cruise) <roman.gushchin@linux.dev> |
| Cc: Usama Arif <usamaarif642@gmail.com> |
| Cc: Vasily Gorbik <gor@linux.ibm.com> |
| Cc: Yu Zhao <yuzhao@google.com> |
| Cc: Zi Yan <ziy@nvidia.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| fs/Kconfig | 1 |
| include/linux/mm.h | 1 |
| include/linux/mmzone.h | 35 +++++++++++++++ |
| mm/Kconfig | 6 ++ |
| mm/bootmem_info.c | 4 + |
| mm/mm_init.c | 3 + |
| mm/sparse-vmemmap.c | 23 ++++++++++ |
| mm/sparse.c | 87 +++++++++++++++++++++++++++++---------- |
| 8 files changed, 138 insertions(+), 22 deletions(-) |
| |
| --- a/fs/Kconfig~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/fs/Kconfig |
| @@ -286,6 +286,7 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP |
| def_bool HUGETLB_PAGE |
| depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP |
| depends on SPARSEMEM_VMEMMAP |
| + select SPARSEMEM_VMEMMAP_PREINIT if ARCH_WANT_HUGETLB_VMEMMAP_PREINIT |
| |
| config HUGETLB_PMD_PAGE_TABLE_SHARING |
| def_bool HUGETLB_PAGE |
| --- a/include/linux/mm.h~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/include/linux/mm.h |
| @@ -3928,6 +3928,7 @@ static inline void print_vma_addr(char * |
| #endif |
| |
| void *sparse_buffer_alloc(unsigned long size); |
| +unsigned long section_map_size(void); |
| struct page * __populate_section_memmap(unsigned long pfn, |
| unsigned long nr_pages, int nid, struct vmem_altmap *altmap, |
| struct dev_pagemap *pgmap); |
| --- a/include/linux/mmzone.h~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/include/linux/mmzone.h |
| @@ -1934,6 +1934,9 @@ enum { |
| #ifdef CONFIG_ZONE_DEVICE |
| SECTION_TAINT_ZONE_DEVICE_BIT, |
| #endif |
| +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT |
| + SECTION_IS_VMEMMAP_PREINIT_BIT, |
| +#endif |
| SECTION_MAP_LAST_BIT, |
| }; |
| |
| @@ -1944,6 +1947,9 @@ enum { |
| #ifdef CONFIG_ZONE_DEVICE |
| #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) |
| #endif |
| +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT |
| +#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) |
| +#endif |
| #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) |
| #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT |
| |
| @@ -1998,6 +2004,30 @@ static inline int online_device_section( |
| } |
| #endif |
| |
| +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT |
| +static inline int preinited_vmemmap_section(struct mem_section *section) |
| +{ |
| + return (section && |
| + (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); |
| +} |
| + |
| +void sparse_vmemmap_init_nid_early(int nid); |
| +void sparse_vmemmap_init_nid_late(int nid); |
| + |
| +#else |
| +static inline int preinited_vmemmap_section(struct mem_section *section) |
| +{ |
| + return 0; |
| +} |
| +static inline void sparse_vmemmap_init_nid_early(int nid) |
| +{ |
| +} |
| + |
| +static inline void sparse_vmemmap_init_nid_late(int nid) |
| +{ |
| +} |
| +#endif |
| + |
| static inline int online_section_nr(unsigned long nr) |
| { |
| return online_section(__nr_to_section(nr)); |
| @@ -2035,6 +2065,9 @@ static inline int pfn_section_valid(stru |
| } |
| #endif |
| |
| +void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, |
| + unsigned long flags); |
| + |
| #ifndef CONFIG_HAVE_ARCH_PFN_VALID |
| /** |
| * pfn_valid - check if there is a valid memory map entry for a PFN |
| @@ -2116,6 +2149,8 @@ void sparse_init(void); |
| #else |
| #define sparse_init() do {} while (0) |
| #define sparse_index_init(_sec, _nid) do {} while (0) |
| +#define sparse_vmemmap_init_nid_early(_nid) do {} while (0) |
| +#define sparse_vmemmap_init_nid_late(_nid) do {} while (0) |
| #define pfn_in_present_section pfn_valid |
| #define subsection_map_init(_pfn, _nr_pages) do {} while (0) |
| #endif /* CONFIG_SPARSEMEM */ |
| --- a/mm/bootmem_info.c~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/mm/bootmem_info.c |
| @@ -88,7 +88,9 @@ static void __init register_page_bootmem |
| |
| memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); |
| |
| - register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); |
| + if (!preinited_vmemmap_section(ms)) |
| + register_page_bootmem_memmap(section_nr, memmap, |
| + PAGES_PER_SECTION); |
| |
| usage = ms->usage; |
| page = virt_to_page(usage); |
| --- a/mm/Kconfig~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/mm/Kconfig |
| @@ -442,6 +442,9 @@ config SPARSEMEM_VMEMMAP |
| SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise |
| pfn_to_page and page_to_pfn operations. This is the most |
| efficient option when sufficient kernel resources are available. |
| + |
| +config SPARSEMEM_VMEMMAP_PREINIT |
| + bool |
| # |
| # Select this config option from the architecture Kconfig, if it is preferred |
| # to enable the feature of HugeTLB/dev_dax vmemmap optimization. |
| @@ -452,6 +455,9 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP |
| config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP |
| bool |
| |
| +config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT |
| + bool |
| + |
| config HAVE_MEMBLOCK_PHYS_MAP |
| bool |
| |
| --- a/mm/mm_init.c~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/mm/mm_init.c |
| @@ -1862,6 +1862,9 @@ void __init free_area_init(unsigned long |
| } |
| } |
| |
| + for_each_node_state(nid, N_MEMORY) |
| + sparse_vmemmap_init_nid_late(nid); |
| + |
| calc_nr_kernel_pages(); |
| memmap_init(); |
| |
| --- a/mm/sparse.c~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/mm/sparse.c |
| @@ -408,13 +408,13 @@ static void __init check_usemap_section_ |
| #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| |
| #ifdef CONFIG_SPARSEMEM_VMEMMAP |
| -static unsigned long __init section_map_size(void) |
| +unsigned long __init section_map_size(void) |
| { |
| return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); |
| } |
| |
| #else |
| -static unsigned long __init section_map_size(void) |
| +unsigned long __init section_map_size(void) |
| { |
| return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
| } |
| @@ -495,6 +495,44 @@ void __weak __meminit vmemmap_populate_p |
| { |
| } |
| |
| +static void *sparse_usagebuf __meminitdata; |
| +static void *sparse_usagebuf_end __meminitdata; |
| + |
| +/* |
| + * Helper function that is used for generic section initialization, and |
| + * can also be used by any hooks added above. |
| + */ |
| +void __init sparse_init_early_section(int nid, struct page *map, |
| + unsigned long pnum, unsigned long flags) |
| +{ |
| + BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end); |
| + check_usemap_section_nr(nid, sparse_usagebuf); |
| + sparse_init_one_section(__nr_to_section(pnum), pnum, map, |
| + sparse_usagebuf, SECTION_IS_EARLY | flags); |
| + sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size(); |
| +} |
| + |
| +static int __init sparse_usage_init(int nid, unsigned long map_count) |
| +{ |
| + unsigned long size; |
| + |
| + size = mem_section_usage_size() * map_count; |
| + sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section( |
| + NODE_DATA(nid), size); |
| + if (!sparse_usagebuf) { |
| + sparse_usagebuf_end = NULL; |
| + return -ENOMEM; |
| + } |
| + |
| + sparse_usagebuf_end = sparse_usagebuf + size; |
| + return 0; |
| +} |
| + |
| +static void __init sparse_usage_fini(void) |
| +{ |
| + sparse_usagebuf = sparse_usagebuf_end = NULL; |
| +} |
| + |
| /* |
| * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) |
| * And number of present sections in this node is map_count. |
| @@ -503,47 +541,54 @@ static void __init sparse_init_nid(int n |
| unsigned long pnum_end, |
| unsigned long map_count) |
| { |
| - struct mem_section_usage *usage; |
| unsigned long pnum; |
| struct page *map; |
| + struct mem_section *ms; |
| |
| - usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), |
| - mem_section_usage_size() * map_count); |
| - if (!usage) { |
| + if (sparse_usage_init(nid, map_count)) { |
| pr_err("%s: node[%d] usemap allocation failed", __func__, nid); |
| goto failed; |
| } |
| + |
| sparse_buffer_init(map_count * section_map_size(), nid); |
| + |
| + sparse_vmemmap_init_nid_early(nid); |
| + |
| for_each_present_section_nr(pnum_begin, pnum) { |
| unsigned long pfn = section_nr_to_pfn(pnum); |
| |
| if (pnum >= pnum_end) |
| break; |
| |
| - map = __populate_section_memmap(pfn, PAGES_PER_SECTION, |
| - nid, NULL, NULL); |
| - if (!map) { |
| - pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", |
| - __func__, nid); |
| - pnum_begin = pnum; |
| - sparse_buffer_fini(); |
| - goto failed; |
| + ms = __nr_to_section(pnum); |
| + if (!preinited_vmemmap_section(ms)) { |
| + map = __populate_section_memmap(pfn, PAGES_PER_SECTION, |
| + nid, NULL, NULL); |
| + if (!map) { |
| + pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", |
| + __func__, nid); |
| + pnum_begin = pnum; |
| + sparse_usage_fini(); |
| + sparse_buffer_fini(); |
| + goto failed; |
| + } |
| + sparse_init_early_section(nid, map, pnum, 0); |
| } |
| - check_usemap_section_nr(nid, usage); |
| - sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage, |
| - SECTION_IS_EARLY); |
| - usage = (void *) usage + mem_section_usage_size(); |
| } |
| + sparse_usage_fini(); |
| sparse_buffer_fini(); |
| return; |
| failed: |
| - /* We failed to allocate, mark all the following pnums as not present */ |
| + /* |
| + * We failed to allocate, mark all the following pnums as not present, |
| + * except the ones already initialized earlier. |
| + */ |
| for_each_present_section_nr(pnum_begin, pnum) { |
| - struct mem_section *ms; |
| - |
| if (pnum >= pnum_end) |
| break; |
| ms = __nr_to_section(pnum); |
| -		ms->section_mem_map = 0; |
| +		if (!preinited_vmemmap_section(ms)) |
| +			ms->section_mem_map = 0; |
| } |
| } |
| --- a/mm/sparse-vmemmap.c~mm-sparse-allow-for-alternate-vmemmap-section-init-at-boot |
| +++ a/mm/sparse-vmemmap.c |
| @@ -470,3 +470,26 @@ struct page * __meminit __populate_secti |
| |
| return pfn_to_page(pfn); |
| } |
| + |
| +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT |
| +/* |
| + * This is called just before initializing sections for a NUMA node. |
| + * Any special initialization that needs to be done before the |
| + * generic initialization can be done from here. Sections that |
| + * are initialized in hooks called from here will be skipped by |
| + * the generic initialization. |
| + */ |
| +void __init sparse_vmemmap_init_nid_early(int nid) |
| +{ |
| +} |
| + |
| +/* |
| + * This is called just before the initialization of page structures |
| + * through memmap_init. Zones are now initialized, so any work that |
| + * needs to be done that needs zone information can be done from |
| + * here. |
| + */ |
| +void __init sparse_vmemmap_init_nid_late(int nid) |
| +{ |
| +} |
| +#endif |
| _ |