| From: "Mike Rapoport (Microsoft)" <rppt@kernel.org> |
| Subject: mm: introduce numa_emulation |
| Date: Wed, 7 Aug 2024 09:41:03 +0300 |
| |
| Move the numa_emulation code from arch/x86 to mm/numa_emulation.c. |
| |
| This code will later be reused by arch_numa. |
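| |
| The generic version relies on a few hooks that each architecture has to |
| provide: numa_emu_cmdline(), numa_emu_update_cpu_to_node() and |
| numa_emu_dma_end(), whose declarations move to <linux/numa_memblks.h> |
| below.  As an illustrative sketch only (not part of this patch), an |
| x86-style implementation of the DMA hook could look like: |
| |
| 	u64 __init numa_emu_dma_end(void) |
| 	{ |
| 		/* assumed for illustration: ZONE_DMA32 ends at MAX_DMA32_PFN */ |
| 		return PFN_PHYS(MAX_DMA32_PFN); |
| 	} |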
| |
| No functional changes. |
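| |
| For reference, the moved code implements the "numa=fake" command line |
| option; numa_emulation() accepts the following forms (the counts and |
| sizes here are example values only): |
| |
| 	numa=fake=8	- split system RAM into 8 interleaved fake nodes |
| 	numa=fake=2G	- interleave fake nodes of 2G each |
| 	numa=fake=4U	- split each physical node into 4 uniform nodes |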
| |
| Link: https://lkml.kernel.org/r/20240807064110.1003856-20-rppt@kernel.org |
| Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> |
| Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 |
| Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> |
| Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] |
| Acked-by: Dan Williams <dan.j.williams@intel.com> |
| Cc: Alexander Gordeev <agordeev@linux.ibm.com> |
| Cc: Andreas Larsson <andreas@gaisler.com> |
| Cc: Arnd Bergmann <arnd@arndb.de> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Christophe Leroy <christophe.leroy@csgroup.eu> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Davidlohr Bueso <dave@stgolabs.net> |
| Cc: David S. Miller <davem@davemloft.net> |
| Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Huacai Chen <chenhuacai@kernel.org> |
| Cc: Ingo Molnar <mingo@redhat.com> |
| Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> |
| Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> |
| Cc: Jonathan Corbet <corbet@lwn.net> |
| Cc: Michael Ellerman <mpe@ellerman.id.au> |
| Cc: Palmer Dabbelt <palmer@dabbelt.com> |
| Cc: Rafael J. Wysocki <rafael@kernel.org> |
| Cc: Rob Herring (Arm) <robh@kernel.org> |
| Cc: Samuel Holland <samuel.holland@sifive.com> |
| Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Vasily Gorbik <gor@linux.ibm.com> |
| Cc: Will Deacon <will@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| arch/x86/Kconfig | 8 |
| arch/x86/include/asm/numa.h | 12 |
| arch/x86/mm/Makefile | 1 |
| arch/x86/mm/numa_emulation.c | 573 --------------------------------- |
| arch/x86/mm/numa_internal.h | 11 |
| include/linux/numa_memblks.h | 17 |
| mm/Kconfig | 8 |
| mm/Makefile | 1 |
| mm/numa_emulation.c | 571 ++++++++++++++++++++++++++++++++ |
| 9 files changed, 597 insertions(+), 605 deletions(-) |
| |
| --- a/arch/x86/include/asm/numa.h~mm-introduce-numa_emulation |
| +++ a/arch/x86/include/asm/numa.h |
| @@ -65,16 +65,4 @@ static inline void init_gi_nodes(void) |
| void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable); |
| #endif |
| |
| -#ifdef CONFIG_NUMA_EMU |
| -int numa_emu_cmdline(char *str); |
| -void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, |
| - unsigned int nr_emu_nids); |
| -u64 __init numa_emu_dma_end(void); |
| -#else /* CONFIG_NUMA_EMU */ |
| -static inline int numa_emu_cmdline(char *str) |
| -{ |
| - return -EINVAL; |
| -} |
| -#endif /* CONFIG_NUMA_EMU */ |
| - |
| #endif /* _ASM_X86_NUMA_H */ |
| --- a/arch/x86/Kconfig~mm-introduce-numa_emulation |
| +++ a/arch/x86/Kconfig |
| @@ -1600,14 +1600,6 @@ config X86_64_ACPI_NUMA |
| help |
| Enable ACPI SRAT based node topology detection. |
| |
| -config NUMA_EMU |
| - bool "NUMA emulation" |
| - depends on NUMA |
| - help |
| - Enable NUMA emulation. A flat machine will be split |
| - into virtual nodes when booted with "numa=fake=N", where N is the |
| - number of nodes. This is only useful for debugging. |
| - |
| config NODES_SHIFT |
| int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP |
| range 1 10 |
| --- a/arch/x86/mm/Makefile~mm-introduce-numa_emulation |
| +++ a/arch/x86/mm/Makefile |
| @@ -57,7 +57,6 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmio |
| obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o |
| obj-$(CONFIG_AMD_NUMA) += amdtopology.o |
| obj-$(CONFIG_ACPI_NUMA) += srat.o |
| -obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
| |
| obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o |
| obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o |
| diff --git a/arch/x86/mm/numa_emulation.c a/arch/x86/mm/numa_emulation.c |
| deleted file mode 100644 |
| --- a/arch/x86/mm/numa_emulation.c |
| +++ /dev/null |
| @@ -1,573 +0,0 @@ |
| -// SPDX-License-Identifier: GPL-2.0 |
| -/* |
| - * NUMA emulation |
| - */ |
| -#include <linux/kernel.h> |
| -#include <linux/errno.h> |
| -#include <linux/topology.h> |
| -#include <linux/memblock.h> |
| -#include <linux/numa_memblks.h> |
| -#include <asm/dma.h> |
| - |
| -#include "numa_internal.h" |
| - |
| -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) |
| -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) |
| - |
| -static int emu_nid_to_phys[MAX_NUMNODES]; |
| -static char *emu_cmdline __initdata; |
| - |
| -int __init numa_emu_cmdline(char *str) |
| -{ |
| - emu_cmdline = str; |
| - return 0; |
| -} |
| - |
| -static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) |
| -{ |
| - int i; |
| - |
| - for (i = 0; i < mi->nr_blks; i++) |
| - if (mi->blk[i].nid == nid) |
| - return i; |
| - return -ENOENT; |
| -} |
| - |
| -static u64 __init mem_hole_size(u64 start, u64 end) |
| -{ |
| - unsigned long start_pfn = PFN_UP(start); |
| - unsigned long end_pfn = PFN_DOWN(end); |
| - |
| - if (start_pfn < end_pfn) |
| - return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); |
| - return 0; |
| -} |
| - |
| -/* |
| - * Sets up nid to range from @start to @end. The return value is -errno if |
| - * something went wrong, 0 otherwise. |
| - */ |
| -static int __init emu_setup_memblk(struct numa_meminfo *ei, |
| - struct numa_meminfo *pi, |
| - int nid, int phys_blk, u64 size) |
| -{ |
| - struct numa_memblk *eb = &ei->blk[ei->nr_blks]; |
| - struct numa_memblk *pb = &pi->blk[phys_blk]; |
| - |
| - if (ei->nr_blks >= NR_NODE_MEMBLKS) { |
| - pr_err("NUMA: Too many emulated memblks, failing emulation\n"); |
| - return -EINVAL; |
| - } |
| - |
| - ei->nr_blks++; |
| - eb->start = pb->start; |
| - eb->end = pb->start + size; |
| - eb->nid = nid; |
| - |
| - if (emu_nid_to_phys[nid] == NUMA_NO_NODE) |
| - emu_nid_to_phys[nid] = pb->nid; |
| - |
| - pb->start += size; |
| - if (pb->start >= pb->end) { |
| - WARN_ON_ONCE(pb->start > pb->end); |
| - numa_remove_memblk_from(phys_blk, pi); |
| - } |
| - |
| - printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", |
| - nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); |
| - return 0; |
| -} |
| - |
| -/* |
| - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr |
| - * to max_addr. |
| - * |
| - * Returns zero on success or negative on error. |
| - */ |
| -static int __init split_nodes_interleave(struct numa_meminfo *ei, |
| - struct numa_meminfo *pi, |
| - u64 addr, u64 max_addr, int nr_nodes) |
| -{ |
| - nodemask_t physnode_mask = numa_nodes_parsed; |
| - u64 size; |
| - int big; |
| - int nid = 0; |
| - int i, ret; |
| - |
| - if (nr_nodes <= 0) |
| - return -1; |
| - if (nr_nodes > MAX_NUMNODES) { |
| - pr_info("numa=fake=%d too large, reducing to %d\n", |
| - nr_nodes, MAX_NUMNODES); |
| - nr_nodes = MAX_NUMNODES; |
| - } |
| - |
| - /* |
| - * Calculate target node size. x86_32 freaks on __udivdi3() so do |
| - * the division in ulong number of pages and convert back. |
| - */ |
| - size = max_addr - addr - mem_hole_size(addr, max_addr); |
| - size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); |
| - |
| - /* |
| - * Calculate the number of big nodes that can be allocated as a result |
| - * of consolidating the remainder. |
| - */ |
| - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / |
| - FAKE_NODE_MIN_SIZE; |
| - |
| - size &= FAKE_NODE_MIN_HASH_MASK; |
| - if (!size) { |
| - pr_err("Not enough memory for each node. " |
| - "NUMA emulation disabled.\n"); |
| - return -1; |
| - } |
| - |
| - /* |
| - * Continue to fill physical nodes with fake nodes until there is no |
| - * memory left on any of them. |
| - */ |
| - while (!nodes_empty(physnode_mask)) { |
| - for_each_node_mask(i, physnode_mask) { |
| - u64 dma32_end = numa_emu_dma_end(); |
| - u64 start, limit, end; |
| - int phys_blk; |
| - |
| - phys_blk = emu_find_memblk_by_nid(i, pi); |
| - if (phys_blk < 0) { |
| - node_clear(i, physnode_mask); |
| - continue; |
| - } |
| - start = pi->blk[phys_blk].start; |
| - limit = pi->blk[phys_blk].end; |
| - end = start + size; |
| - |
| - if (nid < big) |
| - end += FAKE_NODE_MIN_SIZE; |
| - |
| - /* |
| - * Continue to add memory to this fake node if its |
| - * non-reserved memory is less than the per-node size. |
| - */ |
| - while (end - start - mem_hole_size(start, end) < size) { |
| - end += FAKE_NODE_MIN_SIZE; |
| - if (end > limit) { |
| - end = limit; |
| - break; |
| - } |
| - } |
| - |
| - /* |
| - * If there won't be at least FAKE_NODE_MIN_SIZE of |
| - * non-reserved memory in ZONE_DMA32 for the next node, |
| - * this one must extend to the boundary. |
| - */ |
| - if (end < dma32_end && dma32_end - end - |
| - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
| - end = dma32_end; |
| - |
| - /* |
| - * If there won't be enough non-reserved memory for the |
| - * next node, this one must extend to the end of the |
| - * physical node. |
| - */ |
| - if (limit - end - mem_hole_size(end, limit) < size) |
| - end = limit; |
| - |
| - ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, |
| - phys_blk, |
| - min(end, limit) - start); |
| - if (ret < 0) |
| - return ret; |
| - } |
| - } |
| - return 0; |
| -} |
| - |
| -/* |
| - * Returns the end address of a node so that there is at least `size' amount of |
| - * non-reserved memory or `max_addr' is reached. |
| - */ |
| -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) |
| -{ |
| - u64 end = start + size; |
| - |
| - while (end - start - mem_hole_size(start, end) < size) { |
| - end += FAKE_NODE_MIN_SIZE; |
| - if (end > max_addr) { |
| - end = max_addr; |
| - break; |
| - } |
| - } |
| - return end; |
| -} |
| - |
| -static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) |
| -{ |
| - unsigned long max_pfn = PHYS_PFN(max_addr); |
| - unsigned long base_pfn = PHYS_PFN(base); |
| - unsigned long hole_pfns = PHYS_PFN(hole); |
| - |
| - return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); |
| -} |
| - |
| -/* |
| - * Sets up fake nodes of `size' interleaved over physical nodes ranging from |
| - * `addr' to `max_addr'. |
| - * |
| - * Returns zero on success or negative on error. |
| - */ |
| -static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, |
| - struct numa_meminfo *pi, |
| - u64 addr, u64 max_addr, u64 size, |
| - int nr_nodes, struct numa_memblk *pblk, |
| - int nid) |
| -{ |
| - nodemask_t physnode_mask = numa_nodes_parsed; |
| - int i, ret, uniform = 0; |
| - u64 min_size; |
| - |
| - if ((!size && !nr_nodes) || (nr_nodes && !pblk)) |
| - return -1; |
| - |
| - /* |
| - * In the 'uniform' case split the passed in physical node by |
| - * nr_nodes, in the non-uniform case, ignore the passed in |
| - * physical block and try to create nodes of at least size |
| - * @size. |
| - * |
| - * In the uniform case, split the nodes strictly by physical |
| - * capacity, i.e. ignore holes. In the non-uniform case account |
| - * for holes and treat @size as a minimum floor. |
| - */ |
| - if (!nr_nodes) |
| - nr_nodes = MAX_NUMNODES; |
| - else { |
| - nodes_clear(physnode_mask); |
| - node_set(pblk->nid, physnode_mask); |
| - uniform = 1; |
| - } |
| - |
| - if (uniform) { |
| - min_size = uniform_size(max_addr, addr, 0, nr_nodes); |
| - size = min_size; |
| - } else { |
| - /* |
| - * The limit on emulated nodes is MAX_NUMNODES, so the |
| - * size per node is increased accordingly if the |
| - * requested size is too small. This creates a uniform |
| - * distribution of node sizes across the entire machine |
| - * (but not necessarily over physical nodes). |
| - */ |
| - min_size = uniform_size(max_addr, addr, |
| - mem_hole_size(addr, max_addr), nr_nodes); |
| - } |
| - min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); |
| - if (size < min_size) { |
| - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", |
| - size >> 20, min_size >> 20); |
| - size = min_size; |
| - } |
| - size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); |
| - |
| - /* |
| - * Fill physical nodes with fake nodes of size until there is no memory |
| - * left on any of them. |
| - */ |
| - while (!nodes_empty(physnode_mask)) { |
| - for_each_node_mask(i, physnode_mask) { |
| - u64 dma32_end = numa_emu_dma_end(); |
| - u64 start, limit, end; |
| - int phys_blk; |
| - |
| - phys_blk = emu_find_memblk_by_nid(i, pi); |
| - if (phys_blk < 0) { |
| - node_clear(i, physnode_mask); |
| - continue; |
| - } |
| - |
| - start = pi->blk[phys_blk].start; |
| - limit = pi->blk[phys_blk].end; |
| - |
| - if (uniform) |
| - end = start + size; |
| - else |
| - end = find_end_of_node(start, limit, size); |
| - /* |
| - * If there won't be at least FAKE_NODE_MIN_SIZE of |
| - * non-reserved memory in ZONE_DMA32 for the next node, |
| - * this one must extend to the boundary. |
| - */ |
| - if (end < dma32_end && dma32_end - end - |
| - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
| - end = dma32_end; |
| - |
| - /* |
| - * If there won't be enough non-reserved memory for the |
| - * next node, this one must extend to the end of the |
| - * physical node. |
| - */ |
| - if ((limit - end - mem_hole_size(end, limit) < size) |
| - && !uniform) |
| - end = limit; |
| - |
| - ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, |
| - phys_blk, |
| - min(end, limit) - start); |
| - if (ret < 0) |
| - return ret; |
| - } |
| - } |
| - return nid; |
| -} |
| - |
| -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, |
| - struct numa_meminfo *pi, |
| - u64 addr, u64 max_addr, u64 size) |
| -{ |
| - return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, |
| - 0, NULL, 0); |
| -} |
| - |
| -static int __init setup_emu2phys_nid(int *dfl_phys_nid) |
| -{ |
| - int i, max_emu_nid = 0; |
| - |
| - *dfl_phys_nid = NUMA_NO_NODE; |
| - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { |
| - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { |
| - max_emu_nid = i; |
| - if (*dfl_phys_nid == NUMA_NO_NODE) |
| - *dfl_phys_nid = emu_nid_to_phys[i]; |
| - } |
| - } |
| - |
| - return max_emu_nid; |
| -} |
| - |
| -/** |
| - * numa_emulation - Emulate NUMA nodes |
| - * @numa_meminfo: NUMA configuration to massage |
| - * @numa_dist_cnt: The size of the physical NUMA distance table |
| - * |
| - * Emulate NUMA nodes according to the numa=fake kernel parameter. |
| - * @numa_meminfo contains the physical memory configuration and is modified |
| - * to reflect the emulated configuration on success. @numa_dist_cnt is |
| - * used to determine the size of the physical distance table. |
| - * |
| - * On success, the following modifications are made. |
| - * |
| - * - @numa_meminfo is updated to reflect the emulated nodes. |
| - * |
| - * - __apicid_to_node[] is updated such that APIC IDs are mapped to the |
| - * emulated nodes. |
| - * |
| - * - NUMA distance table is rebuilt to represent distances between emulated |
| - * nodes. The distances are determined considering how emulated nodes |
| - * are mapped to physical nodes and match the actual distances. |
| - * |
| - * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical |
| - * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). |
| - * |
| - * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with |
| - * identity mapping and no other modification is made. |
| - */ |
| -void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) |
| -{ |
| - static struct numa_meminfo ei __initdata; |
| - static struct numa_meminfo pi __initdata; |
| - const u64 max_addr = PFN_PHYS(max_pfn); |
| - u8 *phys_dist = NULL; |
| - size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); |
| - int max_emu_nid, dfl_phys_nid; |
| - int i, j, ret; |
| - |
| - if (!emu_cmdline) |
| - goto no_emu; |
| - |
| - memset(&ei, 0, sizeof(ei)); |
| - pi = *numa_meminfo; |
| - |
| - for (i = 0; i < MAX_NUMNODES; i++) |
| - emu_nid_to_phys[i] = NUMA_NO_NODE; |
| - |
| - /* |
| - * If the numa=fake command-line contains a 'M' or 'G', it represents |
| - * the fixed node size. Otherwise, if it is just a single number N, |
| - * split the system RAM into N fake nodes. |
| - */ |
| - if (strchr(emu_cmdline, 'U')) { |
| - nodemask_t physnode_mask = numa_nodes_parsed; |
| - unsigned long n; |
| - int nid = 0; |
| - |
| - n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
| - ret = -1; |
| - for_each_node_mask(i, physnode_mask) { |
| - /* |
| - * The reason we pass in blk[0] is due to |
| - * numa_remove_memblk_from() called by |
| - * emu_setup_memblk() will delete entry 0 |
| - * and then move everything else up in the pi.blk |
| - * array. Therefore we should always be looking |
| - * at blk[0]. |
| - */ |
| - ret = split_nodes_size_interleave_uniform(&ei, &pi, |
| - pi.blk[0].start, pi.blk[0].end, 0, |
| - n, &pi.blk[0], nid); |
| - if (ret < 0) |
| - break; |
| - if (ret < n) { |
| - pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", |
| - __func__, i, ret, n); |
| - ret = -1; |
| - break; |
| - } |
| - nid = ret; |
| - } |
| - } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { |
| - u64 size; |
| - |
| - size = memparse(emu_cmdline, &emu_cmdline); |
| - ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); |
| - } else { |
| - unsigned long n; |
| - |
| - n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
| - ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); |
| - } |
| - if (*emu_cmdline == ':') |
| - emu_cmdline++; |
| - |
| - if (ret < 0) |
| - goto no_emu; |
| - |
| - if (numa_cleanup_meminfo(&ei) < 0) { |
| - pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); |
| - goto no_emu; |
| - } |
| - |
| - /* copy the physical distance table */ |
| - if (numa_dist_cnt) { |
| - phys_dist = memblock_alloc(phys_size, PAGE_SIZE); |
| - if (!phys_dist) { |
| - pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); |
| - goto no_emu; |
| - } |
| - |
| - for (i = 0; i < numa_dist_cnt; i++) |
| - for (j = 0; j < numa_dist_cnt; j++) |
| - phys_dist[i * numa_dist_cnt + j] = |
| - node_distance(i, j); |
| - } |
| - |
| - /* |
| - * Determine the max emulated nid and the default phys nid to use |
| - * for unmapped nodes. |
| - */ |
| - max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); |
| - |
| - /* commit */ |
| - *numa_meminfo = ei; |
| - |
| - /* Make sure numa_nodes_parsed only contains emulated nodes */ |
| - nodes_clear(numa_nodes_parsed); |
| - for (i = 0; i < ARRAY_SIZE(ei.blk); i++) |
| - if (ei.blk[i].start != ei.blk[i].end && |
| - ei.blk[i].nid != NUMA_NO_NODE) |
| - node_set(ei.blk[i].nid, numa_nodes_parsed); |
| - |
| - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); |
| - |
| - /* make sure all emulated nodes are mapped to a physical node */ |
| - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) |
| - if (emu_nid_to_phys[i] == NUMA_NO_NODE) |
| - emu_nid_to_phys[i] = dfl_phys_nid; |
| - |
| - /* transform distance table */ |
| - numa_reset_distance(); |
| - for (i = 0; i < max_emu_nid + 1; i++) { |
| - for (j = 0; j < max_emu_nid + 1; j++) { |
| - int physi = emu_nid_to_phys[i]; |
| - int physj = emu_nid_to_phys[j]; |
| - int dist; |
| - |
| - if (get_option(&emu_cmdline, &dist) == 2) |
| - ; |
| - else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) |
| - dist = physi == physj ? |
| - LOCAL_DISTANCE : REMOTE_DISTANCE; |
| - else |
| - dist = phys_dist[physi * numa_dist_cnt + physj]; |
| - |
| - numa_set_distance(i, j, dist); |
| - } |
| - } |
| - |
| - /* free the copied physical distance table */ |
| - memblock_free(phys_dist, phys_size); |
| - return; |
| - |
| -no_emu: |
| - /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ |
| - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) |
| - emu_nid_to_phys[i] = i; |
| -} |
| - |
| -#ifndef CONFIG_DEBUG_PER_CPU_MAPS |
| -void numa_add_cpu(unsigned int cpu) |
| -{ |
| - int physnid, nid; |
| - |
| - nid = early_cpu_to_node(cpu); |
| - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); |
| - |
| - physnid = emu_nid_to_phys[nid]; |
| - |
| - /* |
| - * Map the cpu to each emulated node that is allocated on the physical |
| - * node of the cpu's apic id. |
| - */ |
| - for_each_online_node(nid) |
| - if (emu_nid_to_phys[nid] == physnid) |
| - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); |
| -} |
| - |
| -void numa_remove_cpu(unsigned int cpu) |
| -{ |
| - int i; |
| - |
| - for_each_online_node(i) |
| - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); |
| -} |
| -#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ |
| -static void numa_set_cpumask(unsigned int cpu, bool enable) |
| -{ |
| - int nid, physnid; |
| - |
| - nid = early_cpu_to_node(cpu); |
| - if (nid == NUMA_NO_NODE) { |
| - /* early_cpu_to_node() already emits a warning and trace */ |
| - return; |
| - } |
| - |
| - physnid = emu_nid_to_phys[nid]; |
| - |
| - for_each_online_node(nid) { |
| - if (emu_nid_to_phys[nid] != physnid) |
| - continue; |
| - |
| - debug_cpumask_set_cpu(cpu, nid, enable); |
| - } |
| -} |
| - |
| -void numa_add_cpu(unsigned int cpu) |
| -{ |
| - numa_set_cpumask(cpu, true); |
| -} |
| - |
| -void numa_remove_cpu(unsigned int cpu) |
| -{ |
| - numa_set_cpumask(cpu, false); |
| -} |
| -#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ |
| --- a/arch/x86/mm/numa_internal.h~mm-introduce-numa_emulation |
| +++ a/arch/x86/mm/numa_internal.h |
| @@ -7,15 +7,4 @@ |
| |
| void __init x86_numa_init(void); |
| |
| -struct numa_meminfo; |
| - |
| -#ifdef CONFIG_NUMA_EMU |
| -void __init numa_emulation(struct numa_meminfo *numa_meminfo, |
| - int numa_dist_cnt); |
| -#else |
| -static inline void numa_emulation(struct numa_meminfo *numa_meminfo, |
| - int numa_dist_cnt) |
| -{ } |
| -#endif |
| - |
| #endif /* __X86_MM_NUMA_INTERNAL_H */ |
| --- a/include/linux/numa_memblks.h~mm-introduce-numa_emulation |
| +++ a/include/linux/numa_memblks.h |
| @@ -34,6 +34,23 @@ int __init numa_register_meminfo(struct |
| void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, |
| const struct numa_meminfo *mi); |
| |
| +#ifdef CONFIG_NUMA_EMU |
| +int numa_emu_cmdline(char *str); |
| +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, |
| + unsigned int nr_emu_nids); |
| +u64 __init numa_emu_dma_end(void); |
| +void __init numa_emulation(struct numa_meminfo *numa_meminfo, |
| + int numa_dist_cnt); |
| +#else |
| +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, |
| + int numa_dist_cnt) |
| +{ } |
| +static inline int numa_emu_cmdline(char *str) |
| +{ |
| + return -EINVAL; |
| +} |
| +#endif /* CONFIG_NUMA_EMU */ |
| + |
| #endif /* CONFIG_NUMA_MEMBLKS */ |
| |
| #endif /* __NUMA_MEMBLKS_H */ |
| --- a/mm/Kconfig~mm-introduce-numa_emulation |
| +++ a/mm/Kconfig |
| @@ -1270,6 +1270,14 @@ config EXECMEM |
| config NUMA_MEMBLKS |
| bool |
| |
| +config NUMA_EMU |
| + bool "NUMA emulation" |
| + depends on NUMA_MEMBLKS |
| + help |
| + Enable NUMA emulation. A flat machine will be split |
| + into virtual nodes when booted with "numa=fake=N", where N is the |
| + number of nodes. This is only useful for debugging. |
| + |
| source "mm/damon/Kconfig" |
| |
| endmenu |
| --- a/mm/Makefile~mm-introduce-numa_emulation |
| +++ a/mm/Makefile |
| @@ -119,6 +119,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += e |
| obj-$(CONFIG_CMA) += cma.o |
| obj-$(CONFIG_NUMA) += numa.o |
| obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o |
| +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
| obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o |
| obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o |
| obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o |
| diff --git a/mm/numa_emulation.c a/mm/numa_emulation.c |
| new file mode 100644 |
| --- /dev/null |
| +++ a/mm/numa_emulation.c |
| @@ -0,0 +1,571 @@ |
| +// SPDX-License-Identifier: GPL-2.0 |
| +/* |
| + * NUMA emulation |
| + */ |
| +#include <linux/kernel.h> |
| +#include <linux/errno.h> |
| +#include <linux/topology.h> |
| +#include <linux/memblock.h> |
| +#include <linux/numa_memblks.h> |
| +#include <asm/numa.h> |
| + |
| +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) |
| +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) |
| + |
| +static int emu_nid_to_phys[MAX_NUMNODES]; |
| +static char *emu_cmdline __initdata; |
| + |
| +int __init numa_emu_cmdline(char *str) |
| +{ |
| + emu_cmdline = str; |
| + return 0; |
| +} |
| + |
| +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) |
| +{ |
| + int i; |
| + |
| + for (i = 0; i < mi->nr_blks; i++) |
| + if (mi->blk[i].nid == nid) |
| + return i; |
| + return -ENOENT; |
| +} |
| + |
| +static u64 __init mem_hole_size(u64 start, u64 end) |
| +{ |
| + unsigned long start_pfn = PFN_UP(start); |
| + unsigned long end_pfn = PFN_DOWN(end); |
| + |
| + if (start_pfn < end_pfn) |
| + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); |
| + return 0; |
| +} |
| + |
| +/* |
| + * Sets up nid to range from @start to @end. The return value is -errno if |
| + * something went wrong, 0 otherwise. |
| + */ |
| +static int __init emu_setup_memblk(struct numa_meminfo *ei, |
| + struct numa_meminfo *pi, |
| + int nid, int phys_blk, u64 size) |
| +{ |
| + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; |
| + struct numa_memblk *pb = &pi->blk[phys_blk]; |
| + |
| + if (ei->nr_blks >= NR_NODE_MEMBLKS) { |
| + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); |
| + return -EINVAL; |
| + } |
| + |
| + ei->nr_blks++; |
| + eb->start = pb->start; |
| + eb->end = pb->start + size; |
| + eb->nid = nid; |
| + |
| + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) |
| + emu_nid_to_phys[nid] = pb->nid; |
| + |
| + pb->start += size; |
| + if (pb->start >= pb->end) { |
| + WARN_ON_ONCE(pb->start > pb->end); |
| + numa_remove_memblk_from(phys_blk, pi); |
| + } |
| + |
| + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", |
| + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); |
| + return 0; |
| +} |
| + |
| +/* |
| + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr |
| + * to max_addr. |
| + * |
| + * Returns zero on success or negative on error. |
| + */ |
| +static int __init split_nodes_interleave(struct numa_meminfo *ei, |
| + struct numa_meminfo *pi, |
| + u64 addr, u64 max_addr, int nr_nodes) |
| +{ |
| + nodemask_t physnode_mask = numa_nodes_parsed; |
| + u64 size; |
| + int big; |
| + int nid = 0; |
| + int i, ret; |
| + |
| + if (nr_nodes <= 0) |
| + return -1; |
| + if (nr_nodes > MAX_NUMNODES) { |
| + pr_info("numa=fake=%d too large, reducing to %d\n", |
| + nr_nodes, MAX_NUMNODES); |
| + nr_nodes = MAX_NUMNODES; |
| + } |
| + |
| + /* |
| + * Calculate target node size. x86_32 freaks on __udivdi3() so do |
| + * the division in ulong number of pages and convert back. |
| + */ |
| + size = max_addr - addr - mem_hole_size(addr, max_addr); |
| + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); |
| + |
| + /* |
| + * Calculate the number of big nodes that can be allocated as a result |
| + * of consolidating the remainder. |
| + */ |
| + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / |
| + FAKE_NODE_MIN_SIZE; |
| + |
| + size &= FAKE_NODE_MIN_HASH_MASK; |
| + if (!size) { |
| + pr_err("Not enough memory for each node. " |
| + "NUMA emulation disabled.\n"); |
| + return -1; |
| + } |
| + |
| + /* |
| + * Continue to fill physical nodes with fake nodes until there is no |
| + * memory left on any of them. |
| + */ |
| + while (!nodes_empty(physnode_mask)) { |
| + for_each_node_mask(i, physnode_mask) { |
| + u64 dma32_end = numa_emu_dma_end(); |
| + u64 start, limit, end; |
| + int phys_blk; |
| + |
| + phys_blk = emu_find_memblk_by_nid(i, pi); |
| + if (phys_blk < 0) { |
| + node_clear(i, physnode_mask); |
| + continue; |
| + } |
| + start = pi->blk[phys_blk].start; |
| + limit = pi->blk[phys_blk].end; |
| + end = start + size; |
| + |
| + if (nid < big) |
| + end += FAKE_NODE_MIN_SIZE; |
| + |
| + /* |
| + * Continue to add memory to this fake node if its |
| + * non-reserved memory is less than the per-node size. |
| + */ |
| + while (end - start - mem_hole_size(start, end) < size) { |
| + end += FAKE_NODE_MIN_SIZE; |
| + if (end > limit) { |
| + end = limit; |
| + break; |
| + } |
| + } |
| + |
| + /* |
| + * If there won't be at least FAKE_NODE_MIN_SIZE of |
| + * non-reserved memory in ZONE_DMA32 for the next node, |
| + * this one must extend to the boundary. |
| + */ |
| + if (end < dma32_end && dma32_end - end - |
| + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
| + end = dma32_end; |
| + |
| + /* |
| + * If there won't be enough non-reserved memory for the |
| + * next node, this one must extend to the end of the |
| + * physical node. |
| + */ |
| + if (limit - end - mem_hole_size(end, limit) < size) |
| + end = limit; |
| + |
| + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, |
| + phys_blk, |
| + min(end, limit) - start); |
| + if (ret < 0) |
| + return ret; |
| + } |
| + } |
| + return 0; |
| +} |
| + |
| +/* |
| + * Returns the end address of a node so that there is at least `size' amount of |
| + * non-reserved memory or `max_addr' is reached. |
| + */ |
| +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) |
| +{ |
| + u64 end = start + size; |
| + |
| + while (end - start - mem_hole_size(start, end) < size) { |
| + end += FAKE_NODE_MIN_SIZE; |
| + if (end > max_addr) { |
| + end = max_addr; |
| + break; |
| + } |
| + } |
| + return end; |
| +} |
| + |
| +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) |
| +{ |
| + unsigned long max_pfn = PHYS_PFN(max_addr); |
| + unsigned long base_pfn = PHYS_PFN(base); |
| + unsigned long hole_pfns = PHYS_PFN(hole); |
| + |
| + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); |
| +} |
| + |
| +/* |
| + * Sets up fake nodes of `size' interleaved over physical nodes ranging from |
| + * `addr' to `max_addr'. |
| + * |
| + * Returns zero on success or negative on error. |
| + */ |
| +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, |
| + struct numa_meminfo *pi, |
| + u64 addr, u64 max_addr, u64 size, |
| + int nr_nodes, struct numa_memblk *pblk, |
| + int nid) |
| +{ |
| + nodemask_t physnode_mask = numa_nodes_parsed; |
| + int i, ret, uniform = 0; |
| + u64 min_size; |
| + |
| + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) |
| + return -1; |
| + |
| + /* |
| + * In the 'uniform' case split the passed in physical node by |
| + * nr_nodes, in the non-uniform case, ignore the passed in |
| + * physical block and try to create nodes of at least size |
| + * @size. |
| + * |
| + * In the uniform case, split the nodes strictly by physical |
| + * capacity, i.e. ignore holes. In the non-uniform case account |
| + * for holes and treat @size as a minimum floor. |
| + */ |
| + if (!nr_nodes) |
| + nr_nodes = MAX_NUMNODES; |
| + else { |
| + nodes_clear(physnode_mask); |
| + node_set(pblk->nid, physnode_mask); |
| + uniform = 1; |
| + } |
| + |
| + if (uniform) { |
| + min_size = uniform_size(max_addr, addr, 0, nr_nodes); |
| + size = min_size; |
| + } else { |
| + /* |
| + * The limit on emulated nodes is MAX_NUMNODES, so the |
| + * size per node is increased accordingly if the |
| + * requested size is too small. This creates a uniform |
| + * distribution of node sizes across the entire machine |
| + * (but not necessarily over physical nodes). |
| + */ |
| + min_size = uniform_size(max_addr, addr, |
| + mem_hole_size(addr, max_addr), nr_nodes); |
| + } |
| + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); |
| + if (size < min_size) { |
| + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", |
| + size >> 20, min_size >> 20); |
| + size = min_size; |
| + } |
| + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); |
| + |
| + /* |
| + * Fill physical nodes with fake nodes of size until there is no memory |
| + * left on any of them. |
| + */ |
| + while (!nodes_empty(physnode_mask)) { |
| + for_each_node_mask(i, physnode_mask) { |
| + u64 dma32_end = numa_emu_dma_end(); |
| + u64 start, limit, end; |
| + int phys_blk; |
| + |
| + phys_blk = emu_find_memblk_by_nid(i, pi); |
| + if (phys_blk < 0) { |
| + node_clear(i, physnode_mask); |
| + continue; |
| + } |
| + |
| + start = pi->blk[phys_blk].start; |
| + limit = pi->blk[phys_blk].end; |
| + |
| + if (uniform) |
| + end = start + size; |
| + else |
| + end = find_end_of_node(start, limit, size); |
| + /* |
| + * If there won't be at least FAKE_NODE_MIN_SIZE of |
| + * non-reserved memory in ZONE_DMA32 for the next node, |
| + * this one must extend to the boundary. |
| + */ |
| + if (end < dma32_end && dma32_end - end - |
| + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
| + end = dma32_end; |
| + |
| + /* |
| + * If there won't be enough non-reserved memory for the |
| + * next node, this one must extend to the end of the |
| + * physical node. |
| + */ |
| + if ((limit - end - mem_hole_size(end, limit) < size) |
| + && !uniform) |
| + end = limit; |
| + |
| + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, |
| + phys_blk, |
| + min(end, limit) - start); |
| + if (ret < 0) |
| + return ret; |
| + } |
| + } |
| + return nid; |
| +} |
| + |
| +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, |
| + struct numa_meminfo *pi, |
| + u64 addr, u64 max_addr, u64 size) |
| +{ |
| + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, |
| + 0, NULL, 0); |
| +} |
| + |
| +static int __init setup_emu2phys_nid(int *dfl_phys_nid) |
| +{ |
| + int i, max_emu_nid = 0; |
| + |
| + *dfl_phys_nid = NUMA_NO_NODE; |
| + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { |
| + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { |
| + max_emu_nid = i; |
| + if (*dfl_phys_nid == NUMA_NO_NODE) |
| + *dfl_phys_nid = emu_nid_to_phys[i]; |
| + } |
| + } |
| + |
| + return max_emu_nid; |
| +} |
| + |
| +/** |
| + * numa_emulation - Emulate NUMA nodes |
| + * @numa_meminfo: NUMA configuration to massage |
| + * @numa_dist_cnt: The size of the physical NUMA distance table |
| + * |
| + * Emulate NUMA nodes according to the numa=fake kernel parameter. |
| + * @numa_meminfo contains the physical memory configuration and is modified |
| + * to reflect the emulated configuration on success. @numa_dist_cnt is |
| + * used to determine the size of the physical distance table. |
| + * |
| + * On success, the following modifications are made. |
| + * |
| + * - @numa_meminfo is updated to reflect the emulated nodes. |
| + * |
| + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the |
| + * emulated nodes. |
| + * |
| + * - NUMA distance table is rebuilt to represent distances between emulated |
| + * nodes. The distances are determined considering how emulated nodes |
| + * are mapped to physical nodes and match the actual distances. |
| + * |
| + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical |
| + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). |
| + * |
| + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with |
| + * identity mapping and no other modification is made. |
| + */ |
| +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) |
| +{ |
| + static struct numa_meminfo ei __initdata; |
| + static struct numa_meminfo pi __initdata; |
| + const u64 max_addr = PFN_PHYS(max_pfn); |
| + u8 *phys_dist = NULL; |
| + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); |
| + int max_emu_nid, dfl_phys_nid; |
| + int i, j, ret; |
| + |
| + if (!emu_cmdline) |
| + goto no_emu; |
| + |
| + memset(&ei, 0, sizeof(ei)); |
| + pi = *numa_meminfo; |
| + |
| + for (i = 0; i < MAX_NUMNODES; i++) |
| + emu_nid_to_phys[i] = NUMA_NO_NODE; |
| + |
| + /* |
| + * If the numa=fake command-line contains a 'M' or 'G', it represents |
| + * the fixed node size. Otherwise, if it is just a single number N, |
| + * split the system RAM into N fake nodes. |
| + */ |
| + if (strchr(emu_cmdline, 'U')) { |
| + nodemask_t physnode_mask = numa_nodes_parsed; |
| + unsigned long n; |
| + int nid = 0; |
| + |
| + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
| + ret = -1; |
| + for_each_node_mask(i, physnode_mask) { |
| + /* |
| + * The reason we pass in blk[0] is due to |
| + * numa_remove_memblk_from() called by |
| + * emu_setup_memblk() will delete entry 0 |
| + * and then move everything else up in the pi.blk |
| + * array. Therefore we should always be looking |
| + * at blk[0]. |
| + */ |
| + ret = split_nodes_size_interleave_uniform(&ei, &pi, |
| + pi.blk[0].start, pi.blk[0].end, 0, |
| + n, &pi.blk[0], nid); |
| + if (ret < 0) |
| + break; |
| + if (ret < n) { |
| + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", |
| + __func__, i, ret, n); |
| + ret = -1; |
| + break; |
| + } |
| + nid = ret; |
| + } |
| + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { |
| + u64 size; |
| + |
| + size = memparse(emu_cmdline, &emu_cmdline); |
| + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); |
| + } else { |
| + unsigned long n; |
| + |
| + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
| + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); |
| + } |
| + if (*emu_cmdline == ':') |
| + emu_cmdline++; |
| + |
| + if (ret < 0) |
| + goto no_emu; |
| + |
| + if (numa_cleanup_meminfo(&ei) < 0) { |
| + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); |
| + goto no_emu; |
| + } |
| + |
| + /* copy the physical distance table */ |
| + if (numa_dist_cnt) { |
| + phys_dist = memblock_alloc(phys_size, PAGE_SIZE); |
| + if (!phys_dist) { |
| + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); |
| + goto no_emu; |
| + } |
| + |
| + for (i = 0; i < numa_dist_cnt; i++) |
| + for (j = 0; j < numa_dist_cnt; j++) |
| + phys_dist[i * numa_dist_cnt + j] = |
| + node_distance(i, j); |
| + } |
| + |
| + /* |
| + * Determine the max emulated nid and the default phys nid to use |
| + * for unmapped nodes. |
| + */ |
| + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); |
| + |
| + /* commit */ |
| + *numa_meminfo = ei; |
| + |
| + /* Make sure numa_nodes_parsed only contains emulated nodes */ |
| + nodes_clear(numa_nodes_parsed); |
| + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) |
| + if (ei.blk[i].start != ei.blk[i].end && |
| + ei.blk[i].nid != NUMA_NO_NODE) |
| + node_set(ei.blk[i].nid, numa_nodes_parsed); |
| + |
| + numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); |
| + |
| + /* make sure all emulated nodes are mapped to a physical node */ |
| + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) |
| + if (emu_nid_to_phys[i] == NUMA_NO_NODE) |
| + emu_nid_to_phys[i] = dfl_phys_nid; |
| + |
| + /* transform distance table */ |
| + numa_reset_distance(); |
| + for (i = 0; i < max_emu_nid + 1; i++) { |
| + for (j = 0; j < max_emu_nid + 1; j++) { |
| + int physi = emu_nid_to_phys[i]; |
| + int physj = emu_nid_to_phys[j]; |
| + int dist; |
| + |
| + if (get_option(&emu_cmdline, &dist) == 2) |
| + ; |
| + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) |
| + dist = physi == physj ? |
| + LOCAL_DISTANCE : REMOTE_DISTANCE; |
| + else |
| + dist = phys_dist[physi * numa_dist_cnt + physj]; |
| + |
| + numa_set_distance(i, j, dist); |
| + } |
| + } |
| + |
| + /* free the copied physical distance table */ |
| + memblock_free(phys_dist, phys_size); |
| + return; |
| + |
| +no_emu: |
| + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ |
| + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) |
| + emu_nid_to_phys[i] = i; |
| +} |
| + |
| +#ifndef CONFIG_DEBUG_PER_CPU_MAPS |
| +void numa_add_cpu(unsigned int cpu) |
| +{ |
| + int physnid, nid; |
| + |
| + nid = early_cpu_to_node(cpu); |
| + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); |
| + |
| + physnid = emu_nid_to_phys[nid]; |
| + |
| + /* |
| + * Map the cpu to each emulated node that is allocated on the physical |
| + * node of the cpu's apic id. |
| + */ |
| + for_each_online_node(nid) |
| + if (emu_nid_to_phys[nid] == physnid) |
| + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); |
| +} |
| + |
| +void numa_remove_cpu(unsigned int cpu) |
| +{ |
| + int i; |
| + |
| + for_each_online_node(i) |
| + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); |
| +} |
| +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ |
| +static void numa_set_cpumask(unsigned int cpu, bool enable) |
| +{ |
| + int nid, physnid; |
| + |
| + nid = early_cpu_to_node(cpu); |
| + if (nid == NUMA_NO_NODE) { |
| + /* early_cpu_to_node() already emits a warning and trace */ |
| + return; |
| + } |
| + |
| + physnid = emu_nid_to_phys[nid]; |
| + |
| + for_each_online_node(nid) { |
| + if (emu_nid_to_phys[nid] != physnid) |
| + continue; |
| + |
| + debug_cpumask_set_cpu(cpu, nid, enable); |
| + } |
| +} |
| + |
| +void numa_add_cpu(unsigned int cpu) |
| +{ |
| + numa_set_cpumask(cpu, true); |
| +} |
| + |
| +void numa_remove_cpu(unsigned int cpu) |
| +{ |
| + numa_set_cpumask(cpu, false); |
| +} |
| +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ |
| _ |