| From foo@baz Sun May 27 17:33:38 CEST 2018 |
| From: Michael Bringmann <mwb@linux.vnet.ibm.com> |
| Date: Tue, 28 Nov 2017 16:58:40 -0600 |
| Subject: powerpc/numa: Ensure nodes initialized for hotplug |
| |
| From: Michael Bringmann <mwb@linux.vnet.ibm.com> |
| |
| [ Upstream commit ea05ba7c559c8e5a5946c3a94a2a266e9a6680a6 ] |
| |
| This patch fixes some problems encountered at runtime with |
| configurations that support memory-less nodes, or that hot-add CPUs |
| into nodes that are memoryless during system execution after boot. The |
| problems of interest include: |
| |
| * Nodes known to powerpc to be memoryless at boot, but which have CPUs |
| in them, are allowed to be 'possible' and 'online'. Memory allocations |
| for those nodes are taken from another node that does have memory |
| unless and until memory is hot-added to the node. |
| |
| * Nodes which have no resources assigned at boot, but which may still |
| be referenced subsequently by affinity or associativity attributes, |
| are kept in the list of 'possible' nodes for powerpc. Hot-add of |
| memory or CPUs to the system can reference these nodes and bring |
| them online instead of redirecting the references to one of the set |
| of nodes known to have memory at boot. |
| |
| Note that this software operates under the context of CPU hotplug. We |
| are not doing memory hotplug in this code, but rather updating the |
| kernel's CPU topology (i.e. arch_update_cpu_topology / |
| numa_update_cpu_topology). We are initializing a node that may be used |
| by CPUs or memory before it can be referenced as invalid by a CPU |
| hotplug operation. CPU hotplug operations are protected by a range of |
| APIs including cpu_maps_update_begin/cpu_maps_update_done, |
| cpus_read/write_lock / cpus_read/write_unlock, device locks, and more. |
| Memory hotplug operations, including try_online_node, are protected by |
| mem_hotplug_begin/mem_hotplug_done, device locks, and more. In the |
| case of CPUs being hot-added to a previously memoryless node, the |
| try_online_node operation occurs wholly within the CPU locks with no |
| overlap. Using HMC hot-add/hot-remove operations, we have been able to |
| add and remove CPUs to any possible node without failures. HMC |
| operations involve a degree of self-serialization, though. |
| |
| Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com> |
| Reviewed-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> |
| Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> |
| Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/powerpc/mm/numa.c | 47 +++++++++++++++++++++++++++++++++++++---------- |
| 1 file changed, 37 insertions(+), 10 deletions(-) |
| |
| --- a/arch/powerpc/mm/numa.c |
| +++ b/arch/powerpc/mm/numa.c |
| @@ -551,7 +551,7 @@ static int numa_setup_cpu(unsigned long |
| nid = of_node_to_nid_single(cpu); |
| |
| out_present: |
| - if (nid < 0 || !node_online(nid)) |
| + if (nid < 0 || !node_possible(nid)) |
| nid = first_online_node; |
| |
| map_cpu_to_node(lcpu, nid); |
| @@ -922,10 +922,8 @@ static void __init find_possible_nodes(v |
| goto out; |
| |
| for (i = 0; i < numnodes; i++) { |
| - if (!node_possible(i)) { |
| - setup_node_data(i, 0, 0); |
| + if (!node_possible(i)) |
| node_set(i, node_possible_map); |
| - } |
| } |
| |
| out: |
| @@ -1305,6 +1303,40 @@ static long vphn_get_associativity(unsig |
| return rc; |
| } |
| |
| +static inline int find_and_online_cpu_nid(int cpu) |
| +{ |
| + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; |
| + int new_nid; |
| + |
| + /* Use associativity from first thread for all siblings */ |
| + vphn_get_associativity(cpu, associativity); |
| + new_nid = associativity_to_nid(associativity); |
| + if (new_nid < 0 || !node_possible(new_nid)) |
| + new_nid = first_online_node; |
| + |
| + if (NODE_DATA(new_nid) == NULL) { |
| +#ifdef CONFIG_MEMORY_HOTPLUG |
| + /* |
| + * Need to ensure that NODE_DATA is initialized for a node from |
| + * available memory (see memblock_alloc_try_nid). If unable to |
| + * init the node, then default to nearest node that has memory |
| + * installed. |
| + */ |
| + if (try_online_node(new_nid)) |
| + new_nid = first_online_node; |
| +#else |
| + /* |
| + * Default to using the nearest node that has memory installed. |
| + * Otherwise, it would be necessary to patch the kernel MM code |
| + * to deal with more memoryless-node error conditions. |
| + */ |
| + new_nid = first_online_node; |
| +#endif |
| + } |
| + |
| + return new_nid; |
| +} |
| + |
| /* |
| * Update the CPU maps and sysfs entries for a single CPU when its NUMA |
| * characteristics change. This function doesn't perform any locking and is |
| @@ -1370,7 +1402,6 @@ int arch_update_cpu_topology(void) |
| { |
| unsigned int cpu, sibling, changed = 0; |
| struct topology_update_data *updates, *ud; |
| - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; |
| cpumask_t updated_cpus; |
| struct device *dev; |
| int weight, new_nid, i = 0; |
| @@ -1405,11 +1436,7 @@ int arch_update_cpu_topology(void) |
| continue; |
| } |
| |
| - /* Use associativity from first thread for all siblings */ |
| - vphn_get_associativity(cpu, associativity); |
| - new_nid = associativity_to_nid(associativity); |
| - if (new_nid < 0 || !node_online(new_nid)) |
| - new_nid = first_online_node; |
| + new_nid = find_and_online_cpu_nid(cpu); |
| |
| if (new_nid == numa_cpu_lookup_table[cpu]) { |
| cpumask_andnot(&cpu_associativity_changes_mask, |