From: Huang Ying <ying.huang@intel.com>
Subject: mm/migrate: move node demotion code to near its user

Currently, node_demotion[] and next_demotion_node() are placed between
__unmap_and_move() and unmap_and_move(), which hurts code readability.
Move them near their users in the file.  There is no functional change
in this patch.

Link: https://lkml.kernel.org/r/20211206031227.3323097-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Wei Xu <weixugc@google.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/migrate.c | 265 ++++++++++++++++++++++++-------------------------
 1 file changed, 132 insertions(+), 133 deletions(-)
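
For reviewers, a quick sketch of how the moved interface is consumed:
a caller asks for the next hop on a node's demotion path and treats
NUMA_NO_NODE as "this node is terminal".  The wrapper below is
illustrative only and is not code from this patch:

	/* Illustrative example -- not part of this patch. */
	static bool node_has_demotion_target(int nid)
	{
		/* Next hop on @nid's demotion path, if any. */
		int target = next_demotion_node(nid);

		/* NUMA_NO_NODE means @nid is terminal. */
		return target != NUMA_NO_NODE;
	}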
--- a/mm/migrate.c~mm-migrate-move-node-demotion-code-to-near-its-user
+++ a/mm/migrate.c
@@ -1093,139 +1093,6 @@ out:
 	return rc;
 }
 
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets.  Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node.  The
- * CPUs are placed in the node with the "fast" memory.  The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- *	Socket A: 0, 1, 2
- *	Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1.  When Node 1 fills up, it should be migrated to
- * Node 2.  The migration path starts on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progresses through medium and ends with the
- * slow memory:
- *
- *	0 -> 1 -> 2 -> stop
- *	3 -> 4 -> 5 -> stop
- *
- * This is represented in node_demotion[] like this:
- *
- *	{ nr=1, nodes[0]=1 }, // Node 0 migrates to 1
- *	{ nr=1, nodes[0]=2 }, // Node 1 migrates to 2
- *	{ nr=0, nodes[0]=-1 }, // Node 2 does not migrate
- *	{ nr=1, nodes[0]=4 }, // Node 3 migrates to 4
- *	{ nr=1, nodes[0]=5 }, // Node 4 migrates to 5
- *	{ nr=0, nodes[0]=-1 }, // Node 5 does not migrate
- *
- * Moreover, some systems may have multiple slow memory nodes.
- * Suppose a system has one socket with 3 memory nodes: node 0
- * is the fast memory type, nodes 1/2 are both the slow memory
- * type, and the distance between the fast memory node and each
- * slow memory node is the same.  So the migration path should be:
- *
- *	0 -> 1/2 -> stop
- *
- * This is represented in node_demotion[] like this:
- *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
- *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
- *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
- */
-
-/*
- * Writes to this array occur without locking.  Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-#define DEFAULT_DEMOTION_TARGET_NODES 15
-
-#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
-#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
-#else
-#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
-#endif
-
-struct demotion_nodes {
-	unsigned short nr;
-	short nodes[DEMOTION_TARGET_NODES];
-};
-
-static struct demotion_nodes *node_demotion __read_mostly;
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node from which to look up the next node
- *
- * Return: node id of the next memory node in the demotion path
- * hierarchy from @node; NUMA_NO_NODE if @node is terminal.  This
- * does not keep @node online or guarantee that it *continues* to
- * be the next demotion target.
- */
-int next_demotion_node(int node)
-{
-	struct demotion_nodes *nd;
-	unsigned short target_nr, index;
-	int target;
-
-	if (!node_demotion)
-		return NUMA_NO_NODE;
-
-	nd = &node_demotion[node];
-
-	/*
-	 * node_demotion[] is updated without excluding this
-	 * function from running.  RCU doesn't provide any
-	 * compiler barriers, so the READ_ONCE() is required
-	 * to avoid compiler reordering or read merging.
-	 *
-	 * Make sure to use RCU over entire code blocks if
-	 * node_demotion[] reads need to be consistent.
-	 */
-	rcu_read_lock();
-	target_nr = READ_ONCE(nd->nr);
-
-	switch (target_nr) {
-	case 0:
-		target = NUMA_NO_NODE;
-		goto out;
-	case 1:
-		index = 0;
-		break;
-	default:
-		/*
-		 * If there are multiple target nodes, select one
-		 * target node randomly.
-		 *
-		 * Round-robin selection is also possible, but it
-		 * would require another field in node_demotion[] to
-		 * record the last selected target node, and updating
-		 * that field may cause cache ping-pong.  Per-CPU data
-		 * could avoid the caching issue but seems more
-		 * complicated, so selecting the target node randomly
-		 * seems better for now.
-		 */
-		index = get_random_int() % target_nr;
-		break;
-	}
-
-	target = READ_ONCE(nd->nodes[index]);
-
-out:
-	rcu_read_unlock();
-	return target;
-}
-
 /*
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
@@ -3059,6 +2926,138 @@ void migrate_vma_finalize(struct migrate
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *	Socket A: 0, 1, 2
+ *	Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ *	0 -> 1 -> 2 -> stop
+ *	3 -> 4 -> 5 -> stop
+ *
+ * This is represented in node_demotion[] like this:
+ *
+ *	{ nr=1, nodes[0]=1 }, // Node 0 migrates to 1
+ *	{ nr=1, nodes[0]=2 }, // Node 1 migrates to 2
+ *	{ nr=0, nodes[0]=-1 }, // Node 2 does not migrate
+ *	{ nr=1, nodes[0]=4 }, // Node 3 migrates to 4
+ *	{ nr=1, nodes[0]=5 }, // Node 4 migrates to 5
+ *	{ nr=0, nodes[0]=-1 }, // Node 5 does not migrate
+ *
+ * Moreover, some systems may have multiple slow memory nodes.
+ * Suppose a system has one socket with 3 memory nodes: node 0
+ * is the fast memory type, nodes 1/2 are both the slow memory
+ * type, and the distance between the fast memory node and each
+ * slow memory node is the same.  So the migration path should be:
+ *
+ *	0 -> 1/2 -> stop
+ *
+ * This is represented in node_demotion[] like this:
+ *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
+ *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
+ *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+#define DEFAULT_DEMOTION_TARGET_NODES 15
+
+#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
+#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
+#else
+#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
+#endif
+
+struct demotion_nodes {
+	unsigned short nr;
+	short nodes[DEMOTION_TARGET_NODES];
+};
+
+static struct demotion_nodes *node_demotion __read_mostly;
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node from which to look up the next node
+ *
+ * Return: node id of the next memory node in the demotion path
+ * hierarchy from @node; NUMA_NO_NODE if @node is terminal.  This
+ * does not keep @node online or guarantee that it *continues* to
+ * be the next demotion target.
+ */
+int next_demotion_node(int node)
+{
+	struct demotion_nodes *nd;
+	unsigned short target_nr, index;
+	int target;
+
+	if (!node_demotion)
+		return NUMA_NO_NODE;
+
+	nd = &node_demotion[node];
+
+	/*
+	 * node_demotion[] is updated without excluding this
+	 * function from running.  RCU doesn't provide any
+	 * compiler barriers, so the READ_ONCE() is required
+	 * to avoid compiler reordering or read merging.
+	 *
+	 * Make sure to use RCU over entire code blocks if
+	 * node_demotion[] reads need to be consistent.
+	 */
+	rcu_read_lock();
+	target_nr = READ_ONCE(nd->nr);
+
+	switch (target_nr) {
+	case 0:
+		target = NUMA_NO_NODE;
+		goto out;
+	case 1:
+		index = 0;
+		break;
+	default:
+		/*
+		 * If there are multiple target nodes, select one
+		 * target node randomly.
+		 *
+		 * Round-robin selection is also possible, but it
+		 * would require another field in node_demotion[] to
+		 * record the last selected target node, and updating
+		 * that field may cause cache ping-pong.  Per-CPU data
+		 * could avoid the caching issue but seems more
+		 * complicated, so selecting the target node randomly
+		 * seems better for now.
+		 */
+		index = get_random_int() % target_nr;
+		break;
+	}
+
+	target = READ_ONCE(nd->nodes[index]);
+
+out:
+	rcu_read_unlock();
+	return target;
+}
+
 #if defined(CONFIG_HOTPLUG_CPU)
 /* Disable reclaim-based migration. */
 static void __disable_all_migrate_targets(void)
_