From: Chen Yu <yu.c.chen@intel.com>
Subject: sched/numa: add statistics of numa balance task migration and swap
Date: Tue, 8 Apr 2025 18:14:44 +0800
On systems with NUMA balancing enabled, tracking the task activity caused by
NUMA balancing has proven helpful. NUMA balancing has two mechanisms for task
migration: one migrates the task to an idle CPU in its preferred node; the
other swaps two tasks on different nodes when each is running on the other's
preferred node.
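
To make the two paths concrete, here is a minimal standalone C sketch of
the decision (illustration only, not the kernel's actual implementation;
the real logic lives in task_numa_migrate() and its helpers, and every
type and helper function below is hypothetical):

#include <stddef.h>

/* Hypothetical, simplified task model -- illustration only. */
struct task {
        int cpu;            /* CPU the task currently runs on */
        int node;           /* NUMA node of that CPU */
        int preferred_node; /* node preferred by NUMA balancing */
};

/* Stub helpers standing in for much richer kernel logic. */
static int find_idle_cpu_on_node(int node) { (void)node; return -1; }
static struct task *find_swap_candidate(struct task *p) { (void)p; return NULL; }

enum numa_action { NUMA_NONE, NUMA_MIGRATE, NUMA_SWAP };

static enum numa_action numa_balance_task(struct task *p)
{
        /* Mechanism 1: migrate to an idle CPU on the preferred node. */
        int cpu = find_idle_cpu_on_node(p->preferred_node);

        if (cpu >= 0) {
                p->cpu = cpu;
                p->node = p->preferred_node;
                return NUMA_MIGRATE;    /* counted as numa_task_migrated */
        }

        /* Mechanism 2: swap with a task on the preferred node that in
         * turn prefers this task's current node. */
        struct task *q = find_swap_candidate(p);

        if (q && q->node == p->preferred_node && q->preferred_node == p->node) {
                struct task tmp = *p;

                p->cpu = q->cpu;
                p->node = q->node;
                q->cpu = tmp.cpu;
                q->node = tmp.node;
                return NUMA_SWAP;       /* counted as numa_task_swapped */
        }

        return NUMA_NONE;
}

int main(void)
{
        struct task t = { .cpu = 0, .node = 0, .preferred_node = 1 };

        return numa_balance_task(&t) == NUMA_NONE ? 0 : 1;
}
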
The kernel already has NUMA page migration statistics in
/sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched, but lacks
statistics for task migration and swap. Add the task migration and swap
counts accordingly.
The following two new fields:
numa_task_migrated
numa_task_swapped
will be displayed in both
/sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched
Introducing both per-task and per-memcg NUMA balancing statistics helps to
quickly evaluate the performance and resource usage of the target
workload. For example, the user can first identify the container with high
NUMA balancing activity, then narrow down to a specific task within that
group, and tune that task's memory policy. In summary, it is possible to
iterate over /proc/{PID}/sched to find the offending task, but the
per-memcg aggregated NUMA balancing activity can further help users
identify the task in a divide-and-conquer way, as sketched below.
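
A rough userspace sketch of that workflow follows (the cgroup name
"mytest" and PID 1234 are placeholders; the per-task fields appear in
/proc/{PID}/sched only when schedstats are enabled):

#include <stdio.h>
#include <string.h>

/*
 * Read one counter from a stats file.  memory.stat lines look like
 * "numa_task_migrated 12", while /proc/<pid>/sched lines look like
 * "numa_task_migrated : 12", so try both layouts.
 */
static unsigned long long read_counter(const char *path, const char *key)
{
        char line[256];
        char name[64];
        unsigned long long v, val = 0;
        FILE *f = fopen(path, "r");

        if (!f)
                return 0;
        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, "%63s : %llu", name, &v) != 2 &&
                    sscanf(line, "%63s %llu", name, &v) != 2)
                        continue;
                if (!strcmp(name, key)) {
                        val = v;
                        break;
                }
        }
        fclose(f);
        return val;
}

int main(void)
{
        /* Step 1: per-memcg view; "mytest" is an example cgroup. */
        const char *stat = "/sys/fs/cgroup/mytest/memory.stat";

        printf("memcg migrated: %llu swapped: %llu\n",
               read_counter(stat, "numa_task_migrated"),
               read_counter(stat, "numa_task_swapped"));

        /* Step 2: per-task view for a suspect PID (placeholder 1234). */
        printf("task  migrated: %llu swapped: %llu\n",
               read_counter("/proc/1234/sched", "numa_task_migrated"),
               read_counter("/proc/1234/sched", "numa_task_swapped"));
        return 0;
}
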
[yu.c.chen@intel.com: v3]
Link: https://lkml.kernel.org/r/20250430103623.3349842-1-yu.c.chen@intel.com
Link: https://lkml.kernel.org/r/20250408101444.192519-1-yu.c.chen@intel.com
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: "Chen, Tim C" <tim.c.chen@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Libo Chen <libo.chen@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
Documentation/admin-guide/cgroup-v2.rst | 6 ++++++
include/linux/sched.h | 4 ++++
include/linux/vm_event_item.h | 2 ++
kernel/sched/core.c | 7 +++++--
kernel/sched/debug.c | 4 ++++
mm/memcontrol.c | 2 ++
mm/vmstat.c | 2 ++
7 files changed, 25 insertions(+), 2 deletions(-)
--- a/Documentation/admin-guide/cgroup-v2.rst~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/Documentation/admin-guide/cgroup-v2.rst
@@ -1670,6 +1670,12 @@ The following nested keys are defined.
numa_hint_faults (npn)
Number of NUMA hinting faults.
+ numa_task_migrated (npn)
+ Number of task migrations performed by NUMA balancing.
+
+ numa_task_swapped (npn)
+ Number of task swaps performed by NUMA balancing.
+
pgdemote_kswapd
Number of pages demoted by kswapd.
--- a/include/linux/sched.h~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/include/linux/sched.h
@@ -549,6 +549,10 @@ struct sched_statistics {
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
+#ifdef CONFIG_NUMA_BALANCING
+ u64 numa_task_migrated;
+ u64 numa_task_swapped;
+#endif
u64 nr_wakeups;
u64 nr_wakeups_sync;
--- a/include/linux/vm_event_item.h~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/include/linux/vm_event_item.h
@@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
#ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
--- a/kernel/sched/core.c~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/kernel/sched/core.c
@@ -3352,6 +3352,9 @@ void set_task_cpu(struct task_struct *p,
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
+ __schedstat_inc(p->stats.numa_task_swapped);
+ count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);
+
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -7953,8 +7956,8 @@ int migrate_task_to(struct task_struct *
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- /* TODO: This is not properly updating schedstats */
-
+ __schedstat_inc(p->stats.numa_task_migrated);
+ count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1);
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
--- a/kernel/sched/debug.c~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/kernel/sched/debug.c
@@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_st
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
+#ifdef CONFIG_NUMA_BALANCING
+ P_SCHEDSTAT(numa_task_migrated);
+ P_SCHEDSTAT(numa_task_swapped);
+#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);
--- a/mm/memcontrol.c~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/mm/memcontrol.c
@@ -470,6 +470,8 @@ static const unsigned int memcg_vm_event
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
};
--- a/mm/vmstat.c~sched-numa-add-statistics-of-numa-balance-task-migration-and-swap
+++ a/mm/vmstat.c
@@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
+ "numa_task_migrated",
+ "numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
_