blob: 110b03b0c450bf523e66def946643be35344761b [file] [log] [blame]
From: Chen Yu <yu.c.chen@intel.com>
Subject: Revert "sched/numa: add statistics of numa balance task"
Date: Fri, 4 Jul 2025 21:56:20 +0800
This reverts commit ad6b26b6a0a79166b53209df2ca1cf8636296382.
This commit introduces per-memcg/task NUMA balance statistics, but
unfortunately it introduced a NULL pointer exception due to the following
race condition: After a swap task candidate was chosen, its mm_struct
pointer was set to NULL due to task exit. Later, when performing the
actual task swapping, the p->mm caused the problem.
CPU0 CPU1
:
...
task_numa_migrate
task_numa_find_cpu
task_numa_compare
# a normal task p is chosen
env->best_task = p
# p exit:
exit_signals(p);
p->flags |= PF_EXITING
exit_mm
p->mm = NULL;
migrate_swap_stop
__migrate_swap_task((arg->src_task, arg->dst_cpu)
count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)# p->mm is NULL
task_lock() should be held and the PF_EXITING flag needs to be checked to
prevent this from happening. After discussion, the conclusion was that
adding a lock is not worthwhile for some statistics calculations. Revert
the change and rely on the tracepoint for this purpose.
Link: https://lkml.kernel.org/r/20250704135620.685752-1-yu.c.chen@intel.com
Link: https://lkml.kernel.org/r/20250708064917.BBD13C4CEED@smtp.kernel.org
Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task")
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Reported-by: Jirka Hladky <jhladky@redhat.com>
Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/
Reported-by: Srikanth Aithal <Srikanth.Aithal@amd.com>
Reported-by: Suneeth D <Suneeth.D@amd.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Hladky <jhladky@redhat.com>
Cc: Libo Chen <libo.chen@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
Documentation/admin-guide/cgroup-v2.rst | 6 ------
include/linux/sched.h | 4 ----
include/linux/vm_event_item.h | 2 --
kernel/sched/core.c | 9 ++-------
kernel/sched/debug.c | 4 ----
mm/memcontrol.c | 2 --
mm/vmstat.c | 2 --
7 files changed, 2 insertions(+), 27 deletions(-)
--- a/Documentation/admin-guide/cgroup-v2.rst~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/Documentation/admin-guide/cgroup-v2.rst
@@ -1732,12 +1732,6 @@ The following nested keys are defined.
numa_hint_faults (npn)
Number of NUMA hinting faults.
- numa_task_migrated (npn)
- Number of task migration by NUMA balancing.
-
- numa_task_swapped (npn)
- Number of task swap by NUMA balancing.
-
pgdemote_kswapd
Number of pages demoted by kswapd.
--- a/include/linux/sched.h~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/include/linux/sched.h
@@ -548,10 +548,6 @@ struct sched_statistics {
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
-#ifdef CONFIG_NUMA_BALANCING
- u64 numa_task_migrated;
- u64 numa_task_swapped;
-#endif
u64 nr_wakeups;
u64 nr_wakeups_sync;
--- a/include/linux/vm_event_item.h~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/include/linux/vm_event_item.h
@@ -66,8 +66,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
- NUMA_TASK_MIGRATE,
- NUMA_TASK_SWAP,
#endif
#ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
--- a/kernel/sched/core.c~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/kernel/sched/core.c
@@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p,
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- __schedstat_inc(p->stats.numa_task_swapped);
- count_vm_numa_event(NUMA_TASK_SWAP);
- count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
-
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -7939,9 +7935,8 @@ int migrate_task_to(struct task_struct *
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- __schedstat_inc(p->stats.numa_task_migrated);
- count_vm_numa_event(NUMA_TASK_MIGRATE);
- count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
+ /* TODO: This is not properly updating schedstats */
+
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
--- a/kernel/sched/debug.c~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/kernel/sched/debug.c
@@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_st
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
-#ifdef CONFIG_NUMA_BALANCING
- P_SCHEDSTAT(numa_task_migrated);
- P_SCHEDSTAT(numa_task_swapped);
-#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);
--- a/mm/memcontrol.c~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/mm/memcontrol.c
@@ -474,8 +474,6 @@ static const unsigned int memcg_vm_event
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
- NUMA_TASK_MIGRATE,
- NUMA_TASK_SWAP,
#endif
};
--- a/mm/vmstat.c~revert-sched-numa-add-statistics-of-numa-balance-task
+++ a/mm/vmstat.c
@@ -1346,8 +1346,6 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
- "numa_task_migrated",
- "numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
_