From 1ca54bbe52de08550e773961d2b0cbbd012870d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:30:05 -0500
Subject: [PATCH] sched: mmdrop needs to be delayed on -rt
commit 5b6e135f5e1e9e5586ad69e35c96494a4b413a00 in tip.
[PG: the upstream per_cpu__ prefix removal (commit dd17c8f729) caused an
implicit (and hard to spot) shadowing of the per-CPU desched_task by a
local variable of the same name in __mmdrop_delayed, so add a cpu_ prefix]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
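
For readers unfamiliar with the shadowing problem the PG note describes:
once the per_cpu__ name mangling is gone, a per-CPU variable and a local
of the same name silently collide and the compiler quietly picks the inner
one. A minimal user-space sketch of the same hazard (plain C with
illustrative names, not kernel code):

        #include <stdio.h>

        /* Stand-in for the per-CPU variable after the per_cpu__ prefix
         * removal: the file-scope object is now literally named
         * desched_task. */
        static int desched_task = 42;

        static void wake_helper(void)
        {
                /* A local of the same name shadows the file-scope
                 * variable; every use below refers to the local, not the
                 * global -- the same class of bug the cpu_ prefix in
                 * __mmdrop_delayed() avoids. */
                int desched_task = 0;

                printf("helper sees: %d\n", desched_task); /* 0, not 42 */
        }

        int main(void)
        {
                wake_helper();
                printf("main sees:   %d\n", desched_task); /* 42 */
                return 0;
        }

Renaming the local (here, as in the patch, by adding a prefix) removes the
ambiguity without changing behaviour.
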
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6..a977b30 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -270,6 +270,9 @@ struct mm_struct {
/* Architecture-specific MM context */
mm_context_t context;
+ /* realtime bits */
+ struct list_head delayed_drop;
+
/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e645994..6035a1b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2177,12 +2177,20 @@ extern struct mm_struct * mm_alloc(void);
/* mmdrop drops the mm and the page tables */
extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
static inline void mmdrop(struct mm_struct * mm)
{
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+ if (atomic_dec_and_test(&mm->mm_count))
+ __mmdrop_delayed(mm);
+}
+
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
/* Grab a reference to a task's mm, if it is not already going away */
diff --git a/kernel/fork.c b/kernel/fork.c
index 0a4f17f..12f9c64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -38,6 +38,7 @@
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/tracehook.h>
+#include <linux/interrupt.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/task_io_accounting_ops.h>
@@ -65,6 +66,8 @@
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -99,6 +102,14 @@ int lockdep_tasklist_lock_is_held(void)
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * dont want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
int nr_processes(void)
{
int cpu;
@@ -192,6 +203,8 @@ void __put_task_struct(struct task_struct *tsk)
void __init fork_init(unsigned long mempages)
{
+ int i;
+
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
@@ -222,6 +235,9 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < NR_CPUS; i++)
+ INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
}
int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -307,6 +323,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
+ INIT_LIST_HEAD(&mm->delayed_drop);
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
@@ -1278,7 +1295,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ preempt_disable();
__get_cpu_var(process_counts)++;
+ preempt_enable();
}
attach_pid(p, PIDTYPE_PID, pid);
nr_threads++;
@@ -1752,3 +1771,138 @@ int unshare_files(struct files_struct **displaced)
task_unlock(task);
return 0;
}
+
+static int mmdrop_complete(void)
+{
+ struct list_head *head;
+ int ret = 0;
+
+ head = &get_cpu_var(delayed_drop_list);
+ while (!list_empty(head)) {
+ struct mm_struct *mm = list_entry(head->next,
+ struct mm_struct, delayed_drop);
+ list_del(&mm->delayed_drop);
+ put_cpu_var(delayed_drop_list);
+
+ __mmdrop(mm);
+ ret = 1;
+
+ head = &get_cpu_var(delayed_drop_list);
+ }
+ put_cpu_var(delayed_drop_list);
+
+ return ret;
+}
+
+/*
+ * We dont want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void __mmdrop_delayed(struct mm_struct *mm)
+{
+ struct task_struct *cpu_desched_task;
+ struct list_head *head;
+
+ head = &get_cpu_var(delayed_drop_list);
+ list_add_tail(&mm->delayed_drop, head);
+ cpu_desched_task = __get_cpu_var(desched_task);
+ if (cpu_desched_task)
+ wake_up_process(cpu_desched_task);
+ put_cpu_var(delayed_drop_list);
+}
+
+static void takeover_delayed_drop(int hotcpu)
+{
+ struct list_head *head = &per_cpu(delayed_drop_list, hotcpu);
+
+ while (!list_empty(head)) {
+ struct mm_struct *mm = list_entry(head->next,
+ struct mm_struct, delayed_drop);
+
+ list_del(&mm->delayed_drop);
+ __mmdrop_delayed(mm);
+ }
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+ set_user_nice(current, -10);
+ current->flags |= PF_NOFREEZE;
+ current->extra_flags |= PFE_SOFTIRQ;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (mmdrop_complete())
+ continue;
+ schedule();
+
+ /*
+ * This must be called from time to time on ia64, and is a
+ * no-op on other archs. Used to be in cpu_idle(), but with
+ * the new -rt semantics it can't stay there.
+ */
+ check_pgt_cache();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+ struct task_struct *p;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+
+ BUG_ON(per_cpu(desched_task, hotcpu));
+ INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+ p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+ if (IS_ERR(p)) {
+ printk("desched_thread for %i failed\n", hotcpu);
+ return NOTIFY_BAD;
+ }
+ per_cpu(desched_task, hotcpu) = p;
+ kthread_bind(p, hotcpu);
+ break;
+ case CPU_ONLINE:
+
+ wake_up_process(per_cpu(desched_task, hotcpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+
+ /* Unbind so it can run. Fall thru. */
+ kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+ case CPU_DEAD:
+
+ p = per_cpu(desched_task, hotcpu);
+ per_cpu(desched_task, hotcpu) = NULL;
+ kthread_stop(p);
+ takeover_delayed_drop(hotcpu);
+ takeover_tasklets(hotcpu);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+
+ cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
+ return 0;
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index 987e4c3..7cb260b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2906,8 +2906,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
finish_lock_switch(rq, prev);
fire_sched_in_preempt_notifiers(current);
+ /*
+ * Delay the final freeing of the mm or task, so that we dont have
+ * to do complex work from within the scheduler:
+ */
if (mm)
- mmdrop(mm);
+ mmdrop_delayed(mm);
if (unlikely(prev_state == TASK_DEAD)) {
/*
* Remove function-return probe instances associated with this
@@ -5903,7 +5907,11 @@ void idle_task_exit(void)
if (mm != &init_mm)
switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+ mmdrop_delayed(mm);
+#else
mmdrop(mm);
+#endif
}
/* called under rq->lock with disabled interrupts */
--
1.7.0.4
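
As a side note, the deferral pattern the patch adds to kernel/fork.c --
drop the last reference on a hot path, but push the actual teardown onto a
list for a worker to process later -- can be sketched in plain C. This is
a single-threaded illustration with made-up names (obj_put_delayed,
drain_deferred) and no locking; it is not the kernel API:

        #include <stdatomic.h>
        #include <stdlib.h>

        struct obj {
                atomic_int refcount;
                struct obj *next_deferred;  /* stand-in for mm->delayed_drop */
        };

        /* Stand-in for the per-CPU delayed_drop_list. */
        static struct obj *deferred_head;

        /* Immediate free on the last put -- what plain mmdrop() does. */
        static void obj_put(struct obj *o)
        {
                if (atomic_fetch_sub(&o->refcount, 1) == 1)
                        free(o);
        }

        /* Deferred free on the last put -- the mmdrop_delayed() idea:
         * only queue the object; the expensive teardown happens later,
         * in a different context. */
        static void obj_put_delayed(struct obj *o)
        {
                if (atomic_fetch_sub(&o->refcount, 1) == 1) {
                        o->next_deferred = deferred_head;
                        deferred_head = o;
                }
        }

        /* Worker side, analogous to mmdrop_complete(): drain the list
         * and do the real freeing. */
        static void drain_deferred(void)
        {
                while (deferred_head) {
                        struct obj *o = deferred_head;

                        deferred_head = o->next_deferred;
                        free(o);
                }
        }

        int main(void)
        {
                struct obj *a = calloc(1, sizeof(*a));
                struct obj *b = calloc(1, sizeof(*b));

                if (!a || !b)
                        return 1;

                atomic_init(&a->refcount, 1);
                atomic_init(&b->refcount, 1);

                obj_put(a);          /* immediate free, like mmdrop()    */
                obj_put_delayed(b);  /* last reference gone, free queued */
                drain_deferred();    /* the deferred free happens here   */
                return 0;
        }

In the patch itself the list is per-CPU, the "drain" runs in the
desched/%d kthread, and wake_up_process() takes the place of the direct
call to drain_deferred().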