From 1ca54bbe52de08550e773961d2b0cbbd012870d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:30:05 -0500
Subject: [PATCH] sched: mmdrop needs to be delayed on -rt

commit 5b6e135f5e1e9e5586ad69e35c96494a4b413a00 in tip.

[PG: upstream per_cpu__ prefix removal (dd17c8f729) caused an implicit
(and hard to spot) shadowing of the percpu desched_task with a
local var of the same name in __mmdrop_delayed, so add a cpu_ prefix]
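
To make the hazard in the note above concrete, here is a minimal
sketch (illustrative only, not part of the patch; shadow_example is a
made-up name):

	static DEFINE_PER_CPU(struct task_struct *, desched_task);

	static void shadow_example(void)
	{
		/*
		 * Before dd17c8f729 the per-CPU symbol was really named
		 * per_cpu__desched_task, so a local called "desched_task"
		 * could not collide with it.  After the prefix removal the
		 * accessor macros reference the plain identifier, so:
		 *
		 *	struct task_struct *desched_task;
		 *	desched_task = __get_cpu_var(desched_task);
		 *
		 * would silently read the (uninitialized) local instead
		 * of the per-CPU variable.  Renaming the local avoids
		 * the collision:
		 */
		struct task_struct *cpu_desched_task;

		cpu_desched_task = __get_cpu_var(desched_task);
		if (cpu_desched_task)
			wake_up_process(cpu_desched_task);
	}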

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
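
One subtlety in the kernel/fork.c hunk below is worth spelling out:
the drain loop in mmdrop_complete() unlinks each mm with preemption
disabled (under get_cpu_var()), but re-enables preemption with
put_cpu_var() before doing the expensive __mmdrop().  A distilled,
comment-annotated sketch of that pattern (drain_one_at_a_time is a
made-up name; the other symbols are the patch's own):

	static int drain_one_at_a_time(void)
	{
		struct list_head *head = &get_cpu_var(delayed_drop_list);
		int ret = 0;

		while (!list_empty(head)) {
			struct mm_struct *mm = list_entry(head->next,
					struct mm_struct, delayed_drop);

			/* Unlink while preemption is disabled ... */
			list_del(&mm->delayed_drop);
			put_cpu_var(delayed_drop_list);

			/* ... but run the heavy teardown preemptible. */
			__mmdrop(mm);
			ret = 1;

			/*
			 * We may have been preempted and migrated to
			 * another CPU above, so re-fetch the current
			 * CPU's list.
			 */
			head = &get_cpu_var(delayed_drop_list);
		}
		put_cpu_var(delayed_drop_list);
		return ret;
	}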

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6..a977b30 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -270,6 +270,9 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
+	/* realtime bits */
+	struct list_head delayed_drop;
+
 	/* Swap token stuff */
 	/*
 	 * Last value of global fault stamp as seen by this process.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e645994..6035a1b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2177,12 +2177,20 @@ extern struct mm_struct * mm_alloc(void);
 
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
 
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
diff --git a/kernel/fork.c b/kernel/fork.c
index 0a4f17f..12f9c64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -38,6 +38,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/tracehook.h>
+#include <linux/interrupt.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
 #include <linux/task_io_accounting_ops.h>
@@ -65,6 +66,8 @@
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -99,6 +102,14 @@ int lockdep_tasklist_lock_is_held(void)
 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
 #endif /* #ifdef CONFIG_PROVE_RCU */
 
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * don't want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
 int nr_processes(void)
 {
 	int cpu;
@@ -192,6 +203,8 @@ void __put_task_struct(struct task_struct *tsk)
 
 void __init fork_init(unsigned long mempages)
 {
+	int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -222,6 +235,9 @@ void __init fork_init(unsigned long mempages)
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+	for (i = 0; i < NR_CPUS; i++)
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
 }
 
 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -307,6 +323,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
+	INIT_LIST_HEAD(&mm->delayed_drop);
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
@@ -1278,7 +1295,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		attach_pid(p, PIDTYPE_SID, task_session(current));
 		list_add_tail(&p->sibling, &p->real_parent->children);
 		list_add_tail_rcu(&p->tasks, &init_task.tasks);
+		preempt_disable();
 		__get_cpu_var(process_counts)++;
+		preempt_enable();
 	}
 	attach_pid(p, PIDTYPE_PID, pid);
 	nr_threads++;
@@ -1752,3 +1771,137 @@ int unshare_files(struct files_struct **displaced)
 	task_unlock(task);
 	return 0;
 }
+
+static int mmdrop_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+
+	head = &get_cpu_var(delayed_drop_list);
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+			struct mm_struct, delayed_drop);
+		list_del(&mm->delayed_drop);
+		put_cpu_var(delayed_drop_list);
+
+		__mmdrop(mm);
+		ret = 1;
+
+		head = &get_cpu_var(delayed_drop_list);
+	}
+	put_cpu_var(delayed_drop_list);
+
+	return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void __mmdrop_delayed(struct mm_struct *mm)
+{
+	struct task_struct *cpu_desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_drop_list);
+	list_add_tail(&mm->delayed_drop, head);
+	cpu_desched_task = __get_cpu_var(desched_task);
+	if (cpu_desched_task)
+		wake_up_process(cpu_desched_task);
+	put_cpu_var(delayed_drop_list);
+}
+
+static void takeover_delayed_drop(int hotcpu)
+{
+	struct list_head *head = &per_cpu(delayed_drop_list, hotcpu);
+
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+				struct mm_struct, delayed_drop);
+
+		list_del(&mm->delayed_drop);
+		__mmdrop_delayed(mm);
+	}
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE;
+	current->extra_flags |= PFE_SOFTIRQ;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		if (mmdrop_complete())
+			continue;
+		schedule();
+
+		/*
+		 * This must be called from time to time on ia64, and is a
+		 * no-op on other archs. Used to be in cpu_idle(), but with
+		 * the new -rt semantics it can't stay there.
+		 */
+		check_pgt_cache();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+
+		BUG_ON(per_cpu(desched_task, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+		p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk(KERN_ERR "desched_thread for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+		per_cpu(desched_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(desched_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+
+		/* Unbind so it can run. Fall thru. */
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_delayed_drop(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index 987e4c3..7cb260b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2906,8 +2906,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
+	/*
+	 * Delay the final freeing of the mm or task, so that we don't have
+	 * to do complex work from within the scheduler:
+	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
@@ -5903,7 +5907,11 @@ void idle_task_exit(void)
 
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+	mmdrop_delayed(mm);
+#else
 	mmdrop(mm);
+#endif
 }
 
 /* called under rq->lock with disabled interrupts */
-- 
1.7.0.4
