From 1ca54bbe52de08550e773961d2b0cbbd012870d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:30:05 -0500
Subject: [PATCH] sched: mmdrop needs to be delayed on -rt

commit 5b6e135f5e1e9e5586ad69e35c96494a4b413a00 in tip.

[PG: upstream per_cpu__ prefix removal (dd17c8f729) caused an implicit
(and hard to spot) shadowing of the percpu desched_task with a
local var of the same name in __mmdrop_delayed, so add a cpu_ prefix]
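
To make the hazard in the note above concrete, here is a minimal
sketch (illustrative only, not part of the patch; shadow_example is a
made-up name):

	static DEFINE_PER_CPU(struct task_struct *, desched_task);

	static void shadow_example(void)
	{
		/*
		 * Before dd17c8f729 the per-CPU symbol was really named
		 * per_cpu__desched_task, so a local called "desched_task"
		 * could not collide with it.  After the prefix removal the
		 * accessor macros reference the plain identifier, so:
		 *
		 *	struct task_struct *desched_task;
		 *	desched_task = __get_cpu_var(desched_task);
		 *
		 * would silently read the (uninitialized) local instead
		 * of the per-CPU variable.  Renaming the local avoids
		 * the collision:
		 */
		struct task_struct *cpu_desched_task;

		cpu_desched_task = __get_cpu_var(desched_task);
		if (cpu_desched_task)
			wake_up_process(cpu_desched_task);
	}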

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
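
One subtlety in the kernel/fork.c hunk below is worth spelling out:
the drain loop in mmdrop_complete() unlinks each mm with preemption
disabled (under get_cpu_var()), but re-enables preemption with
put_cpu_var() before doing the expensive __mmdrop().  A distilled,
comment-annotated sketch of that pattern (drain_one_at_a_time is a
made-up name; the other symbols are the patch's own):

	static int drain_one_at_a_time(void)
	{
		struct list_head *head = &get_cpu_var(delayed_drop_list);
		int ret = 0;

		while (!list_empty(head)) {
			struct mm_struct *mm = list_entry(head->next,
					struct mm_struct, delayed_drop);

			/* Unlink while preemption is disabled ... */
			list_del(&mm->delayed_drop);
			put_cpu_var(delayed_drop_list);

			/* ... but run the heavy teardown preemptible. */
			__mmdrop(mm);
			ret = 1;

			/*
			 * We may have been preempted and migrated to
			 * another CPU above, so re-fetch the current
			 * CPU's list.
			 */
			head = &get_cpu_var(delayed_drop_list);
		}
		put_cpu_var(delayed_drop_list);
		return ret;
	}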

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6..a977b30 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -270,6 +270,9 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
+	/* realtime bits */
+	struct list_head delayed_drop;
+
 	/* Swap token stuff */
 	/*
 	 * Last value of global fault stamp as seen by this process.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e645994..6035a1b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2177,12 +2177,20 @@ extern struct mm_struct * mm_alloc(void);
 
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
 
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
diff --git a/kernel/fork.c b/kernel/fork.c
index 0a4f17f..12f9c64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -38,6 +38,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/tracehook.h>
+#include <linux/interrupt.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
 #include <linux/task_io_accounting_ops.h>
@@ -65,6 +66,8 @@
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -99,6 +102,14 @@ int lockdep_tasklist_lock_is_held(void)
 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
 #endif /* #ifdef CONFIG_PROVE_RCU */
 
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * don't want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
 int nr_processes(void)
 {
 	int cpu;
@@ -192,6 +203,8 @@ void __put_task_struct(struct task_struct *tsk)
 
 void __init fork_init(unsigned long mempages)
 {
+	int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -222,6 +235,9 @@ void __init fork_init(unsigned long mempages)
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+	for (i = 0; i < NR_CPUS; i++)
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
 }
 
 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -307,6 +323,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
+	INIT_LIST_HEAD(&mm->delayed_drop);
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
@@ -1278,7 +1295,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		attach_pid(p, PIDTYPE_SID, task_session(current));
 		list_add_tail(&p->sibling, &p->real_parent->children);
 		list_add_tail_rcu(&p->tasks, &init_task.tasks);
+		preempt_disable();
 		__get_cpu_var(process_counts)++;
+		preempt_enable();
 	}
 	attach_pid(p, PIDTYPE_PID, pid);
 	nr_threads++;
@@ -1752,3 +1771,137 @@ int unshare_files(struct files_struct **displaced)
 	task_unlock(task);
 	return 0;
 }
+
+static int mmdrop_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+
+	head = &get_cpu_var(delayed_drop_list);
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+			struct mm_struct, delayed_drop);
+		list_del(&mm->delayed_drop);
+		put_cpu_var(delayed_drop_list);
+
+		__mmdrop(mm);
+		ret = 1;
+
+		head = &get_cpu_var(delayed_drop_list);
+	}
+	put_cpu_var(delayed_drop_list);
+
+	return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void __mmdrop_delayed(struct mm_struct *mm)
+{
+	struct task_struct *cpu_desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_drop_list);
+	list_add_tail(&mm->delayed_drop, head);
+	cpu_desched_task = __get_cpu_var(desched_task);
+	if (cpu_desched_task)
+		wake_up_process(cpu_desched_task);
+	put_cpu_var(delayed_drop_list);
+}
+
+static void takeover_delayed_drop(int hotcpu)
+{
+	struct list_head *head = &per_cpu(delayed_drop_list, hotcpu);
+
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+				struct mm_struct, delayed_drop);
+
+		list_del(&mm->delayed_drop);
+		__mmdrop_delayed(mm);
+	}
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE;
+	current->extra_flags |= PFE_SOFTIRQ;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		if (mmdrop_complete())
+			continue;
+		schedule();
+
+		/*
+		 * This must be called from time to time on ia64, and is a
+		 * no-op on other archs. Used to be in cpu_idle(), but with
+		 * the new -rt semantics it can't stay there.
+		 */
+		check_pgt_cache();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+
+		BUG_ON(per_cpu(desched_task, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+		p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk(KERN_ERR "desched_thread for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+		per_cpu(desched_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(desched_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+
+		/* Unbind so it can run. Fall thru. */
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_delayed_drop(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index 987e4c3..7cb260b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2906,8 +2906,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
+	/*
+	 * Delay the final freeing of the mm or task, so that we don't have
+	 * to do complex work from within the scheduler:
+	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
@@ -5903,7 +5907,11 @@ void idle_task_exit(void)
 
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+	mmdrop_delayed(mm);
+#else
 	mmdrop(mm);
+#endif
 }
 
 /* called under rq->lock with disabled interrupts */
-- 
1.7.0.4
