From d7da901ab7d409d7fdd76c9964b07b48675a051c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:29:37 -0500
Subject: [PATCH] mm: page_alloc: rt-friendly per-cpu pages
commit ff3fd6afd788760c846a2f4449487debb6c4b0ac in tip.
RT-friendly per-cpu pages: convert the irqs-off per-cpu locking
method into a preemptible method based on explicit per-cpu locks.
Contains fixes from:
Peter Zijlstra <a.p.zijlstra@chello.nl>
Thomas Gleixner <tglx@linutronix.de>
[PG: upstream commit 99dcc3e5a94e muddies the waters for applying the
original; for example, free_zone_pagesets() is gone.]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
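For reference, a minimal userspace sketch of the locking pattern this patch introduces: under PREEMPT_RT an explicit per-cpu lock is taken and the task stays preemptible, while the non-RT build keeps the irqs-off behaviour. Everything in it (MODEL_PREEMPT_RT, pthread mutexes standing in for the per-cpu locks, the merely modelled "irq disable") is an assumption of the sketch, not a kernel API.

/*
 * Userspace model of the __lock_cpu_pcp()/unlock_cpu_pcp() helpers
 * added below.  pthread mutexes stand in for DEFINE_PER_CPU_LOCKED
 * locks; "disabling interrupts" is only modelled, not real.
 */
#include <pthread.h>
#include <stdio.h>

#define MODEL_NR_CPUS		4
#define MODEL_PREEMPT_RT	1	/* set to 0 for the irqs-off variant */

static pthread_mutex_t pcp_locks[MODEL_NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static int pcp_count[MODEL_NR_CPUS];	/* stands in for pcp->count */

static void lock_cpu_pcp(unsigned long *flags, int cpu)
{
#if MODEL_PREEMPT_RT
	/* RT: take an explicit per-cpu lock, stay preemptible */
	pthread_mutex_lock(&pcp_locks[cpu]);
	*flags = 0;			/* flags is unused on this path */
#else
	/* non-RT: would be local_irq_save(*flags) in the kernel */
	*flags = 1;
	(void)cpu;
#endif
}

static void unlock_cpu_pcp(unsigned long flags, int cpu)
{
	(void)flags;			/* only meaningful for the irqs-off variant */
#if MODEL_PREEMPT_RT
	pthread_mutex_unlock(&pcp_locks[cpu]);
#else
	(void)cpu;			/* would be local_irq_restore(flags) */
#endif
}

int main(void)
{
	unsigned long flags;
	int cpu = 0;			/* the kernel helpers pick the cpu themselves */

	lock_cpu_pcp(&flags, cpu);
	pcp_count[cpu]++;		/* touch per-cpu state under the lock */
	unlock_cpu_pcp(flags, cpu);

	printf("cpu %d pcp count: %d\n", cpu, pcp_count[cpu]);
	return 0;
}

It builds with a plain "cc -pthread" and only demonstrates the control flow; the real helpers use spin_lock()/local_irq_save() on per-cpu data, as shown below.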
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8182c8..0d15911 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -187,6 +187,54 @@ static unsigned long __meminitdata dma_reserve;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+ spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+ *flags = 0; /* RT leaves irqs enabled; flags is unused */
+#else
+ local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+ (void)get_cpu_var_locked(pcp_locks, this_cpu);
+ *flags = 0; /* RT leaves irqs enabled; flags is unused */
+#else
+ local_irq_save(*flags);
+ *this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+ put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+ local_irq_restore(flags);
+#endif
+}
+
+/* PG: FIXME - zone_pcp is dead (99dcc3e5a9), so kill these get/put variants? */
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+ lock_cpu_pcp(flags, this_cpu);
+ return per_cpu_ptr(zone->pageset, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+ unlock_cpu_pcp(flags, this_cpu);
+}
+
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
@@ -604,8 +652,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
- int i;
- int bad = 0;
+ int i, this_cpu, bad = 0;
int wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, order);
@@ -624,13 +671,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
- local_irq_save(flags);
+ lock_cpu_pcp(&flags, &this_cpu);
if (unlikely(wasMlocked))
free_page_mlock(page);
- __count_vm_events(PGFREE, 1 << order);
+ count_vm_events(PGFREE, 1 << order);
+ unlock_cpu_pcp(flags, this_cpu);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
- local_irq_restore(flags);
}
/*
@@ -1007,15 +1054,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain;
+ int this_cpu;
- local_irq_save(flags);
+ lock_cpu_pcp(&flags, &this_cpu);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
- local_irq_restore(flags);
+ unlock_cpu_pcp(flags, this_cpu);
}
#endif
@@ -1035,13 +1083,18 @@ static void drain_pages(unsigned int cpu)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- local_irq_save(flags);
+ __lock_cpu_pcp(&flags, cpu);
pset = per_cpu_ptr(zone->pageset, cpu);
+ if (!pset) {
+ unlock_cpu_pcp(flags, cpu);
+ WARN_ON(1);
+ continue;
+ }
pcp = &pset->pcp;
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
- local_irq_restore(flags);
+ unlock_cpu_pcp(flags, cpu);
}
}
@@ -1053,12 +1106,52 @@ void drain_local_pages(void *arg)
drain_pages(smp_processor_id());
}
+#ifdef CONFIG_PREEMPT_RT
+static void drain_local_pages_work(struct work_struct *wrk)
+{
+ drain_pages(smp_processor_id());
+}
+#endif
+
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
*/
void drain_all_pages(void)
{
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * HACK!!!!!
+ * On RT we cannot use IPIs to run drain_local_pages, since
+ * that code takes spinlocks which may now sleep.
+ * But schedule_on_each_cpu calls kzalloc, which can recurse
+ * into the page allocator - the very code that called us.
+ *
+ * Luckily, the condition for getting here is that the order
+ * passed to alloc_pages is greater than 0 (more than a single
+ * page was requested). The slabs only allocate what is needed,
+ * and the allocation made by schedule_on_each_cpu is just
+ * "sizeof(void *)*nr_cpu_ids".
+ *
+ * So it is safe to call schedule_on_each_cpu as long as that
+ * number is less than a page. Otherwise don't bother, but at
+ * least warn about the issue.
+ *
+ * And yes, this is one big hack. Please fix ;-)
+ */
+ if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+ schedule_on_each_cpu(drain_local_pages_work);
+ else {
+ static int once;
+ if (!once) {
+ printk(KERN_ERR "Can't drain all CPUs due to possible recursion\n");
+ once = 1;
+ }
+ drain_local_pages(NULL);
+ }
+
+#else
on_each_cpu(drain_local_pages, NULL, 1);
+#endif
}
#ifdef CONFIG_HIBERNATION
@@ -1104,10 +1197,11 @@ void mark_free_pages(struct zone *zone)
void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
+ struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
- int wasMlocked = __TestClearPageMlocked(page);
+ int this_cpu, wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, 0);
kmemcheck_free_shadow(page, 0);
@@ -1124,12 +1218,13 @@ void free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
+ pset = get_zone_pcp(zone, &flags, &this_cpu);
+ pcp = &pset->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
- local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
- __count_vm_event(PGFREE);
+ count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
@@ -1146,7 +1241,6 @@ void free_hot_cold_page(struct page *page, int cold)
migratetype = MIGRATE_MOVABLE;
}
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -1158,7 +1252,7 @@ void free_hot_cold_page(struct page *page, int cold)
}
out:
- local_irq_restore(flags);
+ put_zone_pcp(zone, flags, this_cpu);
}
/*
@@ -1202,15 +1296,18 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
+ struct per_cpu_pageset *pset;
+ int this_cpu;
again:
+ pset = get_zone_pcp(zone, &flags, &this_cpu);
+
if (likely(order == 0)) {
- struct per_cpu_pages *pcp;
struct list_head *list;
+ struct per_cpu_pages *pcp = &pset->pcp;
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
+
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
@@ -1240,7 +1337,7 @@ again:
*/
WARN_ON_ONCE(order > 1);
}
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
@@ -1250,7 +1347,7 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ put_zone_pcp(zone, flags, this_cpu);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1258,7 +1355,7 @@ again:
return page;
failed:
- local_irq_restore(flags);
+ put_zone_pcp(zone, flags, this_cpu);
return NULL;
}
--
1.7.1.1
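As an aside on the drain_all_pages() fallback above: the guard is "sizeof(void *)*nr_cpu_ids < PAGE_SIZE", so the workqueue-based drain is only attempted when the pointer array schedule_on_each_cpu() allocates fits in a single page. A standalone back-of-the-envelope sketch follows; the 8-byte pointers and 4096-byte pages are assumptions of the sketch (typical of 64-bit builds), not values taken from the patch.

/* Back-of-the-envelope check of the size guard in drain_all_pages(). */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;	/* stand-in for PAGE_SIZE */
	unsigned long nr_cpu_ids;

	for (nr_cpu_ids = 1; nr_cpu_ids <= 1024; nr_cpu_ids *= 2) {
		int uses_workqueue = sizeof(void *) * nr_cpu_ids < page_size;

		printf("nr_cpu_ids=%4lu -> %s\n", nr_cpu_ids,
		       uses_workqueue ? "schedule_on_each_cpu()"
				      : "local drain only");
	}
	return 0;
}

Under those assumptions the schedule_on_each_cpu() path is taken for up to 511 possible CPUs; at 512 (8 * 512 == 4096) and beyond, drain_all_pages() falls back to draining only the local CPU and prints the warning once.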