From d7da901ab7d409d7fdd76c9964b07b48675a051c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:29:37 -0500
Subject: [PATCH] mm: page_alloc: rt-friendly per-cpu pages
commit ff3fd6afd788760c846a2f4449487debb6c4b0ac in tip.
RT-friendly per-cpu pages: convert the irqs-off per-cpu locking
method into a preemptible method based on explicit per-cpu locks.
Contains fixes from:
Peter Zijlstra <a.p.zijlstra@chello.nl>
Thomas Gleixner <tglx@linutronix.de>
[PG: upstream commit 99dcc3e5a94e muddies the waters for applying the
original; for example, free_zone_pagesets() is gone.]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
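For reference, a minimal userspace sketch of the locking pattern this patch introduces: under PREEMPT_RT an explicit per-cpu lock is taken and the task stays preemptible, while the non-RT build keeps the irqs-off behaviour. Everything in it (MODEL_PREEMPT_RT, pthread mutexes standing in for the per-cpu locks, the merely modelled "irq disable") is an assumption of the sketch, not a kernel API.

/*
 * Userspace model of the __lock_cpu_pcp()/unlock_cpu_pcp() helpers
 * added below.  pthread mutexes stand in for DEFINE_PER_CPU_LOCKED
 * locks; "disabling interrupts" is only modelled, not real.
 */
#include <pthread.h>
#include <stdio.h>

#define MODEL_NR_CPUS		4
#define MODEL_PREEMPT_RT	1	/* set to 0 for the irqs-off variant */

static pthread_mutex_t pcp_locks[MODEL_NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static int pcp_count[MODEL_NR_CPUS];	/* stands in for pcp->count */

static void lock_cpu_pcp(unsigned long *flags, int cpu)
{
#if MODEL_PREEMPT_RT
	/* RT: take an explicit per-cpu lock, stay preemptible */
	pthread_mutex_lock(&pcp_locks[cpu]);
	*flags = 0;			/* flags is unused on this path */
#else
	/* non-RT: would be local_irq_save(*flags) in the kernel */
	*flags = 1;
	(void)cpu;
#endif
}

static void unlock_cpu_pcp(unsigned long flags, int cpu)
{
	(void)flags;			/* only meaningful for the irqs-off variant */
#if MODEL_PREEMPT_RT
	pthread_mutex_unlock(&pcp_locks[cpu]);
#else
	(void)cpu;			/* would be local_irq_restore(flags) */
#endif
}

int main(void)
{
	unsigned long flags;
	int cpu = 0;			/* the kernel helpers pick the cpu themselves */

	lock_cpu_pcp(&flags, cpu);
	pcp_count[cpu]++;		/* touch per-cpu state under the lock */
	unlock_cpu_pcp(flags, cpu);

	printf("cpu %d pcp count: %d\n", cpu, pcp_count[cpu]);
	return 0;
}

It builds with a plain "cc -pthread" and only demonstrates the control flow; the real helpers use spin_lock()/local_irq_save() on per-cpu data, as shown below.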
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8182c8..0d15911 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -187,6 +187,54 @@ static unsigned long __meminitdata dma_reserve;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+ spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+ *flags = 0; /* RT leaves irqs enabled; flags is unused */
+#else
+ local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+ (void)get_cpu_var_locked(pcp_locks, this_cpu);
+ *flags = 0; /* RT leaves irqs enabled; flags is unused */
+#else
+ local_irq_save(*flags);
+ *this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+ put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+ local_irq_restore(flags);
+#endif
+}
+
+/* PG: FIXME - zone_pcp is dead (99dcc3e5a9), so kill these get/put variants? */
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+ lock_cpu_pcp(flags, this_cpu);
+ return per_cpu_ptr(zone->pageset, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+ unlock_cpu_pcp(flags, this_cpu);
+}
+
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
@@ -604,8 +652,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
- int i;
- int bad = 0;
+ int i, this_cpu, bad = 0;
int wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, order);
@@ -624,13 +671,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
- local_irq_save(flags);
+ lock_cpu_pcp(&flags, &this_cpu);
if (unlikely(wasMlocked))
free_page_mlock(page);
- __count_vm_events(PGFREE, 1 << order);
+ count_vm_events(PGFREE, 1 << order);
+ unlock_cpu_pcp(flags, this_cpu);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
- local_irq_restore(flags);
}
/*
@@ -1007,15 +1054,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain;
+ int this_cpu;
- local_irq_save(flags);
+ lock_cpu_pcp(&flags, &this_cpu);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
- local_irq_restore(flags);
+ unlock_cpu_pcp(flags, this_cpu);
}
#endif
@@ -1035,13 +1083,18 @@ static void drain_pages(unsigned int cpu)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- local_irq_save(flags);
+ __lock_cpu_pcp(&flags, cpu);
pset = per_cpu_ptr(zone->pageset, cpu);
+ if (!pset) {
+ unlock_cpu_pcp(flags, cpu);
+ WARN_ON(1);
+ continue;
+ }
pcp = &pset->pcp;
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
- local_irq_restore(flags);
+ unlock_cpu_pcp(flags, cpu);
}
}
@@ -1053,12 +1106,52 @@ void drain_local_pages(void *arg)
drain_pages(smp_processor_id());
}
+#ifdef CONFIG_PREEMPT_RT
+static void drain_local_pages_work(struct work_struct *wrk)
+{
+ drain_pages(smp_processor_id());
+}
+#endif
+
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
*/
void drain_all_pages(void)
{
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * HACK!!!!!
+ * On RT we cannot use IPIs to run drain_local_pages, since
+ * that code takes spinlocks which may now sleep.
+ * But schedule_on_each_cpu calls kzalloc, which can recurse
+ * into the page allocator - the very code that called us.
+ *
+ * Luckily, the condition for getting here is that the order
+ * passed to alloc_pages is greater than 0 (more than a single
+ * page was requested). The slabs only allocate what is needed,
+ * and the allocation made by schedule_on_each_cpu is just
+ * "sizeof(void *)*nr_cpu_ids".
+ *
+ * So it is safe to call schedule_on_each_cpu as long as that
+ * number is less than a page. Otherwise don't bother, but at
+ * least warn about the issue.
+ *
+ * And yes, this is one big hack. Please fix ;-)
+ */
+ if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+ schedule_on_each_cpu(drain_local_pages_work);
+ else {
+ static int once;
+ if (!once) {
+ printk(KERN_ERR "Can't drain all CPUs due to possible recursion\n");
+ once = 1;
+ }
+ drain_local_pages(NULL);
+ }
+
+#else
on_each_cpu(drain_local_pages, NULL, 1);
+#endif
}
#ifdef CONFIG_HIBERNATION
@@ -1104,10 +1197,11 @@ void mark_free_pages(struct zone *zone)
void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
+ struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
- int wasMlocked = __TestClearPageMlocked(page);
+ int this_cpu, wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, 0);
kmemcheck_free_shadow(page, 0);
@@ -1124,12 +1218,13 @@ void free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
+ pset = get_zone_pcp(zone, &flags, &this_cpu);
+ pcp = &pset->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
- local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
- __count_vm_event(PGFREE);
+ count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
@@ -1146,7 +1241,6 @@ void free_hot_cold_page(struct page *page, int cold)
migratetype = MIGRATE_MOVABLE;
}
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -1158,7 +1252,7 @@ void free_hot_cold_page(struct page *page, int cold)
}
out:
- local_irq_restore(flags);
+ put_zone_pcp(zone, flags, this_cpu);
}
/*
@@ -1202,15 +1296,18 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
+ struct per_cpu_pageset *pset;
+ int this_cpu;
again:
+ pset = get_zone_pcp(zone, &flags, &this_cpu);
+
if (likely(order == 0)) {
- struct per_cpu_pages *pcp;
struct list_head *list;
+ struct per_cpu_pages *pcp = &pset->pcp;
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
+
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
@@ -1240,7 +1337,7 @@ again:
*/
WARN_ON_ONCE(order > 1);
}
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
@@ -1250,7 +1347,7 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ put_zone_pcp(zone, flags, this_cpu);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1258,7 +1355,7 @@ again:
return page;
failed:
- local_irq_restore(flags);
+ put_zone_pcp(zone, flags, this_cpu);
return NULL;
}
--
1.7.1.1
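As an aside on the drain_all_pages() fallback above: the guard is "sizeof(void *)*nr_cpu_ids < PAGE_SIZE", so the workqueue-based drain is only attempted when the pointer array schedule_on_each_cpu() allocates fits in a single page. A standalone back-of-the-envelope sketch follows; the 8-byte pointers and 4096-byte pages are assumptions of the sketch (typical of 64-bit builds), not values taken from the patch.

/* Back-of-the-envelope check of the size guard in drain_all_pages(). */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;	/* stand-in for PAGE_SIZE */
	unsigned long nr_cpu_ids;

	for (nr_cpu_ids = 1; nr_cpu_ids <= 1024; nr_cpu_ids *= 2) {
		int uses_workqueue = sizeof(void *) * nr_cpu_ids < page_size;

		printf("nr_cpu_ids=%4lu -> %s\n", nr_cpu_ids,
		       uses_workqueue ? "schedule_on_each_cpu()"
				      : "local drain only");
	}
	return 0;
}

Under those assumptions the schedule_on_each_cpu() path is taken for up to 511 possible CPUs; at 512 (8 * 512 == 4096) and beyond, drain_all_pages() falls back to draining only the local CPU and prints the warning once.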