From 0f8fca28f1876cb884b9254fc30fb6f95321f306 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 3 Jul 2009 08:30:39 -0500
Subject: [PATCH] mm: remove kmap_lock

commit b38cb5ad333d0f84adf1825cf24929707ffce57d in tip.

Eradicate global locks.

- kmap_lock is removed by extensive use of atomic_t and a new flush
  scheme; the counter protocol is illustrated by the first sketch
  below.

- pool_lock is removed by using the pkmap index for the
  page_address_maps and modifying set_page_address() to only allow
  NULL<->virt transitions; see the second sketch below.
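
The counter protocol is the subtle part, so below is a minimal
userspace sketch of it (illustrative only, not part of the patch: C11
atomics stand in for the kernel's atomic_t, and the waitqueue and TLB
flush sides are elided):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define LAST_PKMAP 4

	/* 0 - owned exclusively, 1 - free (mapped or not), n - (n-1) users */
	static atomic_int pkmap_count[LAST_PKMAP];

	/* Claim a free slot exclusively: only the 1 -> 0 transition succeeds. */
	static int pkmap_try_free(int pos)
	{
		int expected = 1;
		return atomic_compare_exchange_strong(&pkmap_count[pos],
						      &expected, 0) ? 0 : -1;
	}

	/* Drop a reference; hitting 0 here is a bug, 1 means "free again". */
	static void pkmap_put(int pos)
	{
		switch (atomic_fetch_sub(&pkmap_count[pos], 1) - 1) {
		case 0:
			abort();	/* the kernel does BUG() here */
		case 1:
			break;		/* the kernel wakes pkmap_map_wait here */
		}
	}

	int main(void)
	{
		for (int i = 0; i < LAST_PKMAP; i++)
			atomic_store(&pkmap_count[i], 1);	/* all slots free */

		if (pkmap_try_free(0) == 0) {			/* claim: 1 -> 0 */
			atomic_store(&pkmap_count[0], 2);	/* one user: 0 -> 2 */
			pkmap_put(0);				/* drop: 2 -> 1 */
		}
		printf("slot 0 count: %d\n", atomic_load(&pkmap_count[0]));
		return 0;
	}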
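
A second sketch models the NULL<->virt transition rule that makes the
lockless set_page_address() safe in the WANT_PAGE_VIRTUAL case (again
standalone and illustrative; the struct below is a stand-in, not the
kernel's struct page):

	#include <stdatomic.h>
	#include <stddef.h>
	#include <stdio.h>

	struct page {
		_Atomic(void *) virtual;	/* stand-in for page->virtual */
	};

	/* Returns 1 if the transition was performed, 0 if we lost a race. */
	static int set_page_address(struct page *page, void *address)
	{
		if (address) {
			void *expected = NULL;
			/* install: may only overwrite NULL */
			return atomic_compare_exchange_strong(&page->virtual,
							      &expected, address);
		} else {
			void *old = atomic_load(&page->virtual);
			/* clear: may only succeed against the value we saw */
			return atomic_compare_exchange_strong(&page->virtual,
							      &old, NULL);
		}
	}

	int main(void)
	{
		static char vaddr[1];		/* fake virtual address */
		struct page p = { NULL };

		printf("install: %d\n", set_page_address(&p, vaddr));	/* 1 */
		printf("again:   %d\n", set_page_address(&p, vaddr));	/* 0 */
		printf("clear:   %d\n", set_page_address(&p, NULL));	/* 1 */
		return 0;
	}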

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fc0f76f..9fb57fa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -597,23 +597,39 @@ static __always_inline void *lowmem_page_address(struct page *page)
 #endif
 
 #if defined(WANT_PAGE_VIRTUAL)
-#define page_address(page) ((page)->virtual)
-#define set_page_address(page, address)			\
-	do {						\
-		(page)->virtual = (address);		\
-	} while(0)
-#define page_address_init() do { } while(0)
+/*
+ * wrap page->virtual so it is safe to set/read locklessly
+ */
+#define page_address(page)				\
+	({ typeof((page)->virtual) v = (page)->virtual;	\
+	 smp_read_barrier_depends();			\
+	 v; })
+
+static inline int set_page_address(struct page *page, void *address)
+{
+	if (address)
+		return cmpxchg(&page->virtual, NULL, address) == NULL;
+	else {
+		/*
+		 * cmpxchg is a bit abused because it is not guaranteed
+		 * safe wrt direct assignment on all platforms.
+		 */
+		void *virt = page->virtual;
+		return cmpxchg(&page->virtual, virt, NULL) == virt;
+	}
+}
+void page_address_init(void);
 #endif
 
 #if defined(HASHED_PAGE_VIRTUAL)
 void *page_address(struct page *page);
-void set_page_address(struct page *page, void *virtual);
+int set_page_address(struct page *page, void *virtual);
 void page_address_init(void);
 #endif
 
 #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
 #define page_address(page) lowmem_page_address(page)
-#define set_page_address(page, address) do { } while(0)
+#define set_page_address(page, address) (0)
 #define page_address_init() do { } while(0)
 #endif
 
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627..b8a833a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -14,6 +14,11 @@
  * based on Linus' idea.
  *
  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Largely rewritten to get rid of all global locks
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
  */
 
 #include <linux/mm.h>
@@ -27,17 +32,12 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <asm/tlbflush.h>
+#include <asm/pgtable.h>
 
-/*
- * Virtual_count is not a pure "count".
- *  0 means that it is not mapped, and has not been mapped
- *    since a TLB flush - it is usable.
- *  1 means that there are no users, but it has been mapped
- *    since the last TLB flush - so we can't use it.
- *  n means that there are (n-1) current users of it.
- */
 #ifdef CONFIG_HIGHMEM
 
+static int __set_page_address(struct page *page, void *virtual, int pos);
+
 unsigned long totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(totalhigh_pages);
 
@@ -58,9 +58,14 @@ unsigned int nr_free_highpages (void)
 	return pages;
 }
 
-static int pkmap_count[LAST_PKMAP];
-static unsigned int last_pkmap_nr;
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+/*
+ * count is not a pure "count".
+ *  0 means it's owned exclusively by someone
+ *  1 means it's free for use - either mapped or not.
+ *  n means that there are (n-1) current users of it.
+ */
+static atomic_t pkmap_count[LAST_PKMAP];
+static atomic_t pkmap_hand;
 
 pte_t * pkmap_page_table;
 
@@ -85,131 +90,191 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 	do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
 #endif
 
-static void flush_all_zero_pkmaps(void)
+/*
+ * Try to free a given kmap slot.
+ *
+ * Returns:
+ *  -1 - in use
+ *   0 - free, no TLB flush needed
+ *   1 - free, needs TLB flush
+ */
+static int pkmap_try_free(int pos)
 {
-	int i;
-	int need_flush = 0;
-
-	flush_cache_kmaps();
-
-	for (i = 0; i < LAST_PKMAP; i++) {
-		struct page *page;
+	if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
+		return -1;
+	/*
+	 * TODO: add a young bit to make it CLOCK
+	 */
+	if (!pte_none(pkmap_page_table[pos])) {
+		struct page *page = pte_page(pkmap_page_table[pos]);
+		unsigned long addr = PKMAP_ADDR(pos);
+		pte_t *ptep = &pkmap_page_table[pos];
 
-		/*
-		 * zero means we don't have anything to do,
-		 * >1 means that it is still in use. Only
-		 * a count of 1 means that it is free but
-		 * needs to be unmapped
-		 */
-		if (pkmap_count[i] != 1)
-			continue;
-		pkmap_count[i] = 0;
+		VM_BUG_ON(addr != (unsigned long)page_address(page));
 
-		/* sanity check */
-		BUG_ON(pte_none(pkmap_page_table[i]));
+		if (!__set_page_address(page, NULL, pos))
+			BUG();
+		flush_kernel_dcache_page(page);
+		pte_clear(&init_mm, addr, ptep);
 
-		/*
-		 * Don't need an atomic fetch-and-clear op here;
-		 * no-one has the page mapped, and cannot get at
-		 * its virtual address (and hence PTE) without first
-		 * getting the kmap_lock (which is held here).
-		 * So no dangers, even with speculative execution.
-		 */
-		page = pte_page(pkmap_page_table[i]);
-		pte_clear(&init_mm, (unsigned long)page_address(page),
-			&pkmap_page_table[i]);
 
-		set_page_address(page, NULL);
-		need_flush = 1;
+		return 1;
 	}
-	if (need_flush)
-		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+
+	return 0;
 }
 
-/**
- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
- */
-void kmap_flush_unused(void)
+static inline void pkmap_put(atomic_t *counter)
 {
-	lock_kmap();
-	flush_all_zero_pkmaps();
-	unlock_kmap();
+	switch (atomic_dec_return(counter)) {
+	case 0:
+		BUG();
+
+	case 1:
+		wake_up(&pkmap_map_wait);
+	}
 }
 
-static inline unsigned long map_new_virtual(struct page *page)
+#define TLB_BATCH	32
+
+static int pkmap_get_free(void)
 {
-	unsigned long vaddr;
-	int count;
-
-start:
-	count = LAST_PKMAP;
-	/* Find an empty entry */
-	for (;;) {
-		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
-		if (!last_pkmap_nr) {
-			flush_all_zero_pkmaps();
-			count = LAST_PKMAP;
-		}
-		if (!pkmap_count[last_pkmap_nr])
-			break;	/* Found a usable entry */
-		if (--count)
-			continue;
+	int i, pos, flush;
+	DECLARE_WAITQUEUE(wait, current);
+
+restart:
+	for (i = 0; i < LAST_PKMAP; i++) {
+		pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
+		flush = pkmap_try_free(pos);
+		if (flush >= 0)
+			goto got_one;
+	}
+
+	/*
+	 * wait for somebody else to unmap their entries
+	 */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&pkmap_map_wait, &wait);
+	schedule();
+	remove_wait_queue(&pkmap_map_wait, &wait);
+
+	goto restart;
+
+got_one:
+	if (flush) {
+#if 0
+		flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
+#else
+		int pos2 = (pos + 1) % LAST_PKMAP;
+		int nr;
+		int entries[TLB_BATCH];
 
 		/*
-		 * Sleep for somebody else to unmap their entries
+		 * For those architectures that cannot help but flush the
+		 * whole TLB, flush some more entries to make it worthwhile.
+		 * Scan ahead of the hand to minimise search distances.
 		 */
-		{
-			DECLARE_WAITQUEUE(wait, current);
-
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			add_wait_queue(&pkmap_map_wait, &wait);
-			unlock_kmap();
-			schedule();
-			remove_wait_queue(&pkmap_map_wait, &wait);
-			lock_kmap();
-
-			/* Somebody else might have mapped it while we slept */
-			if (page_address(page))
-				return (unsigned long)page_address(page);
-
-			/* Re-start */
-			goto start;
+		for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
+				i++, pos2 = (pos2 + 1) % LAST_PKMAP) {
+
+			flush = pkmap_try_free(pos2);
+			if (flush < 0)
+				continue;
+
+			if (!flush) {
+				atomic_t *counter = &pkmap_count[pos2];
+				VM_BUG_ON(atomic_read(counter) != 0);
+				atomic_set(counter, 2);
+				pkmap_put(counter);
+			} else
+				entries[nr++] = pos2;
 		}
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+
+		for (i = 0; i < nr; i++) {
+			atomic_t *counter = &pkmap_count[entries[i]];
+			VM_BUG_ON(atomic_read(counter) != 0);
+			atomic_set(counter, 2);
+			pkmap_put(counter);
+		}
+#endif
 	}
-	vaddr = PKMAP_ADDR(last_pkmap_nr);
-	set_pte_at(&init_mm, vaddr,
-		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+	return pos;
+}
+
+static unsigned long pkmap_insert(struct page *page)
+{
+	int pos = pkmap_get_free();
+	unsigned long vaddr = PKMAP_ADDR(pos);
+	pte_t *ptep = &pkmap_page_table[pos];
+	pte_t entry = mk_pte(page, kmap_prot);
+	atomic_t *counter = &pkmap_count[pos];
 
-	pkmap_count[last_pkmap_nr] = 1;
-	set_page_address(page, (void *)vaddr);
+	VM_BUG_ON(atomic_read(counter) != 0);
+
+	set_pte_at(&init_mm, vaddr, ptep, entry);
+	if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
+		/*
+		 * concurrent pkmap_inserts for this page -
+		 * the other won the race, release this entry.
+		 *
+		 * we can still clear the pte without a tlb flush since
		 * it couldn't have been used yet.
+		 */
+		pte_clear(&init_mm, vaddr, ptep);
+		VM_BUG_ON(atomic_read(counter) != 0);
+		atomic_set(counter, 2);
+		pkmap_put(counter);
+		vaddr = 0;
+	} else
+		atomic_set(counter, 2);
 
 	return vaddr;
 }
 
-/**
- * kmap_high - map a highmem page into memory
- * @page: &struct page to map
- *
- * Returns the page's virtual memory address.
- *
- * We cannot call this from interrupts, as it may block.
+/*
+ * Flush all unused kmap mappings in order to remove stray mappings.
  */
+void kmap_flush_unused(void)
+{
+	WARN_ON_ONCE(1);
+}
+
 void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
 
-	/*
-	 * For highmem pages, we can't trust "virtual" until
-	 * after we have the lock.
-	 */
-	lock_kmap();
+again:
 	vaddr = (unsigned long)page_address(page);
+	if (vaddr) {
+		atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
+		if (atomic_inc_not_zero(counter)) {
+			/*
+			 * atomic_inc_not_zero implies a (memory) barrier on success
+			 * so page address will be reloaded.
+			 */
+			unsigned long vaddr2 = (unsigned long)page_address(page);
+			if (likely(vaddr == vaddr2))
+				return (void *)vaddr;
+
+			/*
+			 * Oops, we got someone else.
+			 *
+			 * This can happen if we get preempted after
+			 * page_address() and before atomic_inc_not_zero()
+			 * and during that preemption this slot is freed and
+			 * reused.
+			 */
+			pkmap_put(counter);
+			goto again;
+		}
+	}
+
+	vaddr = pkmap_insert(page);
 	if (!vaddr)
-		vaddr = map_new_virtual(page);
-	pkmap_count[PKMAP_NR(vaddr)]++;
-	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
-	unlock_kmap();
-	return (void*) vaddr;
+		goto again;
+
+	return (void *)vaddr;
 }
 
 EXPORT_SYMBOL(kmap_high);
@@ -240,51 +305,11 @@ void *kmap_high_get(struct page *page)
 }
 #endif
 
-/**
- * kunmap_high - map a highmem page into memory
- * @page: &struct page to unmap
- *
- * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
- * only from user context.
- */
-void kunmap_high(struct page *page)
+void kunmap_high(struct page *page)
 {
-	unsigned long vaddr;
-	unsigned long nr;
-	unsigned long flags;
-	int need_wakeup;
-
-	lock_kmap_any(flags);
-	vaddr = (unsigned long)page_address(page);
+	unsigned long vaddr = (unsigned long)page_address(page);
 	BUG_ON(!vaddr);
-	nr = PKMAP_NR(vaddr);
-
-	/*
-	 * A count must never go down to zero
-	 * without a TLB flush!
-	 */
-	need_wakeup = 0;
-	switch (--pkmap_count[nr]) {
-	case 0:
-		BUG();
-	case 1:
-		/*
-		 * Avoid an unnecessary wake_up() function call.
-		 * The common case is pkmap_count[] == 1, but
-		 * no waiters.
-		 * The tasks queued in the wait-queue are guarded
-		 * by both the lock in the wait-queue-head and by
-		 * the kmap_lock. As the kmap_lock is held here,
-		 * no need for the wait-queue-head's lock. Simply
-		 * test if the queue is empty.
-		 */
-		need_wakeup = waitqueue_active(&pkmap_map_wait);
-	}
-	unlock_kmap_any(flags);
-
-	/* do wake-up, if needed, race-free outside of the spin lock */
-	if (need_wakeup)
-		wake_up(&pkmap_map_wait);
+	pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
 }
 
 EXPORT_SYMBOL(kunmap_high);
@@ -295,19 +320,13 @@ EXPORT_SYMBOL(kunmap_high);
 #define PA_HASH_ORDER	7
 
 /*
- * Describes one page->virtual association
+ * Describes one page->virtual address association.
  */
-struct page_address_map {
+static struct page_address_map {
 	struct page *page;
 	void *virtual;
 	struct list_head list;
-};
-
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool;	/* freelist */
-static spinlock_t pool_lock;			/* protects page_address_pool */
+} page_address_maps[LAST_PKMAP];
 
 /*
  * Hash table bucket
@@ -328,29 +347,37 @@ static struct page_address_slot *page_slot(struct page *page)
  *
  * Returns the page's virtual address.
  */
-void *page_address(struct page *page)
-{
-	unsigned long flags;
-	void *ret;
-	struct page_address_slot *pas;
 
-	if (!PageHighMem(page))
-		return lowmem_page_address(page);
+static void *__page_address(struct page_address_slot *pas, struct page *page)
+{
+	void *ret = NULL;
 
-	pas = page_slot(page);
-	ret = NULL;
-	spin_lock_irqsave(&pas->lock, flags);
 	if (!list_empty(&pas->lh)) {
 		struct page_address_map *pam;
 
 		list_for_each_entry(pam, &pas->lh, list) {
 			if (pam->page == page) {
 				ret = pam->virtual;
-				goto done;
+				break;
 			}
 		}
 	}
-done:
+
+	return ret;
+}
+
+void *page_address(struct page *page)
+{
+	unsigned long flags;
+	void *ret;
+	struct page_address_slot *pas;
+
+	if (!PageHighMem(page))
+		return lowmem_page_address(page);
+
+	pas = page_slot(page);
+	spin_lock_irqsave(&pas->lock, flags);
+	ret = __page_address(pas, page);
 	spin_unlock_irqrestore(&pas->lock, flags);
 	return ret;
 }
@@ -362,62 +389,87 @@ EXPORT_SYMBOL(page_address);
 * @page: &struct page to set
 * @virtual: virtual address to use
 */
-void set_page_address(struct page *page, void *virtual)
+static int __set_page_address(struct page *page, void *virtual, int pos)
 {
+	int ret = 0;
 	unsigned long flags;
 	struct page_address_slot *pas;
 	struct page_address_map *pam;
 
-	BUG_ON(!PageHighMem(page));
+	VM_BUG_ON(!PageHighMem(page));
+	VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
+	VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
 
 	pas = page_slot(page);
-	if (virtual) {		/* Add */
-		BUG_ON(list_empty(&page_address_pool));
-
-		spin_lock_irqsave(&pool_lock, flags);
-		pam = list_entry(page_address_pool.next,
-				struct page_address_map, list);
-		list_del(&pam->list);
-		spin_unlock_irqrestore(&pool_lock, flags);
-
-		pam->page = page;
-		pam->virtual = virtual;
-
-		spin_lock_irqsave(&pas->lock, flags);
-		list_add_tail(&pam->list, &pas->lh);
-		spin_unlock_irqrestore(&pas->lock, flags);
-	} else {		/* Remove */
-		spin_lock_irqsave(&pas->lock, flags);
-		list_for_each_entry(pam, &pas->lh, list) {
-			if (pam->page == page) {
-				list_del(&pam->list);
-				spin_unlock_irqrestore(&pas->lock, flags);
-				spin_lock_irqsave(&pool_lock, flags);
-				list_add_tail(&pam->list, &page_address_pool);
-				spin_unlock_irqrestore(&pool_lock, flags);
-				goto done;
-			}
+	pam = &page_address_maps[pos];
+
+	spin_lock_irqsave(&pas->lock, flags);
+	if (virtual) {		/* add */
+		VM_BUG_ON(!list_empty(&pam->list));
+
+		if (!__page_address(pas, page)) {
+			pam->page = page;
+			pam->virtual = virtual;
+			list_add_tail(&pam->list, &pas->lh);
+			ret = 1;
+		}
+	} else {		/* remove */
+		if (!list_empty(&pam->list)) {
+			list_del_init(&pam->list);
+			ret = 1;
 		}
-		spin_unlock_irqrestore(&pas->lock, flags);
 	}
-done:
-	return;
+	spin_unlock_irqrestore(&pas->lock, flags);
+
+	return ret;
 }
 
-static struct page_address_map page_address_maps[LAST_PKMAP];
+int set_page_address(struct page *page, void *virtual)
+{
+	/*
+	 * set_page_address is not supposed to be called when using
+	 * hashed virtual addresses.
+	 */
+	BUG();
+	return 0;
+}
 
-void __init page_address_init(void)
+void __init __page_address_init(void)
 {
 	int i;
 
-	INIT_LIST_HEAD(&page_address_pool);
 	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
-		list_add(&page_address_maps[i].list, &page_address_pool);
+		INIT_LIST_HEAD(&page_address_maps[i].list);
+
 	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
 		INIT_LIST_HEAD(&page_address_htable[i].lh);
 		spin_lock_init(&page_address_htable[i].lock);
 	}
-	spin_lock_init(&pool_lock);
+}
+
+#elif defined(CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
+
+static int __set_page_address(struct page *page, void *virtual, int pos)
+{
+	return set_page_address(page, virtual);
+}
+
+#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
+
+void __init page_address_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
+		atomic_set(&pkmap_count[i], 1);
+#endif
+
+#ifdef HASHED_PAGE_VIRTUAL
+	__page_address_init();
+#endif
 }
 
 #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-- 
1.7.1.1
