| From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001 |
| From: Andrea Arcangeli <aarcange@redhat.com> |
| Date: Wed, 2 Nov 2011 13:36:59 -0700 |
| Subject: mm: thp: tail page refcounting fix |
| |
| From: Andrea Arcangeli <aarcange@redhat.com> |
| |
| commit 70b50f94f1644e2aa7cb374819cfd93f3c28d725 upstream. |
| |
| Michel, while working on the working set estimation code, noticed
| that calling get_page_unless_zero() on a random pfn_to_page(random_pfn)
| wasn't safe if the pfn ended up being a tail page of a transparent
| hugepage under splitting by __split_huge_page_refcount().
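| 
| For illustration, the unsafe pattern looks roughly like this
| (examine_page() is a hypothetical stand-in for whatever the scanner
| does with the pinned page, not a real kernel function):
| 
|     struct page *page = pfn_to_page(random_pfn);
| 
|     if (get_page_unless_zero(page)) {
|         /*
|          * Before this fix, a THP tail page could have a
|          * transiently elevated _count here, so the speculative
|          * get could succeed on it and then race with
|          * __split_huge_page_refcount().
|          */
|         examine_page(page);
|         put_page(page);
|     }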
| |
| He then found that the problem could also theoretically materialize
| with page_cache_get_speculative() during the speculative radix tree
| lookups, which use get_page_unless_zero() on SMP, if the radix tree
| page is freed and reallocated and get_user_pages() is called on it
| before page_cache_get_speculative() has a chance to call
| get_page_unless_zero().
| |
| So the best way to fix the problem is to keep page_tail->_count zero at
| all times. This will guarantee that get_page_unless_zero() can never
| succeed on any tail page. page_tail->_mapcount is guaranteed zero and
| is unused for all tail pages of a compound page, so we can simply
| account the tail page references there and transfer them to
| page_tail->_count in __split_huge_page_refcount() (in addition to the
| head_page->_mapcount).
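| 
| Schematically (a condensed sketch of the __split_huge_page_refcount()
| hunk below, not a complete listing), the transfer for each tail page
| becomes:
| 
|     /* tail _count was pinned at zero while the page was compound */
|     BUG_ON(atomic_read(&page_tail->_count) != 0);
|     atomic_add(page_mapcount(page) +      /* ptes mapping the head */
|                page_mapcount(page_tail) + /* gup refs parked in _mapcount */
|                1,                         /* base reference */
|                &page_tail->_count);
|     ...
|     /* and the head gives back the refs it held for the tails */
|     atomic_sub(tail_count, &page->_count);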
| |
| While debugging this s/_count/_mapcount/ change I also noticed that
| get_page() is called by direct-io.c on pages returned by
| get_user_pages(). That wasn't entirely safe because the two
| atomic_inc()s in get_page() weren't atomic. Other get_user_pages()
| users, like the secondary-MMU page fault handlers that establish
| shadow pagetables, never call a superfluous get_page() after
| get_user_pages() returns. It's more robust to make get_page()
| universally safe for tail pages and to use get_page_foll() within
| follow_page() (inside get_user_pages()). get_page_foll() is safe to
| do the refcounting for tail pages without taking any locks because it
| runs within PT lock protected critical sections (the PT lock for ptes
| and page_table_lock for pmd_trans_huge).
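| 
| Condensed from the mm/internal.h hunk below, the FOLL_GET path then
| becomes:
| 
|     static inline void get_page_foll(struct page *page)
|     {
|         if (unlikely(PageTail(page)))
|             /*
|              * Safe without the compound_lock: the PT lock held
|              * by the caller keeps __split_huge_page_refcount()
|              * away. This bumps the head's _count and parks the
|              * tail reference in page->_mapcount.
|              */
|             __get_page_tail_foll(page, true);
|         else {
|             VM_BUG_ON(atomic_read(&page->_count) <= 0);
|             atomic_inc(&page->_count);
|         }
|     }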
| |
| The standard get_page(), as invoked by direct-io, will instead now
| take the compound_lock, but still only for tail pages. The direct-io
| paths are usually I/O bound and the compound_lock is per THP, so it is
| very fine-grained and there is no risk of scalability issues with it.
| A simple direct-io benchmark with all the lockdep prove-locking and
| spinlock debugging infrastructure enabled shows identical performance
| and no overhead, so it's worth it. Ideally direct-io should stop
| calling get_page() on pages returned by get_user_pages(). The
| spinlock in get_page() is already optimized away for no-THP builds,
| but even with THP, doing get_page() on tail pages returned by GUP is
| generally a rare operation that usually runs only in I/O paths.
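| 
| The tail-page slow path of get_page() (condensed from the
| __get_page_tail() addition to mm/swap.c below) is, schematically:
| 
|     unsigned long flags;
|     bool got = false;
|     struct page *page_head = compound_trans_head(page);
| 
|     if (page != page_head && get_page_unless_zero(page_head)) {
|         /* per-THP lock: serializes against the splitter */
|         flags = compound_lock_irqsave(page_head);
|         if (likely(PageTail(page))) {
|             /* split didn't win the race; take the tail ref */
|             __get_page_tail_foll(page, false);
|             got = true;
|         }
|         compound_unlock_irqrestore(page_head, flags);
|         if (unlikely(!got))
|             put_page(page_head);
|     }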
| |
| This new refcounting on page_tail->_mapcount, in addition to avoiding
| new RCU critical sections, will also allow the working set estimation
| code to work without any further complexity associated with tail page
| refcounting under THP.
| |
| Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> |
| Reported-by: Michel Lespinasse <walken@google.com> |
| Reviewed-by: Michel Lespinasse <walken@google.com> |
| Reviewed-by: Minchan Kim <minchan.kim@gmail.com> |
| Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Johannes Weiner <jweiner@redhat.com> |
| Cc: Rik van Riel <riel@redhat.com> |
| Cc: Mel Gorman <mgorman@suse.de> |
| Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> |
| Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> |
| Cc: David Gibson <david@gibson.dropbear.id.au> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> |
| |
| --- |
| arch/powerpc/mm/gup.c | 5 +- |
| arch/x86/mm/gup.c | 5 +- |
| include/linux/mm.h | 56 ++++++++++++------------------- |
| include/linux/mm_types.h | 21 +++++++++-- |
| mm/huge_memory.c | 37 ++++++++++++++------ |
| mm/internal.h | 46 ++++++++++++++++++++++++++ |
| mm/memory.c | 2 - |
| mm/swap.c | 83 ++++++++++++++++++++++++++++++----------------- |
| 8 files changed, 171 insertions(+), 84 deletions(-) |
| |
| --- a/arch/powerpc/mm/gup.c |
| +++ b/arch/powerpc/mm/gup.c |
| @@ -22,8 +22,9 @@ static inline void get_huge_page_tail(st |
| * __split_huge_page_refcount() cannot run |
| * from under us. |
| */ |
| - VM_BUG_ON(atomic_read(&page->_count) < 0); |
| - atomic_inc(&page->_count); |
| + VM_BUG_ON(page_mapcount(page) < 0); |
| + VM_BUG_ON(atomic_read(&page->_count) != 0); |
| + atomic_inc(&page->_mapcount); |
| } |
| |
| /* |
| --- a/arch/x86/mm/gup.c |
| +++ b/arch/x86/mm/gup.c |
| @@ -114,8 +114,9 @@ static inline void get_huge_page_tail(st |
| * __split_huge_page_refcount() cannot run |
| * from under us. |
| */ |
| - VM_BUG_ON(atomic_read(&page->_count) < 0); |
| - atomic_inc(&page->_count); |
| + VM_BUG_ON(page_mapcount(page) < 0); |
| + VM_BUG_ON(atomic_read(&page->_count) != 0); |
| + atomic_inc(&page->_mapcount); |
| } |
| |
| static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, |
| --- a/include/linux/mm.h |
| +++ b/include/linux/mm.h |
| @@ -356,36 +356,39 @@ static inline struct page *compound_head |
| return page; |
| } |
| |
| +/* |
| + * The atomic page->_mapcount, starts from -1: so that transitions |
| + * both from it and to it can be tracked, using atomic_inc_and_test |
| + * and atomic_add_negative(-1). |
| + */ |
| +static inline void reset_page_mapcount(struct page *page) |
| +{ |
| + atomic_set(&(page)->_mapcount, -1); |
| +} |
| + |
| +static inline int page_mapcount(struct page *page) |
| +{ |
| + return atomic_read(&(page)->_mapcount) + 1; |
| +} |
| + |
| static inline int page_count(struct page *page) |
| { |
| return atomic_read(&compound_head(page)->_count); |
| } |
| |
| +extern bool __get_page_tail(struct page *page); |
| + |
| static inline void get_page(struct page *page) |
| { |
| + if (unlikely(PageTail(page))) |
| + if (likely(__get_page_tail(page))) |
| + return; |
| /* |
| * Getting a normal page or the head of a compound page |
| - * requires to already have an elevated page->_count. Only if |
| - * we're getting a tail page, the elevated page->_count is |
| - * required only in the head page, so for tail pages the |
| - * bugcheck only verifies that the page->_count isn't |
| - * negative. |
| + * requires to already have an elevated page->_count. |
| */ |
| - VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); |
| + VM_BUG_ON(atomic_read(&page->_count) <= 0); |
| atomic_inc(&page->_count); |
| - /* |
| - * Getting a tail page will elevate both the head and tail |
| - * page->_count(s). |
| - */ |
| - if (unlikely(PageTail(page))) { |
| - /* |
| - * This is safe only because |
| - * __split_huge_page_refcount can't run under |
| - * get_page(). |
| - */ |
| - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); |
| - atomic_inc(&page->first_page->_count); |
| - } |
| } |
| |
| static inline struct page *virt_to_head_page(const void *x) |
| @@ -804,21 +807,6 @@ static inline pgoff_t page_index(struct |
| } |
| |
| /* |
| - * The atomic page->_mapcount, like _count, starts from -1: |
| - * so that transitions both from it and to it can be tracked, |
| - * using atomic_inc_and_test and atomic_add_negative(-1). |
| - */ |
| -static inline void reset_page_mapcount(struct page *page) |
| -{ |
| - atomic_set(&(page)->_mapcount, -1); |
| -} |
| - |
| -static inline int page_mapcount(struct page *page) |
| -{ |
| - return atomic_read(&(page)->_mapcount) + 1; |
| -} |
| - |
| -/* |
| * Return true if this page is mapped into pagetables. |
| */ |
| static inline int page_mapped(struct page *page) |
| --- a/include/linux/mm_types.h |
| +++ b/include/linux/mm_types.h |
| @@ -62,10 +62,23 @@ struct page { |
| struct { |
| |
| union { |
| - atomic_t _mapcount; /* Count of ptes mapped in mms, |
| - * to show when page is mapped |
| - * & limit reverse map searches. |
| - */ |
| + /* |
| + * Count of ptes mapped in |
| + * mms, to show when page is |
| + * mapped & limit reverse map |
| + * searches. |
| + * |
| + * Used also for tail pages |
| + * refcounting instead of |
| + * _count. Tail pages cannot |
| + * be mapped and keeping the |
| + * tail page _count zero at |
| + * all times guarantees |
| + * get_page_unless_zero() will |
| + * never succeed on tail |
| + * pages. |
| + */ |
| + atomic_t _mapcount; |
| |
| struct { |
| unsigned inuse:16; |
| --- a/mm/huge_memory.c |
| +++ b/mm/huge_memory.c |
| @@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struc |
| page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
| VM_BUG_ON(!PageCompound(page)); |
| if (flags & FOLL_GET) |
| - get_page(page); |
| + get_page_foll(page); |
| |
| out: |
| return page; |
| @@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(s |
| unsigned long head_index = page->index; |
| struct zone *zone = page_zone(page); |
| int zonestat; |
| + int tail_count = 0; |
| |
| /* prevent PageLRU to go away from under us, and freeze lru stats */ |
| spin_lock_irq(&zone->lru_lock); |
| @@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(s |
| for (i = 1; i < HPAGE_PMD_NR; i++) { |
| struct page *page_tail = page + i; |
| |
| - /* tail_page->_count cannot change */ |
| - atomic_sub(atomic_read(&page_tail->_count), &page->_count); |
| - BUG_ON(page_count(page) <= 0); |
| - atomic_add(page_mapcount(page) + 1, &page_tail->_count); |
| - BUG_ON(atomic_read(&page_tail->_count) <= 0); |
| + /* tail_page->_mapcount cannot change */ |
| + BUG_ON(page_mapcount(page_tail) < 0); |
| + tail_count += page_mapcount(page_tail); |
| + /* check for overflow */ |
| + BUG_ON(tail_count < 0); |
| + BUG_ON(atomic_read(&page_tail->_count) != 0); |
| + /* |
| + * tail_page->_count is zero and not changing from |
| + * under us. But get_page_unless_zero() may be running |
| + * from under us on the tail_page. If we used |
| + * atomic_set() below instead of atomic_add(), we |
| + * would then run atomic_set() concurrently with |
| + * get_page_unless_zero(), and atomic_set() is |
| + * implemented in C not using locked ops. spin_unlock |
| + * on x86 sometime uses locked ops because of PPro |
| + * errata 66, 92, so unless somebody can guarantee |
| + * atomic_set() here would be safe on all archs (and |
| + * not only on x86), it's safer to use atomic_add(). |
| + */ |
| + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, |
| + &page_tail->_count); |
| |
| /* after clearing PageTail the gup refcount can be released */ |
| smp_mb(); |
| @@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(s |
| (1L << PG_uptodate))); |
| page_tail->flags |= (1L << PG_dirty); |
| |
| - /* |
| - * 1) clear PageTail before overwriting first_page |
| - * 2) clear PageTail before clearing PageHead for VM_BUG_ON |
| - */ |
| + /* clear PageTail before overwriting first_page */ |
| smp_wmb(); |
| |
| /* |
| @@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(s |
| * status is achieved setting a reserved bit in the |
| * pmd, not by clearing the present bit. |
| */ |
| - BUG_ON(page_mapcount(page_tail)); |
| page_tail->_mapcount = page->_mapcount; |
| |
| BUG_ON(page_tail->mapping); |
| @@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(s |
| |
| lru_add_page_tail(zone, page, page_tail); |
| } |
| + atomic_sub(tail_count, &page->_count); |
| + BUG_ON(atomic_read(&page->_count) <= 0); |
| |
| __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
| __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
| --- a/mm/internal.h |
| +++ b/mm/internal.h |
| @@ -37,6 +37,52 @@ static inline void __put_page(struct pag |
| atomic_dec(&page->_count); |
| } |
| |
| +static inline void __get_page_tail_foll(struct page *page, |
| + bool get_page_head) |
| +{ |
| + /* |
| + * If we're getting a tail page, the elevated page->_count is |
| + * required only in the head page and we will elevate the head |
| + * page->_count and tail page->_mapcount. |
| + * |
| + * We elevate page_tail->_mapcount for tail pages to force |
| + * page_tail->_count to be zero at all times to avoid getting |
| + * false positives from get_page_unless_zero() with |
| + * speculative page access (like in |
| + * page_cache_get_speculative()) on tail pages. |
| + */ |
| + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); |
| + VM_BUG_ON(atomic_read(&page->_count) != 0); |
| + VM_BUG_ON(page_mapcount(page) < 0); |
| + if (get_page_head) |
| + atomic_inc(&page->first_page->_count); |
| + atomic_inc(&page->_mapcount); |
| +} |
| + |
| +/* |
| + * This is meant to be called as the FOLL_GET operation of |
| + * follow_page() and it must be called while holding the proper PT |
| + * lock while the pte (or pmd_trans_huge) is still mapping the page. |
| + */ |
| +static inline void get_page_foll(struct page *page) |
| +{ |
| + if (unlikely(PageTail(page))) |
| + /* |
| + * This is safe only because |
| + * __split_huge_page_refcount() can't run under |
| + * get_page_foll() because we hold the proper PT lock. |
| + */ |
| + __get_page_tail_foll(page, true); |
| + else { |
| + /* |
| + * Getting a normal page or the head of a compound page |
| + * requires to already have an elevated page->_count. |
| + */ |
| + VM_BUG_ON(atomic_read(&page->_count) <= 0); |
| + atomic_inc(&page->_count); |
| + } |
| +} |
| + |
| extern unsigned long highest_memmap_pfn; |
| |
| /* |
| --- a/mm/memory.c |
| +++ b/mm/memory.c |
| @@ -1503,7 +1503,7 @@ split_fallthrough: |
| } |
| |
| if (flags & FOLL_GET) |
| - get_page(page); |
| + get_page_foll(page); |
| if (flags & FOLL_TOUCH) { |
| if ((flags & FOLL_WRITE) && |
| !pte_dirty(pte) && !PageDirty(page)) |
| --- a/mm/swap.c |
| +++ b/mm/swap.c |
| @@ -78,39 +78,22 @@ static void put_compound_page(struct pag |
| { |
| if (unlikely(PageTail(page))) { |
| /* __split_huge_page_refcount can run under us */ |
| - struct page *page_head = page->first_page; |
| - smp_rmb(); |
| - /* |
| - * If PageTail is still set after smp_rmb() we can be sure |
| - * that the page->first_page we read wasn't a dangling pointer. |
| - * See __split_huge_page_refcount() smp_wmb(). |
| - */ |
| - if (likely(PageTail(page) && get_page_unless_zero(page_head))) { |
| + struct page *page_head = compound_trans_head(page); |
| + |
| + if (likely(page != page_head && |
| + get_page_unless_zero(page_head))) { |
| unsigned long flags; |
| /* |
| - * Verify that our page_head wasn't converted |
| - * to a a regular page before we got a |
| - * reference on it. |
| + * page_head wasn't a dangling pointer but it |
| + * may not be a head page anymore by the time |
| + * we obtain the lock. That is ok as long as it |
| + * can't be freed from under us. |
| */ |
| - if (unlikely(!PageHead(page_head))) { |
| - /* PageHead is cleared after PageTail */ |
| - smp_rmb(); |
| - VM_BUG_ON(PageTail(page)); |
| - goto out_put_head; |
| - } |
| - /* |
| - * Only run compound_lock on a valid PageHead, |
| - * after having it pinned with |
| - * get_page_unless_zero() above. |
| - */ |
| - smp_mb(); |
| - /* page_head wasn't a dangling pointer */ |
| flags = compound_lock_irqsave(page_head); |
| if (unlikely(!PageTail(page))) { |
| /* __split_huge_page_refcount run before us */ |
| compound_unlock_irqrestore(page_head, flags); |
| VM_BUG_ON(PageHead(page_head)); |
| - out_put_head: |
| if (put_page_testzero(page_head)) |
| __put_single_page(page_head); |
| out_put_single: |
| @@ -121,16 +104,17 @@ static void put_compound_page(struct pag |
| VM_BUG_ON(page_head != page->first_page); |
| /* |
| * We can release the refcount taken by |
| - * get_page_unless_zero now that |
| - * split_huge_page_refcount is blocked on the |
| - * compound_lock. |
| + * get_page_unless_zero() now that |
| + * __split_huge_page_refcount() is blocked on |
| + * the compound_lock. |
| */ |
| if (put_page_testzero(page_head)) |
| VM_BUG_ON(1); |
| /* __split_huge_page_refcount will wait now */ |
| - VM_BUG_ON(atomic_read(&page->_count) <= 0); |
| - atomic_dec(&page->_count); |
| + VM_BUG_ON(page_mapcount(page) <= 0); |
| + atomic_dec(&page->_mapcount); |
| VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
| + VM_BUG_ON(atomic_read(&page->_count) != 0); |
| compound_unlock_irqrestore(page_head, flags); |
| if (put_page_testzero(page_head)) { |
| if (PageHead(page_head)) |
| @@ -160,6 +144,45 @@ void put_page(struct page *page) |
| } |
| EXPORT_SYMBOL(put_page); |
| |
| +/* |
| + * This function is exported but must not be called by anything other |
| + * than get_page(). It implements the slow path of get_page(). |
| + */ |
| +bool __get_page_tail(struct page *page) |
| +{ |
| + /* |
| + * This takes care of get_page() if run on a tail page |
| + * returned by one of the get_user_pages/follow_page variants. |
| + * get_user_pages/follow_page itself doesn't need the compound |
| + * lock because it runs __get_page_tail_foll() under the |
| + * proper PT lock that already serializes against |
| + * split_huge_page(). |
| + */ |
| + unsigned long flags; |
| + bool got = false; |
| + struct page *page_head = compound_trans_head(page); |
| + |
| + if (likely(page != page_head && get_page_unless_zero(page_head))) { |
| + /* |
| + * page_head wasn't a dangling pointer but it |
| + * may not be a head page anymore by the time |
| + * we obtain the lock. That is ok as long as it |
| + * can't be freed from under us. |
| + */ |
| + flags = compound_lock_irqsave(page_head); |
| + /* here __split_huge_page_refcount won't run anymore */ |
| + if (likely(PageTail(page))) { |
| + __get_page_tail_foll(page, false); |
| + got = true; |
| + } |
| + compound_unlock_irqrestore(page_head, flags); |
| + if (unlikely(!got)) |
| + put_page(page_head); |
| + } |
| + return got; |
| +} |
| +EXPORT_SYMBOL(__get_page_tail); |
| + |
| /** |
| * put_pages_list() - release a list of pages |
| * @pages: list of pages threaded on page->lru |