| From: Andrea Arcangeli <aarcange@redhat.com> |
| Date: Thu, 21 Nov 2013 14:32:02 -0800 |
| Subject: mm: hugetlbfs: fix hugetlbfs optimization |
| |
| commit 27c73ae759774e63313c1fbfeb17ba076cea64c5 upstream. |
| |
| Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database |
| caused by THP") can cause dereference of a dangling pointer if |
| split_huge_page runs during PageHuge() if there are updates to the |
| tail_page->private field. |
| |
| Also it is repeating compound_head twice for hugetlbfs and it is running |
| compound_head+compound_trans_head for THP when a single one is needed in |
| both cases. |
| |
| The new code within the PageSlab() check doesn't need to verify that the |
| THP page size is never bigger than the smallest hugetlbfs page size, to |
| avoid memory corruption. |
| |
| A longstanding theoretical race condition was found while fixing the |
| above (see the change right after the skip_unlock label, that is |
| relevant for the compound_lock path too). |
| |
| By re-establishing the _mapcount tail refcounting for all compound |
| pages, this also fixes the below problem: |
| |
| echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages |
| |
| BUG: Bad page state in process bash pfn:59a01 |
| page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0 |
| page flags: 0x1c00000000008000(tail) |
| Modules linked in: |
| CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25 |
| Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 |
| Call Trace: |
| dump_stack+0x55/0x76 |
| bad_page+0xd5/0x130 |
| free_pages_prepare+0x213/0x280 |
| __free_pages+0x36/0x80 |
| update_and_free_page+0xc1/0xd0 |
| free_pool_huge_page+0xc2/0xe0 |
| set_max_huge_pages.part.58+0x14c/0x220 |
| nr_hugepages_store_common.isra.60+0xd0/0xf0 |
| nr_hugepages_store+0x13/0x20 |
| kobj_attr_store+0xf/0x20 |
| sysfs_write_file+0x189/0x1e0 |
| vfs_write+0xc5/0x1f0 |
| SyS_write+0x55/0xb0 |
| system_call_fastpath+0x16/0x1b |
| |
| Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> |
| Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> |
| Tested-by: Khalid Aziz <khalid.aziz@oracle.com> |
| Cc: Pravin Shelar <pshelar@nicira.com> |
| Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| Cc: Ben Hutchings <bhutchings@solarflare.com> |
| Cc: Christoph Lameter <cl@linux.com> |
| Cc: Johannes Weiner <jweiner@redhat.com> |
| Cc: Mel Gorman <mgorman@suse.de> |
| Cc: Rik van Riel <riel@redhat.com> |
| Cc: Andi Kleen <andi@firstfloor.org> |
| Cc: Minchan Kim <minchan@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| [Khalid Aziz: Backported to 3.4] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| include/linux/hugetlb.h | 6 +++ |
| mm/hugetlb.c | 17 ++++++++ |
| mm/swap.c | 107 ++++++++++++++++++++++++++++++++++------------- |
| 3 files changed, 101 insertions(+), 29 deletions(-) |
| |
| --- a/include/linux/hugetlb.h |
| +++ b/include/linux/hugetlb.h |
| @@ -24,6 +24,7 @@ struct hugepage_subpool *hugepage_new_su |
| void hugepage_put_subpool(struct hugepage_subpool *spool); |
| |
| int PageHuge(struct page *page); |
| +int PageHeadHuge(struct page *page_head); |
| |
| void reset_vma_resv_huge_pages(struct vm_area_struct *vma); |
| int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); |
| @@ -87,6 +88,11 @@ static inline int PageHuge(struct page * |
| { |
| return 0; |
| } |
| + |
| +static inline int PageHeadHuge(struct page *page_head) |
| +{ |
| + return 0; |
| +} |
| |
| static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
| { |
| --- a/mm/hugetlb.c |
| +++ b/mm/hugetlb.c |
| @@ -679,6 +679,23 @@ int PageHuge(struct page *page) |
| } |
| EXPORT_SYMBOL_GPL(PageHuge); |
| |
| +/* |
| + * PageHeadHuge() only returns true for hugetlbfs head page, but not for |
| + * normal or transparent huge pages. |
| + */ |
| +int PageHeadHuge(struct page *page_head) |
| +{ |
| + compound_page_dtor *dtor; |
| + |
| + if (!PageHead(page_head)) |
| + return 0; |
| + |
| + dtor = get_compound_page_dtor(page_head); |
| + |
| + return dtor == free_huge_page; |
| +} |
| +EXPORT_SYMBOL_GPL(PageHeadHuge); |
| + |
| pgoff_t __basepage_index(struct page *page) |
| { |
| struct page *page_head = compound_head(page); |
| --- a/mm/swap.c |
| +++ b/mm/swap.c |
| @@ -78,18 +78,6 @@ static void __put_compound_page(struct p |
| |
| static void put_compound_page(struct page *page) |
| { |
| - /* |
| - * hugetlbfs pages cannot be split from under us. If this is a |
| - * hugetlbfs page, check refcount on head page and release the page if |
| - * the refcount becomes zero. |
| - */ |
| - if (PageHuge(page)) { |
| - page = compound_head(page); |
| - if (put_page_testzero(page)) |
| - __put_compound_page(page); |
| - return; |
| - } |
| - |
| if (unlikely(PageTail(page))) { |
| /* __split_huge_page_refcount can run under us */ |
| struct page *page_head = compound_trans_head(page); |
| @@ -97,6 +85,35 @@ static void put_compound_page(struct pag |
| if (likely(page != page_head && |
| get_page_unless_zero(page_head))) { |
| unsigned long flags; |
| + |
| + if (PageHeadHuge(page_head)) { |
| + if (likely(PageTail(page))) { |
| + /* |
| + * __split_huge_page_refcount |
| + * cannot race here. |
| + */ |
| + VM_BUG_ON(!PageHead(page_head)); |
| + atomic_dec(&page->_mapcount); |
| + if (put_page_testzero(page_head)) |
| + VM_BUG_ON(1); |
| + if (put_page_testzero(page_head)) |
| + __put_compound_page(page_head); |
| + return; |
| + } else { |
| + /* |
| + * __split_huge_page_refcount |
| + * run before us, "page" was a |
| + * THP tail. The split |
| + * page_head has been freed |
| + * and reallocated as slab or |
| + * hugetlbfs page of smaller |
| + * order (only possible if |
| + * reallocated as slab on |
| + * x86). |
| + */ |
| + goto skip_lock; |
| + } |
| + } |
| /* |
| * page_head wasn't a dangling pointer but it |
| * may not be a head page anymore by the time |
| @@ -108,9 +125,29 @@ static void put_compound_page(struct pag |
| /* __split_huge_page_refcount run before us */ |
| compound_unlock_irqrestore(page_head, flags); |
| VM_BUG_ON(PageHead(page_head)); |
| - if (put_page_testzero(page_head)) |
| - __put_single_page(page_head); |
| - out_put_single: |
| +skip_lock: |
| + if (put_page_testzero(page_head)) { |
| + /* |
| + * The head page may have been |
| + * freed and reallocated as a |
| + * compound page of smaller |
| + * order and then freed again. |
| + * All we know is that it |
| + * cannot have become: a THP |
| + * page, a compound page of |
| + * higher order, a tail page. |
| + * That is because we still |
| + * hold the refcount of the |
| + * split THP tail and |
| + * page_head was the THP head |
| + * before the split. |
| + */ |
| + if (PageHead(page_head)) |
| + __put_compound_page(page_head); |
| + else |
| + __put_single_page(page_head); |
| + } |
| +out_put_single: |
| if (put_page_testzero(page)) |
| __put_single_page(page); |
| return; |
| @@ -174,21 +211,34 @@ bool __get_page_tail(struct page *page) |
| */ |
| unsigned long flags; |
| bool got = false; |
| - struct page *page_head; |
| + struct page *page_head = compound_trans_head(page); |
| |
| - /* |
| - * If this is a hugetlbfs page it cannot be split under us. Simply |
| - * increment refcount for the head page. |
| - */ |
| - if (PageHuge(page)) { |
| - page_head = compound_head(page); |
| - atomic_inc(&page_head->_count); |
| - got = true; |
| - goto out; |
| - } |
| - |
| - page_head = compound_trans_head(page); |
| if (likely(page != page_head && get_page_unless_zero(page_head))) { |
| + /* Ref to put_compound_page() comment. */ |
| + if (PageHeadHuge(page_head)) { |
| + if (likely(PageTail(page))) { |
| + /* |
| + * This is a hugetlbfs |
| + * page. __split_huge_page_refcount |
| + * cannot race here. |
| + */ |
| + VM_BUG_ON(!PageHead(page_head)); |
| + __get_page_tail_foll(page, false); |
| + return true; |
| + } else { |
| + /* |
| + * __split_huge_page_refcount run |
| + * before us, "page" was a THP |
| + * tail. The split page_head has been |
| + * freed and reallocated as slab or |
| + * hugetlbfs page of smaller order |
| + * (only possible if reallocated as |
| + * slab on x86). |
| + */ |
| + put_page(page_head); |
| + return false; |
| + } |
| + } |
| /* |
| * page_head wasn't a dangling pointer but it |
| * may not be a head page anymore by the time |
| @@ -205,7 +255,6 @@ bool __get_page_tail(struct page *page) |
| if (unlikely(!got)) |
| put_page(page_head); |
| } |
| -out: |
| return got; |
| } |
| EXPORT_SYMBOL(__get_page_tail); |