| From 53a59fc67f97374758e63a9c785891ec62324c81 Mon Sep 17 00:00:00 2001 |
| From: Michal Hocko <mhocko@suse.cz> |
| Date: Fri, 4 Jan 2013 15:35:12 -0800 |
| Subject: mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT |
| |
| From: Michal Hocko <mhocko@suse.cz> |
| |
| commit 53a59fc67f97374758e63a9c785891ec62324c81 upstream. |
| |
| Since commit e303297e6c3a ("mm: extended batches for generic |
| mmu_gather") we are batching pages to be freed until either |
| tlb_next_batch cannot allocate a new batch or we are done. |
| |
| This works just fine most of the time but we can get into trouble with |
| non-preemptible kernel (CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY) |
| on large machines where too aggressive batching might lead to soft |
| lockups during process exit path (exit_mmap) because there are no |
| scheduling points down the free_pages_and_swap_cache path and so the |
| freeing can take long enough to trigger the soft lockup. |
| |
| The lockup is harmless except when the system is setup to panic on |
| softlockup which is not that unusual. |
| |
| The simplest way to work around this issue is to limit the maximum |
| number of batches in a single mmu_gather. 10k of collected pages should |
| be safe to prevent from soft lockups (we would have 2ms for one) even if |
| they are all freed without an explicit scheduling point. |
| |
| This patch doesn't add any new explicit scheduling points because it |
| relies on zap_pmd_range during page tables zapping which calls |
| cond_resched per PMD. |
| |
| The following lockup has been reported for 3.0 kernel with a huge |
| process (in order of hundreds gigs but I don't know any more details). |
| |
| BUG: soft lockup - CPU#56 stuck for 22s! [kernel:31053] |
| Modules linked in: af_packet nfs lockd fscache auth_rpcgss nfs_acl sunrpc mptctl mptbase autofs4 binfmt_misc dm_round_robin dm_multipath bonding cpufreq_conservative cpufreq_userspace cpufreq_powersave pcc_cpufreq mperf microcode fuse loop osst sg sd_mod crc_t10dif st qla2xxx scsi_transport_fc scsi_tgt netxen_nic i7core_edac iTCO_wdt joydev e1000e serio_raw pcspkr edac_core iTCO_vendor_support acpi_power_meter rtc_cmos hpwdt hpilo button container usbhid hid dm_mirror dm_region_hash dm_log linear uhci_hcd ehci_hcd usbcore usb_common scsi_dh_emc scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh dm_snapshot pcnet32 mii edd dm_mod raid1 ext3 mbcache jbd fan thermal processor thermal_sys hwmon cciss scsi_mod |
| Supported: Yes |
| CPU 56 |
| Pid: 31053, comm: kernel Not tainted 3.0.31-0.9-default #1 HP ProLiant DL580 G7 |
| RIP: 0010: _raw_spin_unlock_irqrestore+0x8/0x10 |
| RSP: 0018:ffff883ec1037af0 EFLAGS: 00000206 |
| RAX: 0000000000000e00 RBX: ffffea01a0817e28 RCX: ffff88803ffd9e80 |
| RDX: 0000000000000200 RSI: 0000000000000206 RDI: 0000000000000206 |
| RBP: 0000000000000002 R08: 0000000000000001 R09: ffff887ec724a400 |
| R10: 0000000000000000 R11: dead000000200200 R12: ffffffff8144c26e |
| R13: 0000000000000030 R14: 0000000000000297 R15: 000000000000000e |
| FS: 00007ed834282700(0000) GS:ffff88c03f200000(0000) knlGS:0000000000000000 |
| CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b |
| CR2: 000000000068b240 CR3: 0000003ec13c5000 CR4: 00000000000006e0 |
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 |
| DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 |
| Process kernel (pid: 31053, threadinfo ffff883ec1036000, task ffff883ebd5d4100) |
| Call Trace: |
| release_pages+0xc5/0x260 |
| free_pages_and_swap_cache+0x9d/0xc0 |
| tlb_flush_mmu+0x5c/0x80 |
| tlb_finish_mmu+0xe/0x50 |
| exit_mmap+0xbd/0x120 |
| mmput+0x49/0x120 |
| exit_mm+0x122/0x160 |
| do_exit+0x17a/0x430 |
| do_group_exit+0x3d/0xb0 |
| get_signal_to_deliver+0x247/0x480 |
| do_signal+0x71/0x1b0 |
| do_notify_resume+0x98/0xb0 |
| int_signal+0x12/0x17 |
| DWARF2 unwinder stuck at int_signal+0x12/0x17 |
| |
| Signed-off-by: Michal Hocko <mhocko@suse.cz> |
| Cc: Mel Gorman <mgorman@suse.de> |
| Cc: Rik van Riel <riel@redhat.com> |
| Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| include/asm-generic/tlb.h | 9 +++++++++ |
| mm/memory.c | 5 +++++ |
| 2 files changed, 14 insertions(+) |
| |
| --- a/include/asm-generic/tlb.h |
| +++ b/include/asm-generic/tlb.h |
| @@ -78,6 +78,14 @@ struct mmu_gather_batch { |
| #define MAX_GATHER_BATCH \ |
| ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) |
| |
| +/* |
| + * Limit the maximum number of mmu_gather batches to reduce a risk of soft |
| + * lockups for non-preemptible kernels on huge machines when a lot of memory |
| + * is zapped during unmapping. |
| + * 10K pages freed at once should be safe even without a preemption point. |
| + */ |
| +#define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) |
| + |
| /* struct mmu_gather is an opaque type used by the mm code for passing around |
| * any data needed by arch specific code for tlb_remove_page. |
| */ |
| @@ -96,6 +104,7 @@ struct mmu_gather { |
| struct mmu_gather_batch *active; |
| struct mmu_gather_batch local; |
| struct page *__pages[MMU_GATHER_BUNDLE]; |
| + unsigned int batch_count; |
| }; |
| |
| #define HAVE_GENERIC_MMU_GATHER |
| --- a/mm/memory.c |
| +++ b/mm/memory.c |
| @@ -182,10 +182,14 @@ static int tlb_next_batch(struct mmu_gat |
| return 1; |
| } |
| |
| + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) |
| + return 0; |
| + |
| batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); |
| if (!batch) |
| return 0; |
| |
| + tlb->batch_count++; |
| batch->next = NULL; |
| batch->nr = 0; |
| batch->max = MAX_GATHER_BATCH; |
| @@ -214,6 +218,7 @@ void tlb_gather_mmu(struct mmu_gather *t |
| tlb->local.nr = 0; |
| tlb->local.max = ARRAY_SIZE(tlb->__pages); |
| tlb->active = &tlb->local; |
| + tlb->batch_count = 0; |
| |
| #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
| tlb->batch = NULL; |