| /* |
| * linux/mm/vmscan.c |
| * |
| * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| * |
| * Swap reorganised 29.12.95, Stephen Tweedie. |
| * kswapd added: 7.1.96 sct |
| * Removed kswapd_ctl limits, and swap out as many pages as needed |
| * to bring the system back to freepages.high: 2.4.97, Rik van Riel. |
| * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). |
| * Multiqueue VM started 5.8.00, Rik van Riel. |
| */ |
| |
| #include <linux/mm.h> |
| #include <linux/slab.h> |
| #include <linux/kernel_stat.h> |
| #include <linux/swap.h> |
| #include <linux/smp_lock.h> |
| #include <linux/pagemap.h> |
| #include <linux/init.h> |
| #include <linux/highmem.h> |
| #include <linux/file.h> |
| #include <linux/writeback.h> |
| #include <linux/suspend.h> |
| #include <linux/buffer_head.h> /* for try_to_release_page() */ |
| #include <linux/mm_inline.h> |
| #include <linux/pagevec.h> |
| |
| #include <asm/pgalloc.h> |
| #include <asm/tlbflush.h> |
| #include <linux/swapops.h> |
| |
| /* |
| * The "priority" of VM scanning is how much of the queues we |
| * will scan in one go. A value of 6 for DEF_PRIORITY implies |
| * that we'll scan 1/64th of the queues ("queue_length >> 6") |
| * during a normal aging round. |
| */ |
| #define DEF_PRIORITY (6) |
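
/*
 * try_to_free_pages() starts at DEF_PRIORITY and decrements the priority
 * after each unsuccessful pass, so later passes scan a larger fraction
 * of the inactive list before out_of_memory() is declared.
 */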
| |
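/*
 * The reclaim loops below walk the LRU lists from the tail via ->lru.prev;
 * these helpers prefetch the previous entry's "flags" word to hide part
 * of the cache miss for the next iteration.
 */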
| #ifdef ARCH_HAS_PREFETCH |
| #define prefetch_prev_lru_page(_page, _base, _field) \ |
| do { \ |
| if ((_page)->lru.prev != _base) { \ |
| struct page *prev; \ |
| \ |
| prev = list_entry(_page->lru.prev, \ |
| struct page, lru); \ |
| prefetch(&prev->_field); \ |
| } \ |
| } while (0) |
| #else |
| #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) |
| #endif |
| |
| #ifdef ARCH_HAS_PREFETCHW |
| #define prefetchw_prev_lru_page(_page, _base, _field) \ |
| do { \ |
| if ((_page)->lru.prev != _base) { \ |
| struct page *prev; \ |
| \ |
| prev = list_entry(_page->lru.prev, \ |
| struct page, lru); \ |
| prefetchw(&prev->_field); \ |
| } \ |
| } while (0) |
| #else |
| #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) |
| #endif |
| |
| /* Must be called with page's pte_chain_lock held. */ |
| static inline int page_mapping_inuse(struct page * page) |
| { |
| struct address_space *mapping = page->mapping; |
| |
| /* Page is in somebody's page tables. */ |
| if (page->pte.chain) |
| return 1; |
| |
| /* XXX: does this happen ? */ |
| if (!mapping) |
| return 0; |
| |
| /* File is mmap'd by somebody. */ |
| if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared)) |
| return 1; |
| |
| return 0; |
| } |
| |
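/*
 * A page cache page is freeable here when the only references left are
 * the page cache itself and the reference we took when isolating the page
 * from the LRU; a PagePrivate page is allowed one extra reference for the
 * buffers at page->private.
 */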
| static inline int is_page_cache_freeable(struct page *page) |
| { |
| return page_count(page) - !!PagePrivate(page) == 2; |
| } |
| |
| static /* inline */ int |
| shrink_list(struct list_head *page_list, int nr_pages, |
| unsigned int gfp_mask, int priority, int *max_scan) |
| { |
| struct address_space *mapping; |
| LIST_HEAD(ret_pages); |
| struct pagevec freed_pvec; |
| const int nr_pages_in = nr_pages; |
| int pgactivate = 0; |
| |
| pagevec_init(&freed_pvec); |
| while (!list_empty(page_list)) { |
| struct page *page; |
| int may_enter_fs; |
| |
| page = list_entry(page_list->prev, struct page, lru); |
| list_del(&page->lru); |
| |
| if (TestSetPageLocked(page)) |
| goto keep; |
| |
| BUG_ON(PageActive(page)); |
| may_enter_fs = (gfp_mask & __GFP_FS) || |
| (PageSwapCache(page) && (gfp_mask & __GFP_IO)); |
| if (PageWriteback(page)) { |
| if (may_enter_fs) |
| wait_on_page_writeback(page); /* throttling */ |
| else |
| goto keep_locked; |
| } |
| |
| pte_chain_lock(page); |
| if (page_referenced(page) && page_mapping_inuse(page)) { |
| /* In active use or really unfreeable. Activate it. */ |
| pte_chain_unlock(page); |
| goto activate_locked; |
| } |
| |
| mapping = page->mapping; |
| |
| /* |
| * Anonymous process memory without backing store. Try to |
| * allocate it some swap space here. |
| * |
| * XXX: implement swap clustering ? |
| */ |
| if (page->pte.chain && !mapping && !PagePrivate(page)) { |
| pte_chain_unlock(page); |
| if (!add_to_swap(page)) |
| goto activate_locked; |
| pte_chain_lock(page); |
| } |
| |
| /* |
| * The page is mapped into the page tables of one or more |
| * processes. Try to unmap it here. |
| */ |
| if (page->pte.chain && mapping) { |
| switch (try_to_unmap(page)) { |
| case SWAP_ERROR: |
| case SWAP_FAIL: |
| pte_chain_unlock(page); |
| goto activate_locked; |
| case SWAP_AGAIN: |
| pte_chain_unlock(page); |
| goto keep_locked; |
| case SWAP_SUCCESS: |
| ; /* try to free the page below */ |
| } |
| } |
| pte_chain_unlock(page); |
| |
| /* |
| * FIXME: this is CPU-inefficient for shared mappings. |
| * try_to_unmap() will set the page dirty and ->vm_writeback |
| * will write it. So we're back to page-at-a-time writepage |
| * in LRU order. |
| */ |
| if (PageDirty(page) && is_page_cache_freeable(page) && |
| mapping && may_enter_fs) { |
| int (*writeback)(struct page *, int *); |
| const int cluster_size = SWAP_CLUSTER_MAX; |
| int nr_to_write = cluster_size; |
| |
| writeback = mapping->a_ops->vm_writeback; |
| if (writeback == NULL) |
| writeback = generic_vm_writeback; |
| (*writeback)(page, &nr_to_write); |
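			/* credit written pages against our scan budget */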
| *max_scan -= (cluster_size - nr_to_write); |
| goto keep; |
| } |
| |
| /* |
| * If the page has buffers, try to free the buffer mappings |
| * associated with this page. If we succeed we try to free |
| * the page as well. |
| * |
| * We do this even if the page is PageDirty(). |
| * try_to_release_page() does not perform I/O, but it is |
| * possible for a page to have PageDirty set, but it is actually |
| * clean (all its buffers are clean). This happens if the |
| * buffers were written out directly, with submit_bh(). ext3 |
| * will do this, as well as the blockdev mapping. |
| * try_to_release_page() will discover that cleanness and will |
| * drop the buffers and mark the page clean - it can be freed. |
| * |
| * Rarely, pages can have buffers and no ->mapping. These are |
| * the pages which were not successfully invalidated in |
| * truncate_complete_page(). We try to drop those buffers here |
| * and if that worked, and the page is no longer mapped into |
		 * process address space (page_count == 1) it can be freed.
| * Otherwise, leave the page on the LRU so it is swappable. |
| */ |
| if (PagePrivate(page)) { |
| if (!try_to_release_page(page, 0)) |
| goto keep_locked; |
| if (!mapping && page_count(page) == 1) |
| goto free_it; |
| } |
| |
| if (!mapping) |
| goto keep_locked; /* truncate got there first */ |
| |
| write_lock(&mapping->page_lock); |
| |
| /* |
| * The non-racy check for busy page. It is critical to check |
| * PageDirty _after_ making sure that the page is freeable and |
| * not in use by anybody. (pagecache + us == 2) |
| */ |
| if (page_count(page) != 2 || PageDirty(page)) { |
| write_unlock(&mapping->page_lock); |
| goto keep_locked; |
| } |
| |
| if (PageSwapCache(page)) { |
| swp_entry_t swap = { .val = page->index }; |
| __delete_from_swap_cache(page); |
| write_unlock(&mapping->page_lock); |
| swap_free(swap); |
| } else { |
| __remove_from_page_cache(page); |
| write_unlock(&mapping->page_lock); |
| } |
| __put_page(page); /* The pagecache ref */ |
| free_it: |
| unlock_page(page); |
| nr_pages--; |
| if (!pagevec_add(&freed_pvec, page)) |
| __pagevec_release_nonlru(&freed_pvec); |
| continue; |
| |
| activate_locked: |
| SetPageActive(page); |
| pgactivate++; |
| keep_locked: |
| unlock_page(page); |
| keep: |
| list_add(&page->lru, &ret_pages); |
| BUG_ON(PageLRU(page)); |
| } |
| list_splice(&ret_pages, page_list); |
| if (pagevec_count(&freed_pvec)) |
| __pagevec_release_nonlru(&freed_pvec); |
| KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages); |
| KERNEL_STAT_ADD(pgactivate, pgactivate); |
| return nr_pages; |
| } |
| |
| /* |
 * zone->lru_lock is heavily contended. We relieve it by quickly privatising
| * a batch of pages and working on them outside the lock. Any pages which were |
| * not freed will be added back to the LRU. |
| * |
| * shrink_cache() is passed the number of pages to try to free, and returns |
| * the number which are yet-to-free. |
| * |
| * For pagecache intensive workloads, the first loop here is the hottest spot |
| * in the kernel (apart from the copy_*_user functions). |
| */ |
| static /* inline */ int |
| shrink_cache(int nr_pages, struct zone *zone, |
| unsigned int gfp_mask, int priority, int max_scan) |
| { |
| LIST_HEAD(page_list); |
| struct pagevec pvec; |
| int nr_to_process; |
| |
| /* |
| * Try to ensure that we free `nr_pages' pages in one pass of the loop. |
| */ |
| nr_to_process = nr_pages; |
| if (nr_to_process < SWAP_CLUSTER_MAX) |
| nr_to_process = SWAP_CLUSTER_MAX; |
| |
| pagevec_init(&pvec); |
| |
| lru_add_drain(); |
| spin_lock_irq(&zone->lru_lock); |
| while (max_scan > 0 && nr_pages > 0) { |
| struct page *page; |
| int n = 0; |
| |
| while (n < nr_to_process && !list_empty(&zone->inactive_list)) { |
| page = list_entry(zone->inactive_list.prev, |
| struct page, lru); |
| |
| prefetchw_prev_lru_page(page, |
| &zone->inactive_list, flags); |
| |
| if (!TestClearPageLRU(page)) |
| BUG(); |
| list_del(&page->lru); |
| if (page_count(page) == 0) { |
| /* It is currently in pagevec_release() */ |
| SetPageLRU(page); |
| list_add(&page->lru, &zone->inactive_list); |
| continue; |
| } |
| list_add(&page->lru, &page_list); |
| page_cache_get(page); |
| n++; |
| } |
| zone->nr_inactive -= n; |
| spin_unlock_irq(&zone->lru_lock); |
| |
| if (list_empty(&page_list)) |
| goto done; |
| |
| max_scan -= n; |
| KERNEL_STAT_ADD(pgscan, n); |
| nr_pages = shrink_list(&page_list, nr_pages, |
| gfp_mask, priority, &max_scan); |
| |
| if (nr_pages <= 0 && list_empty(&page_list)) |
| goto done; |
| |
| spin_lock_irq(&zone->lru_lock); |
| /* |
| * Put back any unfreeable pages. |
| */ |
| while (!list_empty(&page_list)) { |
| page = list_entry(page_list.prev, struct page, lru); |
| if (TestSetPageLRU(page)) |
| BUG(); |
| list_del(&page->lru); |
| if (PageActive(page)) |
| add_page_to_active_list(zone, page); |
| else |
| add_page_to_inactive_list(zone, page); |
| if (!pagevec_add(&pvec, page)) { |
| spin_unlock_irq(&zone->lru_lock); |
| __pagevec_release(&pvec); |
| spin_lock_irq(&zone->lru_lock); |
| } |
| } |
| } |
| spin_unlock_irq(&zone->lru_lock); |
| done: |
| pagevec_release(&pvec); |
| return nr_pages; |
| } |
| |
| /* |
| * This moves pages from the active list to the inactive list. |
| * |
| * We move them the other way if the page is referenced by one or more |
| * processes, from rmap. |
| * |
| * If the pages are mostly unmapped, the processing is fast and it is |
| * appropriate to hold zone->lru_lock across the whole operation. But if |
| * the pages are mapped, the processing is slow (page_referenced()) so we |
| * should drop zone->lru_lock around each page. It's impossible to balance |
| * this, so instead we remove the pages from the LRU while processing them. |
| * It is safe to rely on PG_active against the non-LRU pages in here because |
| * nobody will play with that bit on a non-LRU page. |
| * |
| * The downside is that we have to touch page->count against each page. |
| * But we had to alter page->flags anyway. |
| */ |
| static /* inline */ void |
| refill_inactive_zone(struct zone *zone, const int nr_pages_in) |
| { |
| int pgdeactivate = 0; |
| int nr_pages = nr_pages_in; |
| LIST_HEAD(l_hold); /* The pages which were snipped off */ |
| LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ |
| LIST_HEAD(l_active); /* Pages to go onto the active_list */ |
| struct page *page; |
| struct pagevec pvec; |
| |
| lru_add_drain(); |
| spin_lock_irq(&zone->lru_lock); |
| while (nr_pages && !list_empty(&zone->active_list)) { |
| page = list_entry(zone->active_list.prev, struct page, lru); |
| prefetchw_prev_lru_page(page, &zone->active_list, flags); |
| if (!TestClearPageLRU(page)) |
| BUG(); |
| list_del(&page->lru); |
| if (page_count(page) == 0) { |
| /* It is currently in pagevec_release() */ |
| SetPageLRU(page); |
| list_add(&page->lru, &zone->active_list); |
| continue; |
| } |
| page_cache_get(page); |
| list_add(&page->lru, &l_hold); |
| nr_pages--; |
| } |
| spin_unlock_irq(&zone->lru_lock); |
| |
| while (!list_empty(&l_hold)) { |
| page = list_entry(l_hold.prev, struct page, lru); |
| list_del(&page->lru); |
| if (page->pte.chain) { |
| pte_chain_lock(page); |
| if (page->pte.chain && page_referenced(page)) { |
| pte_chain_unlock(page); |
| list_add(&page->lru, &l_active); |
| continue; |
| } |
| pte_chain_unlock(page); |
| } |
| list_add(&page->lru, &l_inactive); |
| pgdeactivate++; |
| } |
| |
| pagevec_init(&pvec); |
| spin_lock_irq(&zone->lru_lock); |
| while (!list_empty(&l_inactive)) { |
| page = list_entry(l_inactive.prev, struct page, lru); |
| prefetchw_prev_lru_page(page, &l_inactive, flags); |
| if (TestSetPageLRU(page)) |
| BUG(); |
| if (!TestClearPageActive(page)) |
| BUG(); |
| list_move(&page->lru, &zone->inactive_list); |
| if (!pagevec_add(&pvec, page)) { |
| spin_unlock_irq(&zone->lru_lock); |
| __pagevec_release(&pvec); |
| spin_lock_irq(&zone->lru_lock); |
| } |
| } |
| while (!list_empty(&l_active)) { |
| page = list_entry(l_active.prev, struct page, lru); |
| prefetchw_prev_lru_page(page, &l_active, flags); |
| if (TestSetPageLRU(page)) |
| BUG(); |
| BUG_ON(!PageActive(page)); |
| list_move(&page->lru, &zone->active_list); |
| if (!pagevec_add(&pvec, page)) { |
| spin_unlock_irq(&zone->lru_lock); |
| __pagevec_release(&pvec); |
| spin_lock_irq(&zone->lru_lock); |
| } |
| } |
| zone->nr_active -= pgdeactivate; |
| zone->nr_inactive += pgdeactivate; |
| spin_unlock_irq(&zone->lru_lock); |
| pagevec_release(&pvec); |
| |
| KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages); |
| KERNEL_STAT_ADD(pgdeactivate, pgdeactivate); |
| } |
| |
| static /* inline */ int |
| shrink_zone(struct zone *zone, int priority, |
| unsigned int gfp_mask, int nr_pages) |
| { |
| unsigned long ratio; |
| int max_scan; |
| |
| /* This is bogus for ZONE_HIGHMEM? */ |
| if (kmem_cache_reap(gfp_mask) >= nr_pages) |
| return 0; |
| |
| /* |
| * Try to keep the active list 2/3 of the size of the cache. And |
| * make sure that refill_inactive is given a decent number of pages. |
| * |
| * The "ratio+1" here is important. With pagecache-intensive workloads |
| * the inactive list is huge, and `ratio' evaluates to zero all the |
| * time. Which pins the active list memory. So we add one to `ratio' |
| * just to make sure that the kernel will slowly sift through the |
| * active list. |
| */ |
| ratio = (unsigned long)nr_pages * zone->nr_active / |
| ((zone->nr_inactive | 1) * 2); |
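	/*
	 * Example, assuming SWAP_CLUSTER_MAX is 32: with nr_pages = 32,
	 * nr_active = 1000 and nr_inactive = 2000 the ratio is
	 * 32 * 1000 / (2001 * 2) = 7, so 8 pages of refill credit accrue
	 * per call and refill_inactive_zone() runs roughly every fourth
	 * call.  When the active list is twice the inactive list (the 2/3
	 * target above), ratio equals nr_pages.
	 */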
| atomic_add(ratio+1, &zone->refill_counter); |
| if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) { |
| atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); |
| refill_inactive_zone(zone, SWAP_CLUSTER_MAX); |
| } |
| |
| max_scan = zone->nr_inactive / priority; |
| nr_pages = shrink_cache(nr_pages, zone, |
| gfp_mask, priority, max_scan); |
| |
| if (nr_pages <= 0) |
| return 0; |
| |
| wakeup_bdflush(); |
| |
| shrink_dcache_memory(priority, gfp_mask); |
| |
	/* After shrinking the dcache, get rid of unused inodes too. */
| shrink_icache_memory(1, gfp_mask); |
| #ifdef CONFIG_QUOTA |
| shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); |
| #endif |
| |
| return nr_pages; |
| } |
| |
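/*
 * Walk the node's zones from the classzone down to the lowest zone,
 * shrinking each zone which is still at or below its pages_high
 * watermark.  Returns the number of pages still to be freed.
 */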
| static int |
| shrink_caches(struct zone *classzone, int priority, |
| int gfp_mask, int nr_pages) |
| { |
| struct zone *first_classzone; |
| struct zone *zone; |
| |
| first_classzone = classzone->zone_pgdat->node_zones; |
| zone = classzone; |
| while (zone >= first_classzone) { |
| if (zone->free_pages <= zone->pages_high) { |
| nr_pages = shrink_zone(zone, priority, |
| gfp_mask, nr_pages); |
| } |
| zone--; |
| } |
| return nr_pages; |
| } |
| |
| /* |
| * This is the main entry point to page reclaim. |
| */ |
| int |
| try_to_free_pages(struct zone *classzone, |
| unsigned int gfp_mask, unsigned int order) |
| { |
| int priority = DEF_PRIORITY; |
| int nr_pages = SWAP_CLUSTER_MAX; |
| |
| KERNEL_STAT_INC(pageoutrun); |
| |
| do { |
| nr_pages = shrink_caches(classzone, priority, |
| gfp_mask, nr_pages); |
| if (nr_pages <= 0) |
| return 1; |
| } while (--priority); |
| out_of_memory(); |
| return 0; |
| } |
| |
| DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); |
| |
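/*
 * Returns 1 if every zone from the classzone down to the lowest zone is
 * still at or below its pages_high watermark (the classzone still needs
 * balancing), 0 as soon as one of them has enough free pages.
 */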
| static int check_classzone_need_balance(struct zone *classzone) |
| { |
| struct zone *first_classzone; |
| |
| first_classzone = classzone->zone_pgdat->node_zones; |
| while (classzone >= first_classzone) { |
| if (classzone->free_pages > classzone->pages_high) |
| return 0; |
| classzone--; |
| } |
| return 1; |
| } |
| |
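/*
 * Scan the node's zones from highest to lowest and run try_to_free_pages()
 * against each zone which has need_balance set.  If reclaim fails, back
 * off for a second so kswapd does not spin when no progress can be made.
 * Returns nonzero if another balancing pass is needed.
 */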
| static int kswapd_balance_pgdat(pg_data_t * pgdat) |
| { |
| int need_more_balance = 0, i; |
| struct zone *zone; |
| |
| for (i = pgdat->nr_zones-1; i >= 0; i--) { |
| zone = pgdat->node_zones + i; |
| cond_resched(); |
| if (!zone->need_balance) |
| continue; |
| if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { |
| zone->need_balance = 0; |
| __set_current_state(TASK_INTERRUPTIBLE); |
| schedule_timeout(HZ); |
| continue; |
| } |
| if (check_classzone_need_balance(zone)) |
| need_more_balance = 1; |
| else |
| zone->need_balance = 0; |
| } |
| |
| return need_more_balance; |
| } |
| |
| static void kswapd_balance(void) |
| { |
| int need_more_balance; |
| pg_data_t * pgdat; |
| |
| do { |
| need_more_balance = 0; |
| pgdat = pgdat_list; |
| do |
| need_more_balance |= kswapd_balance_pgdat(pgdat); |
| while ((pgdat = pgdat->pgdat_next)); |
| } while (need_more_balance); |
| } |
| |
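/*
 * kswapd may go to sleep only when no zone on any node has need_balance
 * set; these two helpers check one node and the whole node list.
 */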
| static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) |
| { |
| struct zone *zone; |
| int i; |
| |
| for (i = pgdat->nr_zones-1; i >= 0; i--) { |
| zone = pgdat->node_zones + i; |
| if (!zone->need_balance) |
| continue; |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| static int kswapd_can_sleep(void) |
| { |
| pg_data_t * pgdat; |
| |
| pgdat = pgdat_list; |
| do { |
| if (kswapd_can_sleep_pgdat(pgdat)) |
| continue; |
| return 0; |
| } while ((pgdat = pgdat->pgdat_next)); |
| |
| return 1; |
| } |
| |
| /* |
| * The background pageout daemon, started as a kernel thread |
| * from the init process. |
| * |
| * This basically trickles out pages so that we have _some_ |
| * free memory available even if there is no other activity |
| * that frees anything up. This is needed for things like routing |
| * etc, where we otherwise might have all activity going on in |
| * asynchronous contexts that cannot page things out. |
| * |
| * If there are applications that are active memory-allocators |
| * (most normal use), this basically shouldn't matter. |
| */ |
| int kswapd(void *unused) |
| { |
| struct task_struct *tsk = current; |
| DECLARE_WAITQUEUE(wait, tsk); |
| |
| daemonize(); |
| strcpy(tsk->comm, "kswapd"); |
| sigfillset(&tsk->blocked); |
| |
| /* |
| * Tell the memory management that we're a "memory allocator", |
| * and that if we need more memory we should get access to it |
| * regardless (see "__alloc_pages()"). "kswapd" should |
| * never get caught in the normal page freeing logic. |
| * |
| * (Kswapd normally doesn't need memory anyway, but sometimes |
| * you need a small amount of memory in order to be able to |
| * page out something else, and this flag essentially protects |
| * us from recursively trying to free more memory as we're |
| * trying to free the first piece of memory in the first place). |
| */ |
| tsk->flags |= PF_MEMALLOC; |
| |
| /* |
| * Kswapd main loop. |
| */ |
| for (;;) { |
| if (current->flags & PF_FREEZE) |
| refrigerator(PF_IOTHREAD); |
| __set_current_state(TASK_INTERRUPTIBLE); |
| add_wait_queue(&kswapd_wait, &wait); |
| |
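		/*
		 * Order the TASK_INTERRUPTIBLE state and wait queue
		 * registration against the need_balance checks below, so a
		 * wakeup issued after zone->need_balance is set cannot be
		 * missed.
		 */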
| mb(); |
| if (kswapd_can_sleep()) |
| schedule(); |
| |
| __set_current_state(TASK_RUNNING); |
| remove_wait_queue(&kswapd_wait, &wait); |
| |
| /* |
| * If we actually get into a low-memory situation, |
| * the processes needing more memory will wake us |
| * up on a more timely basis. |
| */ |
| kswapd_balance(); |
| blk_run_queues(); |
| } |
| } |
| |
| static int __init kswapd_init(void) |
| { |
| printk("Starting kswapd\n"); |
| swap_setup(); |
| kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); |
| return 0; |
| } |
| |
| module_init(kswapd_init) |