| /* |
| * linux/mm/vmscan.c |
| * |
| * The pageout daemon: it decides which pages to evict (swap out) and |
| * does the actual work of freeing them. |
| * |
| * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| * |
| * Swap reorganised 29.12.95, Stephen Tweedie. |
| * kswapd added: 7.1.96 sct |
| * Removed kswapd_ctl limits, and swap out as many pages as needed |
| * to bring the system back to freepages.high: 2.4.97, Rik van Riel. |
| * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). |
| * Multiqueue VM started 5.8.00, Rik van Riel. |
| */ |
| |
| #include <linux/slab.h> |
| #include <linux/kernel_stat.h> |
| #include <linux/swap.h> |
| #include <linux/swapctl.h> |
| #include <linux/smp_lock.h> |
| #include <linux/pagemap.h> |
| #include <linux/init.h> |
| #include <linux/highmem.h> |
| #include <linux/file.h> |
| |
| #include <asm/pgalloc.h> |
| |
| /* |
| * "vm_passes" is the number of vm passes before failing the |
| * memory balancing. Take into account 3 passes are needed |
| * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio |
| * of the inactive list at each pass. |
| */ |
| int vm_passes = 60; |
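| |
| /* |
| * Rough arithmetic, following the comment above and the defaults used |
| * here: one full flush/wait/free sweep of the whole inactive list takes |
| * about 3 * vm_cache_scan_ratio = 18 passes, so vm_passes = 60 allows |
| * roughly three such sweeps before memory balancing is declared failed. |
| */ |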
| |
| /* |
| * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan |
| * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll |
| * scan 1/6 of the inactive lists during a normal aging round. |
| */ |
| int vm_cache_scan_ratio = 6; |
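| |
| /* |
| * For reference, shrink_cache() below derives its per-round scan budget |
| * from this value as roughly: |
| * |
| * max_scan = (nr_inactive_pages + nr_active_pages) / vm_cache_scan_ratio; |
| * |
| * so with the default of 6 about one sixth of the zone's LRU pages get |
| * considered per aging round. |
| */ |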
| |
| /* |
| * "vm_mapped_ratio" controls the pageout rate, the smaller, the earlier |
| * we'll start to pageout. |
| */ |
| int vm_mapped_ratio = 100; |
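| |
| /* |
| * In shrink_cache() below this becomes max_mapped = vm_mapped_ratio * nr_pages: |
| * only after that many mapped (still in-use) pages have been skipped on the |
| * inactive list do we fall back to reaping the slab/VFS caches and calling |
| * swap_out(), so a smaller value reaches for swap earlier. |
| */ |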
| |
| /* |
| * "vm_lru_balance_ratio" controls the balance between active and |
| * inactive cache. The bigger vm_balance is, the easier the |
| * active cache will grow, because we'll rotate the active list |
| * slowly. A value of 2 means we'll go towards a balance of |
| * 1/3 of the cache being inactive. |
| */ |
| int vm_lru_balance_ratio = 2; |
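| |
| /* |
| * Rough illustration, using the refill_inactive() formula below: |
| * |
| * ratio = nr_pages * nr_active / (nr_inactive * vm_lru_balance_ratio + 1) |
| * |
| * With vm_lru_balance_ratio = 2 the deactivation rate matches the |
| * nr_pages freed per round exactly when nr_inactive == nr_active / 2, |
| * so the lists drift towards roughly 1/3 of the cache being inactive. |
| */ |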
| |
| /* |
| * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan |
| * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of |
| * the unused-inode, dentry and dquot caches will be freed during a normal |
| * aging round. |
| */ |
| int vm_vfs_scan_ratio = 6; |
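| |
| /* |
| * The value is passed straight through as the scan divisor, e.g. |
| * |
| * shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); |
| * |
| * in shrink_cache() and try_to_free_pages_zone() below. |
| */ |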
| |
| /* |
| * "vm_anon_lru" select if to immdiatly insert anon pages in the |
| * lru. Immediatly means as soon as they're allocated during the |
| * page faults. |
| * |
| * If this is set to 0, they're inserted only after the first |
| * swapout. |
| * |
| * Having anon pages immediatly inserted in the lru allows the |
| * VM to know better when it's worthwhile to start swapping |
| * anonymous ram, it will start to swap earlier and it should |
| * swap smoother and faster, but it will decrease scalability |
| * on the >16-ways of an order of magnitude. Big SMP/NUMA |
| * definitely can't take an hit on a global spinlock at |
| * every anon page allocation. So this is off by default. |
| * |
| * Low ram machines that swaps all the time want to turn |
| * this on (i.e. set to 1). |
| */ |
| int vm_anon_lru = 0; |
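| |
| /* |
| * Note: the flag is only consulted outside this file, presumably by the |
| * anonymous page fault path (do_anonymous_page() in mm/memory.c) when it |
| * decides whether to call lru_cache_add() on a freshly allocated page. |
| */ |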
| |
| /* |
| * The swap-out function returns 1 if it successfully |
| * scanned all the pages it was asked to (`count'). |
| * It returns zero if it couldn't do anything. |
| * |
| * rss may decrease because pages are shared, but this |
| * doesn't count as having freed a page. |
| */ |
| |
| /* mm->page_table_lock is held. mmap_sem is not held */ |
| static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) |
| { |
| pte_t pte; |
| swp_entry_t entry; |
| |
| /* Don't look at this pte if it's been accessed recently. */ |
| if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { |
| mark_page_accessed(page); |
| return 0; |
| } |
| |
| /* Don't bother unmapping pages that are active */ |
| if (PageActive(page)) |
| return 0; |
| |
| /* Don't bother replenishing zones not under pressure.. */ |
| if (!memclass(page_zone(page), classzone)) |
| return 0; |
| |
| if (TryLockPage(page)) |
| return 0; |
| |
| /* From this point on, the odds are that we're going to |
| * nuke this pte, so read and clear the pte. This hook |
| * is needed on CPUs which update the accessed and dirty |
| * bits in hardware. |
| */ |
| flush_cache_page(vma, address); |
| pte = ptep_get_and_clear(page_table); |
| flush_tlb_page(vma, address); |
| |
| if (pte_dirty(pte)) |
| set_page_dirty(page); |
| |
| /* |
| * Is the page already in the swap cache? If so, then |
| * we can just drop our reference to it without doing |
| * any IO - it's already up-to-date on disk. |
| */ |
| if (PageSwapCache(page)) { |
| entry.val = page->index; |
| swap_duplicate(entry); |
| set_swap_pte: |
| set_pte(page_table, swp_entry_to_pte(entry)); |
| drop_pte: |
| mm->rss--; |
| UnlockPage(page); |
| { |
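| /* |
| * We are about to drop the pte's reference. Report the page as |
| * freeable when, leaving aside a possible buffer reference, only |
| * the swap/page cache and this mapping were still holding it. |
| */ |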
| int freeable = page_count(page) - !!page->buffers <= 2; |
| page_cache_release(page); |
| return freeable; |
| } |
| } |
| |
| /* |
| * Is it a clean page? Then it must be recoverable |
| * by just paging it in again, and we can just drop |
| * it.. or if it's dirty but has backing store, |
| * just mark the page dirty and drop it. |
| * |
| * However, this won't actually free any real |
| * memory, as the page will just be in the page cache |
| * somewhere, and as such we should just continue |
| * our scan. |
| * |
| * Basically, this just makes it possible for us to do |
| * some real work in the future in "refill_inactive()". |
| */ |
| if (page->mapping) |
| goto drop_pte; |
| if (!PageDirty(page)) |
| goto drop_pte; |
| |
| /* |
| * Anonymous buffercache pages can be left behind by |
| * concurrent truncate and pagefault. |
| */ |
| if (page->buffers) |
| goto preserve; |
| |
| /* |
| * This is a dirty, swappable page. First of all, |
| * get a suitable swap entry for it, and make sure |
| * we have the swap cache set up to associate the |
| * page with that swap entry. |
| */ |
| for (;;) { |
| entry = get_swap_page(); |
| if (!entry.val) |
| break; |
| /* Add it to the swap cache and mark it dirty |
| * (adding to the page cache will clear the dirty |
| * and uptodate bits, so we need to do it again) |
| */ |
| if (add_to_swap_cache(page, entry) == 0) { |
| SetPageUptodate(page); |
| set_page_dirty(page); |
| goto set_swap_pte; |
| } |
| /* Raced with "speculative" read_swap_cache_async */ |
| swap_free(entry); |
| } |
| |
| /* No swap space left */ |
| preserve: |
| set_pte(page_table, pte); |
| UnlockPage(page); |
| return 0; |
| } |
| |
| /* mm->page_table_lock is held. mmap_sem is not held */ |
| static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) |
| { |
| pte_t * pte; |
| unsigned long pmd_end; |
| |
| if (pmd_none(*dir)) |
| return count; |
| if (pmd_bad(*dir)) { |
| pmd_ERROR(*dir); |
| pmd_clear(dir); |
| return count; |
| } |
| |
| pte = pte_offset(dir, address); |
| |
| pmd_end = (address + PMD_SIZE) & PMD_MASK; |
| if (end > pmd_end) |
| end = pmd_end; |
| |
| do { |
| if (pte_present(*pte)) { |
| struct page *page = pte_page(*pte); |
| |
| if (VALID_PAGE(page) && !PageReserved(page)) { |
| count -= try_to_swap_out(mm, vma, address, pte, page, classzone); |
| if (!count) { |
| address += PAGE_SIZE; |
| break; |
| } |
| } |
| } |
| address += PAGE_SIZE; |
| pte++; |
| } while (address && (address < end)); |
| mm->swap_address = address; |
| return count; |
| } |
| |
| /* mm->page_table_lock is held. mmap_sem is not held */ |
| static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) |
| { |
| pmd_t * pmd; |
| unsigned long pgd_end; |
| |
| if (pgd_none(*dir)) |
| return count; |
| if (pgd_bad(*dir)) { |
| pgd_ERROR(*dir); |
| pgd_clear(dir); |
| return count; |
| } |
| |
| pmd = pmd_offset(dir, address); |
| |
| pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; |
| if (pgd_end && (end > pgd_end)) |
| end = pgd_end; |
| |
| do { |
| count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); |
| if (!count) |
| break; |
| address = (address + PMD_SIZE) & PMD_MASK; |
| pmd++; |
| } while (address && (address < end)); |
| return count; |
| } |
| |
| /* mm->page_table_lock is held. mmap_sem is not held */ |
| static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) |
| { |
| pgd_t *pgdir; |
| unsigned long end; |
| |
| /* Don't swap out areas which are reserved */ |
| if (vma->vm_flags & VM_RESERVED) |
| return count; |
| |
| pgdir = pgd_offset(mm, address); |
| |
| end = vma->vm_end; |
| BUG_ON(address >= end); |
| do { |
| count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); |
| if (!count) |
| break; |
| address = (address + PGDIR_SIZE) & PGDIR_MASK; |
| pgdir++; |
| } while (address && (address < end)); |
| return count; |
| } |
| |
| /* Placeholder for swap_out(): may be updated by fork.c:mmput() */ |
| struct mm_struct *swap_mm = &init_mm; |
| |
| /* |
| * Returns remaining count of pages to be swapped out by followup call. |
| */ |
| static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) |
| { |
| unsigned long address; |
| struct vm_area_struct* vma; |
| |
| /* |
| * Find the proper vm-area after freezing the vma chain |
| * and ptes. |
| */ |
| spin_lock(&mm->page_table_lock); |
| address = mm->swap_address; |
| if (address == TASK_SIZE || swap_mm != mm) { |
| /* We raced: don't count this mm but try again */ |
| ++*mmcounter; |
| goto out_unlock; |
| } |
| vma = find_vma(mm, address); |
| if (vma) { |
| if (address < vma->vm_start) |
| address = vma->vm_start; |
| |
| for (;;) { |
| count = swap_out_vma(mm, vma, address, count, classzone); |
| vma = vma->vm_next; |
| if (!vma) |
| break; |
| if (!count) |
| goto out_unlock; |
| address = vma->vm_start; |
| } |
| } |
| /* Indicate that we reached the end of address space */ |
| mm->swap_address = TASK_SIZE; |
| |
| out_unlock: |
| spin_unlock(&mm->page_table_lock); |
| return count; |
| } |
| |
| static int FASTCALL(swap_out(zone_t * classzone)); |
| static int fastcall swap_out(zone_t * classzone) |
| { |
| int counter, nr_pages = SWAP_CLUSTER_MAX; |
| struct mm_struct *mm; |
| |
| counter = mmlist_nr << 1; |
| do { |
| if (unlikely(current->need_resched)) { |
| __set_current_state(TASK_RUNNING); |
| schedule(); |
| } |
| |
| spin_lock(&mmlist_lock); |
| mm = swap_mm; |
| while (mm->swap_address == TASK_SIZE || mm == &init_mm) { |
| mm->swap_address = 0; |
| mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); |
| if (mm == swap_mm) |
| goto empty; |
| swap_mm = mm; |
| } |
| |
| /* Make sure the mm doesn't disappear when we drop the lock.. */ |
| atomic_inc(&mm->mm_users); |
| spin_unlock(&mmlist_lock); |
| |
| nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); |
| |
| mmput(mm); |
| |
| if (!nr_pages) |
| return 1; |
| } while (--counter >= 0); |
| |
| return 0; |
| |
| empty: |
| spin_unlock(&mmlist_lock); |
| return 0; |
| } |
| |
| static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); |
| static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)); |
| static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout) |
| { |
| struct list_head * entry; |
| int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio; |
| int max_mapped = vm_mapped_ratio * nr_pages; |
| |
| while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { |
| struct page * page; |
| |
| if (unlikely(current->need_resched)) { |
| spin_unlock(&pagemap_lru_lock); |
| __set_current_state(TASK_RUNNING); |
| schedule(); |
| spin_lock(&pagemap_lru_lock); |
| continue; |
| } |
| |
| page = list_entry(entry, struct page, lru); |
| |
| BUG_ON(!PageLRU(page)); |
| BUG_ON(PageActive(page)); |
| |
| list_del(entry); |
| list_add(entry, &inactive_list); |
| |
| /* |
| * Zero page counts can happen because we unlink the pages |
| * _after_ decrementing the usage count.. |
| */ |
| if (unlikely(!page_count(page))) |
| continue; |
| |
| if (!memclass(page_zone(page), classzone)) |
| continue; |
| |
| max_scan--; |
| |
| /* Racy check to avoid trylocking when not worthwhile */ |
| if (!page->buffers && (page_count(page) != 1 || !page->mapping)) |
| goto page_mapped; |
| |
| /* |
| * The page is locked. IO in progress? |
| * Move it to the back of the list. |
| */ |
| if (unlikely(TryLockPage(page))) { |
| if (PageLaunder(page) && (gfp_mask & __GFP_FS)) { |
| page_cache_get(page); |
| spin_unlock(&pagemap_lru_lock); |
| wait_on_page(page); |
| page_cache_release(page); |
| spin_lock(&pagemap_lru_lock); |
| } |
| continue; |
| } |
| |
| if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { |
| /* |
| * It is not critical here to write it only if |
| * the page is unmapped because any direct writer |
| * like O_DIRECT would set the PG_dirty bitflag |
| * on the physical page after having successfully |
| * pinned it and after the I/O to the page is finished, |
| * so the direct writes to the page cannot get lost. |
| */ |
| int (*writepage)(struct page *); |
| |
| writepage = page->mapping->a_ops->writepage; |
| if ((gfp_mask & __GFP_FS) && writepage) { |
| ClearPageDirty(page); |
| SetPageLaunder(page); |
| page_cache_get(page); |
| spin_unlock(&pagemap_lru_lock); |
| |
| writepage(page); |
| page_cache_release(page); |
| |
| spin_lock(&pagemap_lru_lock); |
| continue; |
| } |
| } |
| |
| /* |
| * If the page has buffers, try to free the buffer mappings |
| * associated with this page. If we succeed we try to free |
| * the page as well. |
| */ |
| if (page->buffers) { |
| spin_unlock(&pagemap_lru_lock); |
| |
| /* avoid freeing a locked page */ |
| page_cache_get(page); |
| |
| if (try_to_release_page(page, gfp_mask)) { |
| if (!page->mapping) { |
| /* |
| * We must not allow an anon page |
| * with no buffers to be visible on |
| * the LRU, so we unlock the page after |
| * taking the lru lock |
| */ |
| spin_lock(&pagemap_lru_lock); |
| UnlockPage(page); |
| __lru_cache_del(page); |
| |
| /* effectively free the page here */ |
| page_cache_release(page); |
| |
| if (--nr_pages) |
| continue; |
| break; |
| } else { |
| /* |
| * The page is still in the pagecache, so undo the |
| * reference taken before try_to_release_page: we're |
| * not finished with it yet and can now try the next step. |
| */ |
| page_cache_release(page); |
| |
| spin_lock(&pagemap_lru_lock); |
| } |
| } else { |
| /* failed to drop the buffers so stop here */ |
| UnlockPage(page); |
| page_cache_release(page); |
| |
| spin_lock(&pagemap_lru_lock); |
| continue; |
| } |
| } |
| |
| spin_lock(&pagecache_lock); |
| |
| /* |
| * This is the non-racy check for a busy page. |
| * It is critical to check PageDirty _after_ we've made sure |
| * the page is freeable, i.e. not in use by anybody. |
| * At this point we're guaranteed that page->buffers is NULL; |
| * nobody can refill page->buffers under us because we still |
| * hold the page lock. |
| */ |
| if (!page->mapping || page_count(page) > 1) { |
| spin_unlock(&pagecache_lock); |
| UnlockPage(page); |
| page_mapped: |
| if (--max_mapped < 0) { |
| spin_unlock(&pagemap_lru_lock); |
| |
| nr_pages -= kmem_cache_reap(gfp_mask); |
| if (nr_pages <= 0) |
| goto out; |
| |
| shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); |
| shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask); |
| #ifdef CONFIG_QUOTA |
| shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask); |
| #endif |
| |
| if (!*failed_swapout) |
| *failed_swapout = !swap_out(classzone); |
| |
| max_mapped = nr_pages * vm_mapped_ratio; |
| |
| spin_lock(&pagemap_lru_lock); |
| refill_inactive(nr_pages, classzone); |
| } |
| continue; |
| |
| } |
| smp_rmb(); |
| if (PageDirty(page)) { |
| spin_unlock(&pagecache_lock); |
| UnlockPage(page); |
| continue; |
| } |
| |
| __lru_cache_del(page); |
| |
| /* point of no return */ |
| if (likely(!PageSwapCache(page))) { |
| __remove_inode_page(page); |
| spin_unlock(&pagecache_lock); |
| } else { |
| swp_entry_t swap; |
| swap.val = page->index; |
| __delete_from_swap_cache(page); |
| spin_unlock(&pagecache_lock); |
| swap_free(swap); |
| } |
| |
| UnlockPage(page); |
| |
| /* effectively free the page here */ |
| page_cache_release(page); |
| |
| if (--nr_pages) |
| continue; |
| break; |
| } |
| spin_unlock(&pagemap_lru_lock); |
| |
| out: |
| return nr_pages; |
| } |
| |
| /* |
| * This moves unreferenced pages from the tail of the |
| * active list to the inactive list. |
| * |
| * Pages that still have their referenced bit set are |
| * instead rotated back to the head of the active list. |
| */ |
| static void fastcall refill_inactive(int nr_pages, zone_t * classzone) |
| { |
| struct list_head * entry; |
| unsigned long ratio; |
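| /* |
| * "ratio" is how many pages we try to deactivate this round: it scales |
| * with nr_active relative to vm_lru_balance_ratio times nr_inactive |
| * (the "+ 1" merely avoids a division by zero), so refilling slows down |
| * as the inactive list grows towards the configured balance. |
| */ |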
| |
| ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1); |
| |
| entry = active_list.prev; |
| while (ratio && entry != &active_list) { |
| struct page * page; |
| |
| page = list_entry(entry, struct page, lru); |
| entry = entry->prev; |
| if (PageTestandClearReferenced(page)) { |
| list_del(&page->lru); |
| list_add(&page->lru, &active_list); |
| continue; |
| } |
| |
| ratio--; |
| |
| del_page_from_active_list(page); |
| add_page_to_inactive_list(page); |
| SetPageReferenced(page); |
| } |
| |
| if (entry != &active_list) { |
| list_del(&active_list); |
| list_add(&active_list, entry); |
| } |
| } |
| |
| static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)); |
| static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout) |
| { |
| nr_pages -= kmem_cache_reap(gfp_mask); |
| if (nr_pages <= 0) |
| goto out; |
| |
| spin_lock(&pagemap_lru_lock); |
| refill_inactive(nr_pages, classzone); |
| |
| nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout); |
| |
| out: |
| return nr_pages; |
| } |
| |
| static int check_classzone_need_balance(zone_t * classzone); |
| |
| int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask) |
| { |
| gfp_mask = pf_gfp_mask(gfp_mask); |
| |
| for (;;) { |
| int tries = vm_passes; |
| int failed_swapout = !(gfp_mask & __GFP_IO); |
| int nr_pages = SWAP_CLUSTER_MAX; |
| |
| do { |
| nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout); |
| if (nr_pages <= 0) |
| return 1; |
| shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); |
| shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask); |
| #ifdef CONFIG_QUOTA |
| shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask); |
| #endif |
| if (!failed_swapout) |
| failed_swapout = !swap_out(classzone); |
| } while (--tries); |
| |
| #ifdef CONFIG_OOM_KILLER |
| out_of_memory(); |
| #else |
| if (likely(current->pid != 1)) |
| break; |
| if (!check_classzone_need_balance(classzone)) |
| break; |
| |
| __set_current_state(TASK_RUNNING); |
| yield(); |
| #endif |
| } |
| |
| return 0; |
| } |
| |
| int fastcall try_to_free_pages(unsigned int gfp_mask) |
| { |
| pg_data_t *pgdat; |
| zonelist_t *zonelist; |
| unsigned long pf_free_pages; |
| int error = 0; |
| |
| pf_free_pages = current->flags & PF_FREE_PAGES; |
| current->flags &= ~PF_FREE_PAGES; |
| |
| for_each_pgdat(pgdat) { |
| zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); |
| error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask); |
| } |
| |
| current->flags |= pf_free_pages; |
| return error; |
| } |
| |
| DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); |
| |
| static int check_classzone_need_balance(zone_t * classzone) |
| { |
| zone_t * first_zone; |
| int class_idx = zone_idx(classzone); |
| |
| first_zone = classzone->zone_pgdat->node_zones; |
| while (classzone >= first_zone) { |
| if (classzone->free_pages > classzone->watermarks[class_idx].high) |
| return 0; |
| classzone--; |
| } |
| return 1; |
| } |
| |
| static int kswapd_balance_pgdat(pg_data_t * pgdat) |
| { |
| int need_more_balance = 0, i; |
| zone_t * zone; |
| |
| for (i = pgdat->nr_zones-1; i >= 0; i--) { |
| zone = pgdat->node_zones + i; |
| if (unlikely(current->need_resched)) |
| schedule(); |
| if (!zone->need_balance || !zone->size) |
| continue; |
| if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) { |
| zone->need_balance = 0; |
| __set_current_state(TASK_INTERRUPTIBLE); |
| schedule_timeout(HZ*5); |
| continue; |
| } |
| if (check_classzone_need_balance(zone)) |
| need_more_balance = 1; |
| else |
| zone->need_balance = 0; |
| } |
| |
| return need_more_balance; |
| } |
| |
| static void kswapd_balance(void) |
| { |
| int need_more_balance; |
| pg_data_t * pgdat; |
| |
| do { |
| need_more_balance = 0; |
| |
| for_each_pgdat(pgdat) |
| need_more_balance |= kswapd_balance_pgdat(pgdat); |
| } while (need_more_balance); |
| } |
| |
| static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) |
| { |
| zone_t * zone; |
| int i; |
| |
| for (i = pgdat->nr_zones-1; i >= 0; i--) { |
| zone = pgdat->node_zones + i; |
| if (!zone->need_balance || !zone->size) |
| continue; |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| static int kswapd_can_sleep(void) |
| { |
| pg_data_t * pgdat; |
| |
| for_each_pgdat(pgdat) { |
| if (!kswapd_can_sleep_pgdat(pgdat)) |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| /* |
| * The background pageout daemon, started as a kernel thread |
| * from the init process. |
| * |
| * This basically trickles out pages so that we have _some_ |
| * free memory available even if there is no other activity |
| * that frees anything up. This is needed for things like routing |
| * etc, where we otherwise might have all activity going on in |
| * asynchronous contexts that cannot page things out. |
| * |
| * If there are applications that are active memory-allocators |
| * (most normal use), this basically shouldn't matter. |
| */ |
| int kswapd(void *unused) |
| { |
| struct task_struct *tsk = current; |
| DECLARE_WAITQUEUE(wait, tsk); |
| |
| daemonize(); |
| strcpy(tsk->comm, "kswapd"); |
| sigfillset(&tsk->blocked); |
| |
| /* |
| * Tell the memory management that we're a "memory allocator", |
| * and that if we need more memory we should get access to it |
| * regardless (see "__alloc_pages()"). "kswapd" should |
| * never get caught in the normal page freeing logic. |
| * |
| * (Kswapd normally doesn't need memory anyway, but sometimes |
| * you need a small amount of memory in order to be able to |
| * page out something else, and this flag essentially protects |
| * us from recursively trying to free more memory as we're |
| * trying to free the first piece of memory in the first place). |
| */ |
| tsk->flags |= PF_MEMALLOC; |
| |
| /* |
| * Kswapd main loop. |
| */ |
| for (;;) { |
| __set_current_state(TASK_INTERRUPTIBLE); |
| add_wait_queue(&kswapd_wait, &wait); |
| |
| mb(); |
| if (kswapd_can_sleep()) |
| schedule(); |
| |
| __set_current_state(TASK_RUNNING); |
| remove_wait_queue(&kswapd_wait, &wait); |
| |
| /* |
| * If we actually get into a low-memory situation, |
| * the processes needing more memory will wake us |
| * up on a more timely basis. |
| */ |
| kswapd_balance(); |
| run_task_queue(&tq_disk); |
| } |
| } |
| |
| static int __init kswapd_init(void) |
| { |
| printk("Starting kswapd\n"); |
| swap_setup(); |
| kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); |
| return 0; |
| } |
| |
| module_init(kswapd_init) |