/*
* mm/rmap.c - physical to virtual reverse mappings
*
* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
* Released under the General Public License (GPL).
*
*
* Simple, low overhead pte-based reverse mapping scheme.
* This is kept modular because we may want to experiment
* with object-based reverse mapping schemes. Please try
* to keep this thing as modular as possible.
*/
/*
* Locking:
* - the page->pte.chain is protected by the PG_chainlock bit,
* which nests within the mm->page_table_lock,
* which nests within the page lock.
* - because swapout locking is opposite to the locking order
* in the page fault path, the swapout path uses trylocks
* on the mm->page_table_lock
*/
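/*
 * Sketch of the resulting lock ordering (illustrative only, not an
 * actual call site).  A faulting path can take the locks outside-in:
 *
 *	lock_page(page);
 *	spin_lock(&mm->page_table_lock);
 *	pte_chain_lock(page);
 *
 * The swapout path already holds the page lock and the pte_chain_lock,
 * so it must use spin_trylock(&mm->page_table_lock) and give up with
 * SWAP_AGAIN on failure rather than deadlock against a fault.
 */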
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap-locking.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <asm/pgalloc.h>
#include <asm/rmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
/* #define DEBUG_RMAP */
/*
* Shared pages have a chain of pte_chain structures, used to locate
* all the mappings to this page. We only need a pointer to the pte
* here; the page struct for the page table page contains the process
* it belongs to and the offset within that process.
*
* We use an array of pte pointers in this structure to minimise cache misses
* while traversing reverse maps.
*/
#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
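/*
 * Worked example, assuming 64-byte cache lines, a 4-byte unsigned long
 * and a 4-byte pte_addr_t: NRPTE = (64 - 4)/4 = 15, so a pte_chain
 * (one unsigned long plus NRPTE ptes) fills exactly one cache line.
 */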
/*
* next_and_idx encodes both the address of the next pte_chain and the
* offset of the highest-index used pte in ptes[].
*/
struct pte_chain {
unsigned long next_and_idx;
pte_addr_t ptes[NRPTE];
} ____cacheline_aligned;
kmem_cache_t *pte_chain_cache;
static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
{
return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
}
static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
{
return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
}
static inline int pte_chain_idx(struct pte_chain *pte_chain)
{
return pte_chain->next_and_idx & NRPTE;
}
static inline unsigned long
pte_chain_encode(struct pte_chain *pte_chain, int idx)
{
return (unsigned long)pte_chain | idx;
}
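/*
 * Minimal sketch of the encoding round-trip (illustrative only).  With
 * NRPTE == 15, the low four bits of a cacheline-aligned pte_chain
 * pointer are zero, which is what makes the "| idx" packing and the
 * "& ~NRPTE" unpacking safe:
 *
 *	unsigned long enc = pte_chain_encode(pc, 3);
 *	BUG_ON(pte_chain_ptr(enc) != pc);	(pointer recovered)
 *	BUG_ON((enc & NRPTE) != 3);		(index recovered)
 */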
/*
* pte_chain list management policy:
*
* - If a page has a pte_chain list then it is shared by at least two processes,
* because a single sharing uses PageDirect. (Well, this isn't true yet,
* coz this code doesn't collapse singletons back to PageDirect on the remove
* path).
* - A pte_chain list has free space only in the head member - all succeeding
* members are 100% full.
* - If the head member has free space, it occurs in its leading slots, i.e.
*   all free space in the pte_chain is at the start of the head member.
* - Insertion into the pte_chain puts a pte pointer in the last free slot of
* the head member.
* - Removal from a pte chain moves the head pte of the head member onto the
* victim pte and frees the head member if it became empty.
*/
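/*
 * Example of the policy with a hypothetical NRPTE of 4: a head member
 * holding two ptes looks like
 *
 *	ptes[] = { 0, 0, p1, p2 }		(idx == 2)
 *
 * Inserting p3 fills the last free slot:
 *
 *	ptes[] = { 0, p3, p1, p2 }		(idx == 1)
 *
 * Removing p1 copies the head pte (p3) over the victim slot and clears
 * the old head slot:
 *
 *	ptes[] = { 0, 0, p3, p2 }		(idx == 2)
 */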
/**
** VM stuff below this comment
**/
/**
* page_referenced - test if the page was referenced
* @page: the page to test
*
* Quick test_and_clear_referenced for all mappings to a page;
* returns the number of processes which referenced the page.
* Caller needs to hold the pte_chain_lock.
*
* If the page has a single-entry pte_chain, collapse that back to a PageDirect
* representation. This way, it's only done under memory pressure.
*/
int page_referenced(struct page * page)
{
struct pte_chain *pc;
int referenced = 0;
if (TestClearPageReferenced(page))
referenced++;
if (PageDirect(page)) {
pte_t *pte = rmap_ptep_map(page->pte.direct);
if (ptep_test_and_clear_young(pte))
referenced++;
rmap_ptep_unmap(pte);
} else {
int nr_chains = 0;	/* despite the name, counts mapped ptes */
/* Check all the page tables mapping this page. */
for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
int i;
for (i = NRPTE-1; i >= 0; i--) {
pte_addr_t pte_paddr = pc->ptes[i];
pte_t *p;
if (!pte_paddr)
break;
p = rmap_ptep_map(pte_paddr);
if (ptep_test_and_clear_young(p))
referenced++;
rmap_ptep_unmap(p);
nr_chains++;
}
}
if (nr_chains == 1) {
pc = page->pte.chain;
page->pte.direct = pc->ptes[NRPTE-1];
SetPageDirect(page);
pc->ptes[NRPTE-1] = 0;
__pte_chain_free(pc);
}
}
return referenced;
}
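/*
 * Sketch of the expected calling convention (the pageout code does
 * roughly this; illustrative only):
 *
 *	pte_chain_lock(page);
 *	referenced = page_referenced(page);
 *	pte_chain_unlock(page);
 */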
/**
* page_add_rmap - add reverse mapping entry to a page
* @page: the page to add the mapping to
* @ptep: the page table entry mapping this page
*
* Add a new pte reverse mapping to a page.
* The caller needs to hold the mm->page_table_lock.
*/
struct pte_chain *
page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
{
pte_addr_t pte_paddr = ptep_to_paddr(ptep);
struct pte_chain *cur_pte_chain;
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return pte_chain;
pte_chain_lock(page);
if (page->pte.direct == 0) {
page->pte.direct = pte_paddr;
SetPageDirect(page);
inc_page_state(nr_mapped);
goto out;
}
if (PageDirect(page)) {
/* Convert a direct pointer into a pte_chain */
ClearPageDirect(page);
pte_chain->ptes[NRPTE-1] = page->pte.direct;
pte_chain->ptes[NRPTE-2] = pte_paddr;
pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
page->pte.direct = 0;
page->pte.chain = pte_chain;
pte_chain = NULL; /* We consumed it */
goto out;
}
cur_pte_chain = page->pte.chain;
if (cur_pte_chain->ptes[0]) { /* It's full */
pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
NRPTE - 1);
page->pte.chain = pte_chain;
pte_chain->ptes[NRPTE-1] = pte_paddr;
pte_chain = NULL; /* We consumed it */
goto out;
}
cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
cur_pte_chain->next_and_idx--;
out:
pte_chain_unlock(page);
return pte_chain;
}
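/*
 * Sketch of a typical caller, modelled on the fault paths in
 * mm/memory.c (illustrative only): the pte_chain is allocated before
 * taking the page_table_lock, and whatever page_add_rmap() hands back
 * is released via the pte_chain_free() wrapper:
 *
 *	pte_chain = pte_chain_alloc(GFP_KERNEL);
 *	...
 *	spin_lock(&mm->page_table_lock);
 *	...
 *	pte_chain = page_add_rmap(page, ptep, pte_chain);
 *	spin_unlock(&mm->page_table_lock);
 *	pte_chain_free(pte_chain);
 */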
/**
* page_remove_rmap - take down reverse mapping to a page
* @page: page to remove mapping from
* @ptep: page table entry to remove
*
* Removes the reverse mapping from the pte_chain of the page,
* after that the caller can clear the page table entry and free
* the page.
* Caller needs to hold the mm->page_table_lock.
*/
void page_remove_rmap(struct page *page, pte_t *ptep)
{
pte_addr_t pte_paddr = ptep_to_paddr(ptep);
struct pte_chain *pc;
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;
pte_chain_lock(page);
if (!page_mapped(page))
goto out_unlock; /* remap_page_range() from a driver? */
if (PageDirect(page)) {
if (page->pte.direct == pte_paddr) {
page->pte.direct = 0;
ClearPageDirect(page);
goto out;
}
} else {
struct pte_chain *start = page->pte.chain;
struct pte_chain *next;
int victim_i = -1;
for (pc = start; pc; pc = next) {
int i;
next = pte_chain_next(pc);
if (next)
prefetch(next);
for (i = pte_chain_idx(pc); i < NRPTE; i++) {
pte_addr_t pa = pc->ptes[i];
if (victim_i == -1)
victim_i = i;
if (pa != pte_paddr)
continue;
pc->ptes[i] = start->ptes[victim_i];
start->ptes[victim_i] = 0;
if (victim_i == NRPTE-1) {
/* Emptied a pte_chain */
page->pte.chain = pte_chain_next(start);
__pte_chain_free(start);
} else {
start->next_and_idx++;
}
goto out;
}
}
}
out:
if (!page_mapped(page))
dec_page_state(nr_mapped);
out_unlock:
pte_chain_unlock(page);
return;
}
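/*
 * Per the comment above, a sketch of the teardown order (illustrative
 * only, not an actual call site):
 *
 *	page_remove_rmap(page, ptep);
 *	pte_clear(ptep);
 *	page_cache_release(page);
 */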
/**
* try_to_unmap_one - worker function for try_to_unmap
* @page: page to unmap
* @ptep: page table entry to unmap from page
*
* Internal helper function for try_to_unmap, called for each page
* table entry mapping a page. Because locking order here is opposite
* to the locking order used by the page fault path, we use trylocks.
* Locking:
* page lock shrink_list(), trylock
* pte_chain_lock shrink_list()
* mm->page_table_lock try_to_unmap_one(), trylock
*/
static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
{
pte_t *ptep = rmap_ptep_map(paddr);
unsigned long address = ptep_to_address(ptep);
struct mm_struct * mm = ptep_to_mm(ptep);
struct vm_area_struct * vma;
pte_t pte;
int ret;
if (!mm)
BUG();
/*
* We need the page_table_lock to protect us from page faults,
* munmap, fork, etc...
*/
if (!spin_trylock(&mm->page_table_lock)) {
rmap_ptep_unmap(ptep);
return SWAP_AGAIN;
}
/* During mremap, it's possible pages are not in a VMA. */
vma = find_vma(mm, address);
if (!vma) {
ret = SWAP_FAIL;
goto out_unlock;
}
/* The page is mlock()d, we cannot swap it out. */
if (vma->vm_flags & VM_LOCKED) {
ret = SWAP_FAIL;
goto out_unlock;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address);
pte = ptep_get_and_clear(ptep);
flush_tlb_page(vma, address);
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
swp_entry_t entry = { .val = page->index };
swap_duplicate(entry);
set_pte(ptep, swp_entry_to_pte(entry));
BUG_ON(pte_file(*ptep));
} else {
unsigned long pgidx;
/*
* If a nonlinear mapping then store the file page offset
* in the pte.
*/
pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
pgidx += vma->vm_pgoff;
pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
if (page->index != pgidx) {
set_pte(ptep, pgoff_to_pte(page->index));
BUG_ON(!pte_file(*ptep));
}
}
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pte))
set_page_dirty(page);
mm->rss--;
page_cache_release(page);
ret = SWAP_SUCCESS;
out_unlock:
rmap_ptep_unmap(ptep);
spin_unlock(&mm->page_table_lock);
return ret;
}
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock
* and its pte chain lock. Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_FAIL - the page is unswappable
*/
int try_to_unmap(struct page * page)
{
struct pte_chain *pc, *next_pc, *start;
int ret = SWAP_SUCCESS;
int victim_i = -1;
/* This page should not be on the pageout lists. */
if (PageReserved(page))
BUG();
if (!PageLocked(page))
BUG();
/* We need backing store to swap out a page. */
if (!page->mapping)
BUG();
if (PageDirect(page)) {
ret = try_to_unmap_one(page, page->pte.direct);
if (ret == SWAP_SUCCESS) {
page->pte.direct = 0;
ClearPageDirect(page);
}
goto out;
}
start = page->pte.chain;
for (pc = start; pc; pc = next_pc) {
int i;
next_pc = pte_chain_next(pc);
if (next_pc)
prefetch(next_pc);
for (i = pte_chain_idx(pc); i < NRPTE; i++) {
pte_addr_t pte_paddr = pc->ptes[i];
if (!pte_paddr)
continue;
if (victim_i == -1)
victim_i = i;
switch (try_to_unmap_one(page, pte_paddr)) {
case SWAP_SUCCESS:
/*
* Release a slot. If we're releasing the
* first pte in the first pte_chain then
* pc->ptes[i] and start->ptes[victim_i] both
* refer to the same thing. It works out.
*/
pc->ptes[i] = start->ptes[victim_i];
start->ptes[victim_i] = 0;
victim_i++;
if (victim_i == NRPTE) {
page->pte.chain = pte_chain_next(start);
__pte_chain_free(start);
start = page->pte.chain;
victim_i = 0;
} else {
start->next_and_idx++;
}
break;
case SWAP_AGAIN:
/* Skip this pte, remembering status. */
ret = SWAP_AGAIN;
continue;
case SWAP_FAIL:
ret = SWAP_FAIL;
goto out;
}
}
}
out:
if (!page_mapped(page))
dec_page_state(nr_mapped);
return ret;
}
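/*
 * Sketch of how a pageout loop might act on the return values above
 * (illustrative only, not the actual shrink_list() code):
 *
 *	switch (try_to_unmap(page)) {
 *	case SWAP_SUCCESS:
 *		(all mappings gone: write the page out)
 *	case SWAP_AGAIN:
 *		(missed a trylock: keep the page, retry later)
 *	case SWAP_FAIL:
 *		(mlocked or not in a VMA: keep the page mapped)
 *	}
 */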
/**
** No more VM stuff below this comment, only pte_chain helper
** functions.
**/
static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
{
struct pte_chain *pc = p;
memset(pc, 0, sizeof(*pc));
}
DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;
/**
* __pte_chain_free - free pte_chain structure
* @pte_chain: pte_chain struct to free
*/
void __pte_chain_free(struct pte_chain *pte_chain)
{
struct pte_chain **pte_chainp;
pte_chainp = &get_cpu_var(local_pte_chain);
/* Avoid dirtying the cacheline when next_and_idx is already zero. */
if (pte_chain->next_and_idx)
pte_chain->next_and_idx = 0;
if (*pte_chainp)
kmem_cache_free(pte_chain_cache, *pte_chainp);
*pte_chainp = pte_chain;
put_cpu_var(local_pte_chain);
}
/*
* pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
*
* The caller of page_add_rmap() must perform the allocation because
* page_add_rmap() is invariably called under spinlock. Often, page_add_rmap()
* will not actually use the pte_chain, because there is space available in one
* of the existing pte_chains which are attached to the page. So the case of
* allocating and then freeing a single pte_chain is specially optimised here,
* with a one-deep per-cpu cache.
*/
struct pte_chain *pte_chain_alloc(int gfp_flags)
{
struct pte_chain *ret;
struct pte_chain **pte_chainp;
might_sleep_if(gfp_flags & __GFP_WAIT);
pte_chainp = &get_cpu_var(local_pte_chain);
if (*pte_chainp) {
ret = *pte_chainp;
*pte_chainp = NULL;
put_cpu_var(local_pte_chain);
} else {
put_cpu_var(local_pte_chain);
ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
}
return ret;
}
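/*
 * Illustrative flow of the one-deep cache (not actual code): a fault
 * allocates a pte_chain, page_add_rmap() finds room in the existing
 * chain and hands the spare back, pte_chain_free() parks it in
 * local_pte_chain, and the next pte_chain_alloc() on this cpu reuses
 * it without touching the slab allocator.
 */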
void __init pte_chain_init(void)
{
pte_chain_cache = kmem_cache_create("pte_chain",
sizeof(struct pte_chain),
0,
SLAB_MUST_HWCACHE_ALIGN,
pte_chain_ctor,
NULL);
if (!pte_chain_cache)
panic("failed to create pte_chain cache!\n");
}