/*
* High memory handling common code and variables.
*
* (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
* Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
*
*
* Redesigned the x86 32-bit VM architecture to deal with
* 64-bit physical space. With current x86 CPUs this
* means up to 64 Gigabytes physical RAM.
*
* Rewrote high memory support to move the page cache into
* high memory. Implemented permanent (schedulable) kmaps
* based on Linus' idea.
*
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*
* Largely rewritten to get rid of all global locks
*
* Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#ifdef CONFIG_HIGHMEM
static int __set_page_address(struct page *page, void *virtual, int pos);
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
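/*
 * Count free pages in ZONE_HIGHMEM on every node, plus ZONE_MOVABLE
 * where that zone is backed by highmem.
 */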
unsigned int nr_free_highpages(void)
{
pg_data_t *pgdat;
unsigned int pages = 0;
for_each_online_pgdat(pgdat) {
pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
NR_FREE_PAGES);
if (zone_movable_is_highmem())
pages += zone_page_state(
&pgdat->node_zones[ZONE_MOVABLE],
NR_FREE_PAGES);
}
return pages;
}
/*
 * count is not a pure "count":
 * 0 means it's owned exclusively by someone,
 * 1 means it's free for use - either mapped or not,
 * n means there are (n-1) current users of it.
 */
struct pkmap_state {
atomic_t count;
int pfn;
};
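/*
 * pkmap       - per-slot state of the persistent kmap area
 * pkmap_hand  - clock hand: the next slot to consider for reuse
 * pkmap_free  - number of slots currently in state 1 (free for use)
 * pkmap_users - kmap user accounting, see kmap_account()
 * pkmap_wait  - waiters for a free slot or for accounting headroom
 */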
static struct pkmap_state pkmap[LAST_PKMAP];
static atomic_t pkmap_hand;
static atomic_t pkmap_free;
static atomic_t pkmap_users;
pte_t *pkmap_page_table;
static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait);
/*
 * Most architectures have no use for kmap_high_get(), so let's abstract
 * the IRQ disabling out of the locking in that case to avoid potentially
 * useless overhead.
 */
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
/* Referenced by the lock_kmap*() helpers; only kmap_high_get() takes it. */
static DEFINE_SPINLOCK(kmap_lock);
#define lock_kmap() spin_lock_irq(&kmap_lock)
#define unlock_kmap() spin_unlock_irq(&kmap_lock)
#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags)
#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags)
#else
#define lock_kmap() spin_lock(&kmap_lock)
#define unlock_kmap() spin_unlock(&kmap_lock)
#define lock_kmap_any(flags) \
do { spin_lock(&kmap_lock); (void)(flags); } while (0)
#define unlock_kmap_any(flags) \
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
/*
* Try to free a given kmap slot.
*
* Returns:
* -1 - in use
* 0 - free, no TLB flush needed
* 1 - free, needs TLB flush
*/
static int pkmap_try_free(int pos)
{
if (atomic_cmpxchg(&pkmap[pos].count, 1, 0) != 1)
return -1;
atomic_dec(&pkmap_free);
/*
* TODO: add a young bit to make it CLOCK
*/
if (!pte_none(pkmap_page_table[pos])) {
unsigned long addr = PKMAP_ADDR(pos);
pte_t *ptep = &pkmap_page_table[pos];
if (!pkmap[pos].pfn) {
struct page *page = pte_page(pkmap_page_table[pos]);
VM_BUG_ON(addr != (unsigned long)page_address(page));
if (!__set_page_address(page, NULL, pos))
BUG();
flush_kernel_dcache_page(page);
}
pte_clear(&init_mm, addr, ptep);
return 1;
}
return 0;
}
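/*
 * Drop a reference on a pkmap slot. The last user (2 -> 1) marks the
 * slot free again and wakes any waiters; decrementing an already-free
 * slot (1 -> 0) is a bug.
 */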
static inline void pkmap_put(atomic_t *counter)
{
switch (atomic_dec_return(counter)) {
case 0:
BUG();
case 1:
atomic_inc(&pkmap_free);
wake_up(&pkmap_wait);
}
}
#define TLB_BATCH 32
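/*
 * Find a free pkmap slot and take exclusive ownership of it (count 0),
 * sleeping until one becomes available if need be. On architectures
 * that flush the whole TLB anyway, reclaim up to TLB_BATCH more slots
 * per flush to amortise its cost.
 */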
static int pkmap_get_free(void)
{
int i, pos, flush;
restart:
for (i = 0; i < LAST_PKMAP; i++) {
pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK;
flush = pkmap_try_free(pos);
if (flush >= 0)
goto got_one;
}
atomic_dec(&pkmap_free);
/*
* wait for somebody else to unmap their entries
*/
if (likely(!in_interrupt()))
wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0);
goto restart;
got_one:
if (flush) {
#if 0
flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
#else
int pos2 = (pos + 1) & LAST_PKMAP_MASK;
int nr;
int entries[TLB_BATCH];
/*
* For those architectures that cannot help but flush the
* whole TLB, flush some more entries to make it worthwhile.
* Scan ahead of the hand to minimise search distances.
*/
for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) {
flush = pkmap_try_free(pos2);
if (flush < 0)
continue;
if (!flush) {
atomic_t *counter = &pkmap[pos2].count;
VM_BUG_ON(atomic_read(counter) != 0);
atomic_set(counter, 2);
pkmap_put(counter);
} else
entries[nr++] = pos2;
}
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
for (i = 0; i < nr; i++) {
atomic_t *counter = &pkmap[entries[i]].count;
VM_BUG_ON(atomic_read(counter) != 0);
atomic_set(counter, 2);
pkmap_put(counter);
}
#endif
}
return pos;
}
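/*
 * Install a pte for @pfn with protection @prot in a free pkmap slot and
 * return the new virtual address, or 0 if a concurrent pkmap_insert()
 * for the same page won the race.
 */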
static unsigned long pkmap_insert(unsigned long pfn, pgprot_t prot)
{
int pos = pkmap_get_free();
unsigned long vaddr = PKMAP_ADDR(pos);
pte_t *ptep = &pkmap_page_table[pos];
pte_t entry = pfn_pte(pfn, prot);
atomic_t *counter = &pkmap[pos].count;
VM_BUG_ON(atomic_read(counter) != 0);
set_pte_at(&init_mm, vaddr, ptep, entry);
	/*
	 * Note whether this is a "pure pfn" mapping (non-default prot or
	 * no backing struct page) - those get no page_address() entry.
	 */
	pkmap[pos].pfn =
		!(pgprot_val(prot) == pgprot_val(kmap_prot) && pfn_valid(pfn));
	if (!pkmap[pos].pfn) {
struct page *page = pfn_to_page(pfn);
if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
/*
* concurrent pkmap_inserts for this page -
* the other won the race, release this entry.
*
* we can still clear the pte without a tlb flush since
* it couldn't have been used yet.
*/
pte_clear(&init_mm, vaddr, ptep);
VM_BUG_ON(atomic_read(counter) != 0);
atomic_set(counter, 2);
pkmap_put(counter);
return 0;
}
} else {
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
		/*
		 * Non-default prot and pure pfn mappings get neither map
		 * deduplication nor a working page_address(), which also
		 * makes them incompatible with ARCH_NEEDS_KMAP_HIGH_GET.
		 */
		BUG();
#endif
}
atomic_set(counter, 2);
return vaddr;
}
/*
 * Flush all unused kmap mappings in order to remove stray mappings.
 * Not implemented for the lockless pkmap; warn if anyone calls it.
 */
void kmap_flush_unused(void)
{
WARN_ON_ONCE(1);
}
/*
* Avoid starvation deadlock by limiting the number of tasks that can obtain a
* kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2.
*/
static void kmap_account(void)
{
int weight;
#ifndef CONFIG_PREEMPT_RT
if (in_interrupt()) {
/* irqs can always get them */
weight = -1;
} else
#endif
if (current->flags & PF_KMAP) {
current->flags &= ~PF_KMAP;
/* we already accounted the second */
weight = 0;
} else {
/* mark 1, account 2 */
current->flags |= PF_KMAP;
weight = 2;
}
if (weight > 0) {
/*
* reserve KM_TYPE_NR maps per CPU for interrupt context
*/
const int target = LAST_PKMAP
#ifndef CONFIG_PREEMPT_RT
- KM_TYPE_NR*NR_CPUS
#endif
;
again:
wait_event(pkmap_wait,
atomic_read(&pkmap_users) + weight <= target);
if (atomic_add_return(weight, &pkmap_users) > target) {
atomic_sub(weight, &pkmap_users);
goto again;
}
}
}
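/*
 * Undo kmap_account(). If PF_KMAP is still set only one kmap happened,
 * so both accounted slots are released at once; otherwise each kunmap
 * releases one.
 */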
static void kunmap_account(void)
{
int weight;
#ifndef CONFIG_PREEMPT_RT
if (in_irq()) {
weight = -1;
} else
#endif
if (current->flags & PF_KMAP) {
/* there was only 1 kmap, un-account both */
current->flags &= ~PF_KMAP;
weight = 2;
} else {
/* there were two kmaps, un-account per kunmap */
weight = 1;
}
if (weight > 0)
atomic_sub(weight, &pkmap_users);
wake_up(&pkmap_wait);
}
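/*
 * Pin an existing mapping of @page and return its virtual address, or
 * NULL if the page is not currently mapped. The retry loop handles the
 * slot being freed and reused between the page_address() lookup and the
 * atomic_inc_not_zero().
 */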
void *kmap_get(struct page *page)
{
unsigned long vaddr;
again:
vaddr = (unsigned long)page_address(page);
if (vaddr) {
atomic_t *counter = &pkmap[PKMAP_NR(vaddr)].count;
if (atomic_inc_not_zero(counter)) {
/*
* atomic_inc_not_zero implies a (memory) barrier on
* success so page address will be reloaded.
*/
unsigned long vaddr2 = (unsigned long)page_address(page);
if (likely(vaddr == vaddr2))
return (void *)vaddr;
/*
* Oops, we got someone else.
*
* This can happen if we get preempted after
* page_address() and before atomic_inc_not_zero()
* and during that preemption this slot is freed and
* reused.
*/
pkmap_put(counter);
}
goto again;
}
return (void *)vaddr;
}
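/**
 * kmap_high - map a highmem page into memory
 * @page: &struct page to map
 *
 * Returns the page's virtual memory address. May sleep waiting for a
 * free pkmap slot, and retries until the mapping succeeds.
 */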
void *kmap_high(struct page *page)
{
unsigned long vaddr;
kmap_account();
again:
vaddr = (unsigned long)kmap_get(page);
if (!vaddr) {
vaddr = pkmap_insert(page_to_pfn(page), kmap_prot);
if (!vaddr)
goto again;
}
return (void *)vaddr;
}
EXPORT_SYMBOL(kmap_high);
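/*
 * Map an arbitrary @pfn with protection @prot. Ordinary highmem pages
 * with the default protection are handed to kmap_high() so they benefit
 * from mapping deduplication; everything else gets a dedicated slot.
 */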
void *kmap_pfn_prot(unsigned long pfn, pgprot_t prot)
{
unsigned long vaddr;
if (pgprot_val(prot) == pgprot_val(kmap_prot) &&
pfn_valid(pfn) && PageHighMem(pfn_to_page(pfn)))
return kmap_high(pfn_to_page(pfn));
kmap_account();
vaddr = pkmap_insert(pfn, prot);
BUG_ON(!vaddr);
return (void *)vaddr;
}
EXPORT_SYMBOL(kmap_pfn_prot);
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
/**
* kmap_high_get - pin a highmem page into memory
* @page: &struct page to pin
*
 * Returns the page's current virtual memory address, or NULL if no mapping
 * exists. If and only if a non-NULL address is returned, a matching call
 * to kunmap_high() is necessary.
*
* This can be called from any context.
*/
void *kmap_high_get(struct page *page)
{
unsigned long vaddr, flags;
lock_kmap_any(flags);
vaddr = (unsigned long)kmap_get(page);
unlock_kmap_any(flags);
return (void *)vaddr;
}
#endif
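/*
 * Unmap a virtual address obtained from kmap() or kmap_pfn_prot().
 * Addresses outside the pkmap area are silently ignored, so lowmem
 * addresses may be passed as well.
 */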
void kunmap_virt(void *ptr)
{
unsigned long vaddr = (unsigned long)ptr;
if (vaddr < PKMAP_ADDR(0) || vaddr >= PKMAP_ADDR(LAST_PKMAP))
return;
pkmap_put(&pkmap[PKMAP_NR(vaddr)].count);
kunmap_account();
}
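/**
 * kunmap_high - unmap a highmem page
 * @page: &struct page to unmap
 *
 * Drops a reference on the page's pkmap slot; the final kunmap makes
 * the slot available for reuse.
 */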
void kunmap_high(struct page *page)
{
unsigned long vaddr = (unsigned long)page_address(page);
BUG_ON(!vaddr);
pkmap_put(&pkmap[PKMAP_NR(vaddr)].count);
kunmap_account();
}
EXPORT_SYMBOL(kunmap_high);
#endif /* CONFIG_HIGHMEM */
#if defined(HASHED_PAGE_VIRTUAL)
#define PA_HASH_ORDER 7
/*
* Describes one page->virtual address association.
*/
static struct page_address_map {
struct page *page;
void *virtual;
struct list_head list;
} page_address_maps[LAST_PKMAP];
/*
* Hash table bucket
*/
static struct page_address_slot {
struct list_head lh; /* List of page_address_maps */
spinlock_t lock; /* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
static struct page_address_slot *page_slot(struct page *page)
{
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
/*
 * Search one hash bucket for @page's mapping. The caller must hold
 * pas->lock.
 */
static void *__page_address(struct page_address_slot *pas, struct page *page)
{
void *ret = NULL;
if (!list_empty(&pas->lh)) {
struct page_address_map *pam;
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
ret = pam->virtual;
break;
}
}
}
return ret;
}
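/**
 * page_address - get the mapped virtual address of a page
 * @page: &struct page to get the virtual address of
 *
 * Returns the page's virtual address, or NULL if the page is in highmem
 * and not currently mapped.
 */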
void *page_address(struct page *page)
{
unsigned long flags;
void *ret;
struct page_address_slot *pas;
if (!PageHighMem(page))
return lowmem_page_address(page);
pas = page_slot(page);
spin_lock_irqsave(&pas->lock, flags);
ret = __page_address(pas, page);
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
EXPORT_SYMBOL(page_address);
/**
 * __set_page_address - set a page's virtual address
 * @page: &struct page to set
 * @virtual: virtual address to use, or NULL to remove the mapping
 * @pos: pkmap slot index backing the mapping
 *
 * Returns 1 if a mapping was added or removed, 0 otherwise.
 */
static int __set_page_address(struct page *page, void *virtual, int pos)
{
int ret = 0;
unsigned long flags;
struct page_address_slot *pas;
struct page_address_map *pam;
VM_BUG_ON(!PageHighMem(page));
VM_BUG_ON(atomic_read(&pkmap[pos].count) != 0);
VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
pas = page_slot(page);
pam = &page_address_maps[pos];
spin_lock_irqsave(&pas->lock, flags);
if (virtual) { /* add */
VM_BUG_ON(!list_empty(&pam->list));
if (!__page_address(pas, page)) {
pam->page = page;
pam->virtual = virtual;
list_add_tail(&pam->list, &pas->lh);
ret = 1;
}
} else { /* remove */
if (!list_empty(&pam->list)) {
list_del_init(&pam->list);
ret = 1;
}
}
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
int set_page_address(struct page *page, void *virtual)
{
/*
* set_page_address is not supposed to be called when using
* hashed virtual addresses.
*/
BUG();
return 0;
}
void __init __page_address_init(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
INIT_LIST_HEAD(&page_address_maps[i].list);
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
}
#elif defined(CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
static int __set_page_address(struct page *page, void *virtual, int pos)
{
return set_page_address(page, virtual);
}
#endif /* HASHED_PAGE_VIRTUAL */
#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
void __init page_address_init(void)
{
#ifdef CONFIG_HIGHMEM
int i;
for (i = 0; i < ARRAY_SIZE(pkmap); i++)
atomic_set(&pkmap[i].count, 1);
atomic_set(&pkmap_hand, 0);
atomic_set(&pkmap_free, LAST_PKMAP);
atomic_set(&pkmap_users, 0);
#endif
#ifdef HASHED_PAGE_VIRTUAL
__page_address_init();
#endif
}
#endif /* CONFIG_HIGHMEM || HASHED_PAGE_VIRTUAL */
#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
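/*
 * Sanity-check that the km_type passed to kmap_atomic() matches the
 * calling context (hardirq, softirq, NMI), warning a limited number of
 * times before going quiet.
 */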
void debug_kmap_atomic(enum km_type type)
{
static int warn_count = 10;
if (unlikely(warn_count < 0))
return;
if (unlikely(in_interrupt())) {
if (in_nmi()) {
if (type != KM_NMI && type != KM_NMI_PTE) {
WARN_ON(1);
warn_count--;
}
} else if (in_irq()) {
if (type != KM_IRQ0 && type != KM_IRQ1 &&
type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
WARN_ON(1);
warn_count--;
}
} else if (!irqs_disabled()) { /* softirq */
if (type != KM_IRQ0 && type != KM_IRQ1 &&
type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
type != KM_SKB_SUNRPC_DATA &&
type != KM_SKB_DATA_SOFTIRQ &&
type != KM_BOUNCE_READ) {
WARN_ON(1);
warn_count--;
}
}
}
	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
			type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
			type == KM_IRQ_PTE || type == KM_NMI ||
			type == KM_NMI_PTE) {
if (!irqs_disabled()) {
WARN_ON(1);
warn_count--;
}
} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
if (irq_count() == 0 && !irqs_disabled()) {
WARN_ON(1);
warn_count--;
}
}
}
#endif