| /* |
| * linux/mm/page_alloc.c |
| * |
 * Manages the free list; this is where the system allocates free pages.
 * Note that kmalloc() lives in slab.c
| * |
| * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| * Swap reorganised 29.12.95, Stephen Tweedie |
| * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
| * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
| * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
| * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
| */ |
| |
| #include <linux/config.h> |
| #include <linux/mm.h> |
| #include <linux/swap.h> |
| #include <linux/swapctl.h> |
| #include <linux/interrupt.h> |
| #include <linux/pagemap.h> |
| #include <linux/bootmem.h> |
| #include <linux/slab.h> |
| #include <linux/module.h> |
| |
| int nr_swap_pages; |
| int nr_active_pages; |
| int nr_inactive_pages; |
| LIST_HEAD(inactive_list); |
| LIST_HEAD(active_list); |
| pg_data_t *pgdat_list; |
| |
| /* |
| * |
| * The zone_table array is used to look up the address of the |
| * struct zone corresponding to a given zone number (ZONE_DMA, |
| * ZONE_NORMAL, or ZONE_HIGHMEM). |
| */ |
| zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; |
| EXPORT_SYMBOL(zone_table); |
| |
| static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; |
| static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; |
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };
| static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; |
| |
| int vm_gfp_debug = 0; |
| |
| /* |
| * Temporary debugging check. |
| */ |
| #define BAD_RANGE(zone, page) \ |
| ( \ |
| (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ |
| || (((page) - mem_map) < (zone)->zone_start_mapnr) \ |
| || ((zone) != page_zone(page)) \ |
| ) |
| |
| /* |
| * Freeing function for a buddy system allocator. |
| * Contrary to prior comments, this is *NOT* hairy, and there |
| * is no reason for anyone not to understand it. |
| * |
| * The concept of a buddy system is to maintain direct-mapped tables |
| * (containing bit values) for memory blocks of various "orders". |
| * The bottom level table contains the map for the smallest allocatable |
| * units of memory (here, pages), and each level above it describes |
| * pairs of units from the levels below, hence, "buddies". |
| * At a high level, all that happens here is marking the table entry |
| * at the bottom level available, and propagating the changes upward |
| * as necessary, plus some accounting needed to play nicely with other |
| * parts of the VM system. |
| * At each level, we keep one bit for each pair of blocks, which |
| * is set to 1 iff only one of the pair is allocated. So when we |
| * are allocating or freeing one, we can derive the state of the |
| * other. That is, if we allocate a small block, and both were |
| * free, the remainder of the region must be split into blocks. |
| * If a block is freed, and its buddy is also free, then this |
| * triggers coalescing into a block of larger size. |
| * |
| * -- wli |
| */ |
| |
| static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); |
| static void __free_pages_ok (struct page *page, unsigned int order) |
| { |
| unsigned long index, page_idx, mask, flags; |
| free_area_t *area; |
| struct page *base; |
| zone_t *zone; |
| |
| arch_free_page(page, order); |
| /* |
| * Yes, think what happens when other parts of the kernel take |
| * a reference to a page in order to pin it for io. -ben |
| */ |
| if (PageLRU(page)) { |
| if (unlikely(in_interrupt())) |
| BUG(); |
| lru_cache_del(page); |
| } |
| |
| if (page->buffers) |
| BUG(); |
| if (page->mapping) |
| BUG(); |
| if (!VALID_PAGE(page)) |
| BUG(); |
| if (PageLocked(page)) |
| BUG(); |
| if (PageActive(page)) |
| BUG(); |
| ClearPageReferenced(page); |
| ClearPageDirty(page); |
| |
| if (current->flags & PF_FREE_PAGES) |
| goto local_freelist; |
| back_local_freelist: |
| |
| zone = page_zone(page); |
| |
| mask = (~0UL) << order; |
| base = zone->zone_mem_map; |
| page_idx = page - base; |
| if (page_idx & ~mask) |
| BUG(); |
| index = page_idx >> (1 + order); |
| |
| area = zone->free_area + order; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| |
| zone->free_pages -= mask; |
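
	/*
	 * mask is -(1UL << order) in two's complement, so the
	 * subtraction above adds 2^order pages.  The loop below runs
	 * until mask reaches -(1UL << (MAX_ORDER-1)): merging stops
	 * once a block of the largest order has been formed.
	 */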
| |
| while (mask + (1 << (MAX_ORDER-1))) { |
| struct page *buddy1, *buddy2; |
| |
| if (area >= zone->free_area + MAX_ORDER) |
| BUG(); |
| if (!__test_and_change_bit(index, area->map)) |
| /* |
| * the buddy page is still allocated. |
| */ |
| break; |
| /* |
| * Move the buddy up one level. |
| * This code is taking advantage of the identity: |
| * -mask = 1+~mask |
| */ |
| buddy1 = base + (page_idx ^ -mask); |
| buddy2 = base + page_idx; |
| if (BAD_RANGE(zone,buddy1)) |
| BUG(); |
| if (BAD_RANGE(zone,buddy2)) |
| BUG(); |
| |
| list_del(&buddy1->list); |
| mask <<= 1; |
| area++; |
| index >>= 1; |
| page_idx &= mask; |
| } |
| list_add(&(base + page_idx)->list, &area->free_list); |
| |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return; |
| |
| local_freelist: |
| if (current->nr_local_pages) |
| goto back_local_freelist; |
| if (in_interrupt()) |
| goto back_local_freelist; |
| |
| list_add(&page->list, ¤t->local_pages); |
| page->index = order; |
| current->nr_local_pages++; |
| } |
| |
| #define MARK_USED(index, order, area) \ |
| __change_bit((index) >> (1+(order)), (area)->map) |
| |
| static inline struct page * expand (zone_t *zone, struct page *page, |
| unsigned long index, int low, int high, free_area_t * area) |
| { |
| unsigned long size = 1 << high; |
| |
| while (high > low) { |
| if (BAD_RANGE(zone,page)) |
| BUG(); |
| area--; |
| high--; |
| size >>= 1; |
| list_add(&(page)->list, &(area)->free_list); |
| MARK_USED(index, high, area); |
| index += size; |
| page += size; |
| } |
| if (BAD_RANGE(zone,page)) |
| BUG(); |
| return page; |
| } |
| |
| static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); |
| static struct page * rmqueue(zone_t *zone, unsigned int order) |
| { |
| free_area_t * area = zone->free_area + order; |
| unsigned int curr_order = order; |
| struct list_head *head, *curr; |
| unsigned long flags; |
| struct page *page; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| do { |
| head = &area->free_list; |
| curr = head->next; |
| |
| if (curr != head) { |
| unsigned int index; |
| |
| page = list_entry(curr, struct page, list); |
| if (BAD_RANGE(zone,page)) |
| BUG(); |
| list_del(curr); |
| index = page - zone->zone_mem_map; |
| if (curr_order != MAX_ORDER-1) |
| MARK_USED(index, curr_order, area); |
| zone->free_pages -= 1UL << order; |
| |
| page = expand(zone, page, index, order, curr_order, area); |
| spin_unlock_irqrestore(&zone->lock, flags); |
| |
| set_page_count(page, 1); |
| if (BAD_RANGE(zone,page)) |
| BUG(); |
| if (PageLRU(page)) |
| BUG(); |
| if (PageActive(page)) |
| BUG(); |
| return page; |
| } |
| curr_order++; |
| area++; |
| } while (curr_order < MAX_ORDER); |
| spin_unlock_irqrestore(&zone->lock, flags); |
| |
| return NULL; |
| } |
| |
| #ifndef CONFIG_DISCONTIGMEM |
| struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) |
| { |
| return __alloc_pages(gfp_mask, order, |
| contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); |
| } |
| #endif |
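
/*
 * balance_classzone() performs synchronous reclaim on behalf of the
 * caller.  PF_FREE_PAGES makes __free_pages_ok() divert pages freed
 * during try_to_free_pages_zone() onto current->local_pages; we then
 * try to pick one page of the right order and classzone from that
 * list before releasing the remainder back to the buddy allocator.
 */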
| |
| static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); |
| static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) |
| { |
| struct page * page = NULL; |
| int __freed; |
| |
| if (in_interrupt()) |
| BUG(); |
| |
| current->allocation_order = order; |
| current->flags |= PF_MEMALLOC | PF_FREE_PAGES; |
| |
| __freed = try_to_free_pages_zone(classzone, gfp_mask); |
| |
| current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); |
| |
| if (current->nr_local_pages) { |
| struct list_head * entry, * local_pages; |
| struct page * tmp; |
| int nr_pages; |
| |
| local_pages = ¤t->local_pages; |
| |
| if (likely(__freed)) { |
| /* pick from the last inserted so we're lifo */ |
| entry = local_pages->next; |
| do { |
| tmp = list_entry(entry, struct page, list); |
| if (tmp->index == order && memclass(page_zone(tmp), classzone)) { |
| list_del(entry); |
| current->nr_local_pages--; |
| set_page_count(tmp, 1); |
| page = tmp; |
| |
| if (page->buffers) |
| BUG(); |
| if (page->mapping) |
| BUG(); |
| if (!VALID_PAGE(page)) |
| BUG(); |
| if (PageLocked(page)) |
| BUG(); |
| if (PageLRU(page)) |
| BUG(); |
| if (PageActive(page)) |
| BUG(); |
| if (PageDirty(page)) |
| BUG(); |
| |
| break; |
| } |
| } while ((entry = entry->next) != local_pages); |
| } |
| |
| nr_pages = current->nr_local_pages; |
| /* free in reverse order so that the global order will be lifo */ |
| while ((entry = local_pages->prev) != local_pages) { |
| list_del(entry); |
| tmp = list_entry(entry, struct page, list); |
| __free_pages_ok(tmp, tmp->index); |
| if (!nr_pages--) |
| BUG(); |
| } |
| current->nr_local_pages = 0; |
| } |
| |
| *freed = __freed; |
| return page; |
| } |
| |
| static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) |
| { |
| long free = zone->free_pages - (1UL << order); |
| return free >= 0 ? free : 0; |
| } |
| |
| /* |
| * This is the 'heart' of the zoned buddy allocator: |
| */ |
| struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) |
| { |
| zone_t **zone, * classzone; |
| struct page * page; |
| int freed, class_idx; |
| |
| zone = zonelist->zones; |
| classzone = *zone; |
| class_idx = zone_idx(classzone); |
| |
| for (;;) { |
| zone_t *z = *(zone++); |
| if (!z) |
| break; |
| |
| if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { |
| page = rmqueue(z, order); |
| if (page) |
| return page; |
| } |
| } |
| |
| classzone->need_balance = 1; |
| mb(); |
| if (waitqueue_active(&kswapd_wait)) |
| wake_up_interruptible(&kswapd_wait); |
| |
| zone = zonelist->zones; |
| for (;;) { |
| unsigned long min; |
| zone_t *z = *(zone++); |
| if (!z) |
| break; |
| |
| min = z->watermarks[class_idx].min; |
| if (!(gfp_mask & __GFP_WAIT)) |
| min >>= 2; |
| if (zone_free_pages(z, order) > min) { |
| page = rmqueue(z, order); |
| if (page) |
| return page; |
| } |
| } |
| |
	/* here we're in the low-on-memory slow path */
| |
| if (current->flags & PF_MEMALLOC && !in_interrupt()) { |
| zone = zonelist->zones; |
| for (;;) { |
| zone_t *z = *(zone++); |
| if (!z) |
| break; |
| |
| page = rmqueue(z, order); |
| if (page) |
| return page; |
| } |
| return NULL; |
| } |
| |
| /* Atomic allocations - we can't balance anything */ |
| if (!(gfp_mask & __GFP_WAIT)) |
| goto out; |
| |
| rebalance: |
| page = balance_classzone(classzone, gfp_mask, order, &freed); |
| if (page) |
| return page; |
| |
| zone = zonelist->zones; |
| if (likely(freed)) { |
| for (;;) { |
| zone_t *z = *(zone++); |
| if (!z) |
| break; |
| |
| if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { |
| page = rmqueue(z, order); |
| if (page) |
| return page; |
| } |
| } |
| goto rebalance; |
| } else { |
| /* |
| * Check that no other task is been killed meanwhile, |
| * in such a case we can succeed the allocation. |
| */ |
| for (;;) { |
| zone_t *z = *(zone++); |
| if (!z) |
| break; |
| |
| if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { |
| page = rmqueue(z, order); |
| if (page) |
| return page; |
| } |
| } |
| } |
| |
| out: |
| printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", |
| order, gfp_mask, !!(current->flags & PF_MEMALLOC)); |
| if (unlikely(vm_gfp_debug)) |
| dump_stack(); |
| return NULL; |
| } |
| |
| /* |
| * Common helper functions. |
| */ |
| unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) |
| { |
| struct page * page; |
| |
| page = alloc_pages(gfp_mask, order); |
| if (!page) |
| return 0; |
| return (unsigned long) page_address(page); |
| } |
| |
| unsigned long get_zeroed_page(unsigned int gfp_mask) |
| { |
| struct page * page; |
| |
| page = alloc_pages(gfp_mask, 0); |
| if (page) { |
| void *address = page_address(page); |
| clear_page(address); |
| return (unsigned long) address; |
| } |
| return 0; |
| } |
| |
| void __free_pages(struct page *page, unsigned int order) |
| { |
| if (!PageReserved(page) && put_page_testzero(page)) |
| __free_pages_ok(page, order); |
| } |
| |
| void free_pages(unsigned long addr, unsigned int order) |
| { |
| if (addr != 0) |
| __free_pages(virt_to_page(addr), order); |
| } |
| |
| /* |
| * Total amount of free (allocatable) RAM: |
| */ |
| unsigned int nr_free_pages (void) |
| { |
| unsigned int sum = 0; |
| zone_t *zone; |
| |
| for_each_zone(zone) |
| sum += zone->free_pages; |
| |
| return sum; |
| } |
| |
| /* |
| * Amount of free RAM allocatable as buffer memory: |
| */ |
| unsigned int nr_free_buffer_pages (void) |
| { |
| pg_data_t *pgdat; |
| unsigned int sum = 0; |
| zonelist_t *zonelist; |
| zone_t **zonep, *zone; |
| |
| for_each_pgdat(pgdat) { |
| int class_idx; |
| zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); |
| zonep = zonelist->zones; |
| zone = *zonep; |
| class_idx = zone_idx(zone); |
| |
| sum += zone->nr_cache_pages; |
| for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { |
| int free = zone->free_pages - zone->watermarks[class_idx].high; |
| if (free <= 0) |
| continue; |
| sum += free; |
| } |
| } |
| |
| return sum; |
| } |
| |
#ifdef CONFIG_HIGHMEM
| unsigned int nr_free_highpages (void) |
| { |
| pg_data_t *pgdat; |
| unsigned int pages = 0; |
| |
| for_each_pgdat(pgdat) |
| pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; |
| |
| return pages; |
| } |
| #endif |
| |
| #define K(x) ((x) << (PAGE_SHIFT-10)) |
| |
| /* |
| * Show free area list (used inside shift_scroll-lock stuff) |
| * We also calculate the percentage fragmentation. We do this by counting the |
| * memory on each free list with the exception of the first item on the list. |
| */ |
| void show_free_areas_core(pg_data_t *pgdat) |
| { |
| unsigned int order; |
| unsigned type; |
| pg_data_t *tmpdat = pgdat; |
| |
| printk("Free pages: %6dkB (%6dkB HighMem)\n", |
| K(nr_free_pages()), |
| K(nr_free_highpages())); |
| |
| while (tmpdat) { |
| zone_t *zone; |
| for (zone = tmpdat->node_zones; |
| zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) |
| printk("Zone:%s freepages:%6lukB\n", |
| zone->name, |
| K(zone->free_pages)); |
| |
| tmpdat = tmpdat->node_next; |
| } |
| |
| printk("( Active: %d, inactive: %d, free: %d )\n", |
| nr_active_pages, |
| nr_inactive_pages, |
| nr_free_pages()); |
| |
| for (type = 0; type < MAX_NR_ZONES; type++) { |
| struct list_head *head, *curr; |
| zone_t *zone = pgdat->node_zones + type; |
| unsigned long nr, total, flags; |
| |
| total = 0; |
| if (zone->size) { |
| spin_lock_irqsave(&zone->lock, flags); |
| for (order = 0; order < MAX_ORDER; order++) { |
| head = &(zone->free_area + order)->free_list; |
| curr = head; |
| nr = 0; |
| for (;;) { |
| if ((curr = curr->next) == head) |
| break; |
| nr++; |
| } |
| total += nr * (1 << order); |
| printk("%lu*%lukB ", nr, K(1UL) << order); |
| } |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| printk("= %lukB)\n", K(total)); |
| } |
| |
| #ifdef SWAP_CACHE_INFO |
| show_swap_cache_info(); |
| #endif |
| } |
| |
| void show_free_areas(void) |
| { |
| show_free_areas_core(pgdat_list); |
| } |
| |
| /* |
| * Builds allocation fallback zone lists. |
| */ |
| static inline void build_zonelists(pg_data_t *pgdat) |
| { |
| int i, j, k; |
| |
| for (i = 0; i <= GFP_ZONEMASK; i++) { |
| zonelist_t *zonelist; |
| zone_t *zone; |
| |
| zonelist = pgdat->node_zonelists + i; |
| memset(zonelist, 0, sizeof(*zonelist)); |
| |
| j = 0; |
| k = ZONE_NORMAL; |
| if (i & __GFP_HIGHMEM) |
| k = ZONE_HIGHMEM; |
| if (i & __GFP_DMA) |
| k = ZONE_DMA; |
| |
| switch (k) { |
| default: |
| BUG(); |
| /* |
| * fallthrough: |
| */ |
| case ZONE_HIGHMEM: |
| zone = pgdat->node_zones + ZONE_HIGHMEM; |
| if (zone->size) { |
| #ifndef CONFIG_HIGHMEM |
| BUG(); |
| #endif |
| zonelist->zones[j++] = zone; |
| } |
| case ZONE_NORMAL: |
| zone = pgdat->node_zones + ZONE_NORMAL; |
| if (zone->size) |
| zonelist->zones[j++] = zone; |
| case ZONE_DMA: |
| zone = pgdat->node_zones + ZONE_DMA; |
| if (zone->size) |
| zonelist->zones[j++] = zone; |
| } |
| zonelist->zones[j++] = NULL; |
| } |
| } |
| |
| /* |
| * Helper functions to size the waitqueue hash table. |
| * Essentially these want to choose hash table sizes sufficiently |
| * large so that collisions trying to wait on pages are rare. |
 * But in fact the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200, so this choice is
 * conservative even though it seems large.
| * |
| * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to |
| * waitqueues, i.e. the size of the waitq table given the number of pages. |
| */ |
| #define PAGES_PER_WAITQUEUE 256 |
| |
| static inline unsigned long wait_table_size(unsigned long pages) |
| { |
| unsigned long size = 1; |
| |
| pages /= PAGES_PER_WAITQUEUE; |
| |
| while (size < pages) |
| size <<= 1; |
| |
| /* |
| * Once we have dozens or even hundreds of threads sleeping |
| * on IO we've got bigger problems than wait queue collision. |
| * Limit the size of the wait table to a reasonable size. |
| */ |
| size = min(size, 4096UL); |
| |
| return size; |
| } |
| |
| /* |
| * This is an integer logarithm so that shifts can be used later |
| * to extract the more random high bits from the multiplicative |
| * hash function before the remainder is taken. |
| */ |
| static inline unsigned long wait_table_bits(unsigned long size) |
| { |
| return ffz(~size); |
| } |
| |
| #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
| |
| /* |
| * Set up the zone data structures: |
| * - mark all pages reserved |
| * - mark all memory queues empty |
| * - clear the memory bitmaps |
| */ |
| void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, |
| unsigned long *zones_size, unsigned long zone_start_paddr, |
| unsigned long *zholes_size, struct page *lmem_map) |
| { |
| unsigned long i, j; |
| unsigned long map_size; |
| unsigned long totalpages, offset, realtotalpages; |
| const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); |
| |
| if (zone_start_paddr & ~PAGE_MASK) |
| BUG(); |
| |
| totalpages = 0; |
| for (i = 0; i < MAX_NR_ZONES; i++) { |
| unsigned long size = zones_size[i]; |
| totalpages += size; |
| } |
| realtotalpages = totalpages; |
| if (zholes_size) |
| for (i = 0; i < MAX_NR_ZONES; i++) |
| realtotalpages -= zholes_size[i]; |
| |
| printk("On node %d totalpages: %lu\n", nid, realtotalpages); |
| |
| /* |
| * Some architectures (with lots of mem and discontinous memory |
| * maps) have to search for a good mem_map area: |
| * For discontigmem, the conceptual mem map array starts from |
| * PAGE_OFFSET, we need to align the actual array onto a mem map |
| * boundary, so that MAP_NR works. |
| */ |
| map_size = (totalpages + 1)*sizeof(struct page); |
| if (lmem_map == (struct page *)0) { |
| lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); |
| lmem_map = (struct page *)(PAGE_OFFSET + |
| MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); |
| } |
| *gmap = pgdat->node_mem_map = lmem_map; |
| pgdat->node_size = totalpages; |
| pgdat->node_start_paddr = zone_start_paddr; |
| pgdat->node_start_mapnr = (lmem_map - mem_map); |
| pgdat->nr_zones = 0; |
| |
| offset = lmem_map - mem_map; |
| for (j = 0; j < MAX_NR_ZONES; j++) { |
| zone_t *zone = pgdat->node_zones + j; |
| unsigned long mask; |
| unsigned long size, realsize; |
| int idx; |
| |
| zone_table[nid * MAX_NR_ZONES + j] = zone; |
| realsize = size = zones_size[j]; |
| if (zholes_size) |
| realsize -= zholes_size[j]; |
| |
| printk("zone(%lu): %lu pages.\n", j, size); |
| zone->size = size; |
| zone->realsize = realsize; |
| zone->name = zone_names[j]; |
| zone->lock = SPIN_LOCK_UNLOCKED; |
| zone->zone_pgdat = pgdat; |
| zone->free_pages = 0; |
| zone->need_balance = 0; |
		zone->nr_active_pages = zone->nr_inactive_pages = 0;

		if (!size)
| continue; |
| |
| /* |
| * The per-page waitqueue mechanism uses hashed waitqueues |
| * per zone. |
| */ |
| zone->wait_table_size = wait_table_size(size); |
| zone->wait_table_shift = |
| BITS_PER_LONG - wait_table_bits(zone->wait_table_size); |
| zone->wait_table = (wait_queue_head_t *) |
| alloc_bootmem_node(pgdat, zone->wait_table_size |
| * sizeof(wait_queue_head_t)); |
| |
| for(i = 0; i < zone->wait_table_size; ++i) |
| init_waitqueue_head(zone->wait_table + i); |
| |
| pgdat->nr_zones = j+1; |
| |
| mask = (realsize / zone_balance_ratio[j]); |
| if (mask < zone_balance_min[j]) |
| mask = zone_balance_min[j]; |
| else if (mask > zone_balance_max[j]) |
| mask = zone_balance_max[j]; |
| zone->watermarks[j].min = mask; |
| zone->watermarks[j].low = mask*2; |
| zone->watermarks[j].high = mask*3; |
| /* now set the watermarks of the lower zones in the "j" classzone */ |
| for (idx = j-1; idx >= 0; idx--) { |
| zone_t * lower_zone = pgdat->node_zones + idx; |
| unsigned long lower_zone_reserve; |
| if (!lower_zone->size) |
| continue; |
| |
| mask = lower_zone->watermarks[idx].min; |
| lower_zone->watermarks[j].min = mask; |
| lower_zone->watermarks[j].low = mask*2; |
| lower_zone->watermarks[j].high = mask*3; |
| |
			/*
			 * Now the subtler part: reserve extra pages in this
			 * lower zone so that allocations from the "j"
			 * classzone cannot exhaust it.
			 */
| lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; |
| lower_zone->watermarks[j].min += lower_zone_reserve; |
| lower_zone->watermarks[j].low += lower_zone_reserve; |
| lower_zone->watermarks[j].high += lower_zone_reserve; |
| |
| realsize += lower_zone->realsize; |
| } |
| |
| zone->zone_mem_map = mem_map + offset; |
| zone->zone_start_mapnr = offset; |
| zone->zone_start_paddr = zone_start_paddr; |
| |
| if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) |
| printk("BUG: wrong zone alignment, it will crash\n"); |
| |
| /* |
| * Initially all pages are reserved - free ones are freed |
| * up by free_all_bootmem() once the early boot process is |
| * done. Non-atomic initialization, single-pass. |
| */ |
| for (i = 0; i < size; i++) { |
| struct page *page = mem_map + offset + i; |
| set_page_zone(page, nid * MAX_NR_ZONES + j); |
| set_page_count(page, 0); |
| SetPageReserved(page); |
| INIT_LIST_HEAD(&page->list); |
| if (j != ZONE_HIGHMEM) |
| set_page_address(page, __va(zone_start_paddr)); |
| zone_start_paddr += PAGE_SIZE; |
| } |
| |
| offset += size; |
| for (i = 0; ; i++) { |
| unsigned long bitmap_size; |
| |
| INIT_LIST_HEAD(&zone->free_area[i].free_list); |
| if (i == MAX_ORDER-1) { |
| zone->free_area[i].map = NULL; |
| break; |
| } |
| |
| /* |
| * Page buddy system uses "index >> (i+1)", |
| * where "index" is at most "size-1". |
| * |
| * The extra "+3" is to round down to byte |
| * size (8 bits per byte assumption). Thus |
| * we get "(size-1) >> (i+4)" as the last byte |
| * we can access. |
| * |
| * The "+1" is because we want to round the |
| * byte allocation up rather than down. So |
| * we should have had a "+7" before we shifted |
| * down by three. Also, we have to add one as |
| * we actually _use_ the last bit (it's [0,n] |
| * inclusive, not [0,n[). |
| * |
| * So we actually had +7+1 before we shift |
| * down by 3. But (n+8) >> 3 == (n >> 3) + 1 |
| * (modulo overflows, which we do not have). |
| * |
| * Finally, we LONG_ALIGN because all bitmap |
| * operations are on longs. |
| */ |
| bitmap_size = (size-1) >> (i+4); |
| bitmap_size = LONG_ALIGN(bitmap_size+1); |
| zone->free_area[i].map = |
| (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); |
| } |
| } |
| build_zonelists(pgdat); |
| } |
| |
| void __init free_area_init(unsigned long *zones_size) |
| { |
| free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); |
| } |
| |
| static int __init setup_mem_frac(char *str) |
| { |
| int j = 0; |
| |
	while (j < MAX_NR_ZONES && get_option(&str, &zone_balance_ratio[j++]) == 2);
| printk("setup_mem_frac: "); |
| for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); |
| printk("\n"); |
| return 1; |
| } |
| |
| __setup("memfrac=", setup_mem_frac); |
| |
| static int __init setup_lower_zone_reserve(char *str) |
| { |
| int j = 0; |
| |
	while (j < MAX_NR_ZONES-1 && get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
| printk("setup_lower_zone_reserve: "); |
| for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); |
| printk("\n"); |
| return 1; |
| } |
| |
| __setup("lower_zone_reserve=", setup_lower_zone_reserve); |