/*
* mm/mmap.c
*
* Written by obz.
*
* Address space accounting code <alan@redhat.com>
*/
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
/*
* WARNING: the debugging will use recursive algorithms so never enable this
* unless you know what you are doing.
*/
#undef DEBUG_MM_RB
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
*
* map_type     prot
*              PROT_NONE       PROT_READ       PROT_WRITE      PROT_EXEC
* MAP_SHARED   r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
*              w: (no) no      w: (no) no      w: (yes) yes    w: (no) no
*              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
*
* MAP_PRIVATE  r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
*              w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
*              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
*
*/
pgprot_t protection_map[16] = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
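/*
* protection_map is indexed by the low four vm_flags bits
* (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED); see the
* protection_map[vm_flags & 0x0f] lookups below.  Illustrative example:
* a MAP_PRIVATE mapping made with PROT_READ|PROT_WRITE ends up with
* VM_READ|VM_WRITE (index 0x3) and gets __P011, while the same
* protections on a MAP_SHARED mapping add VM_SHARED (index 0xb) and
* get __S011.
*/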
int sysctl_overcommit_memory = 0; /* default is heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
atomic_t vm_committed_space = ATOMIC_INIT(0);
inline void vm_unacct_memory(long pages)
{
atomic_sub(pages, &vm_committed_space);
}
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 1 means there is enough memory for the allocation to
* succeed and 0 implies there is not.
*
* We currently support three overcommit policies, which are set via the
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*/
int vm_enough_memory(long pages)
{
unsigned long free, allowed;
atomic_add(pages, &vm_committed_space);
/*
* Sometimes we want to use more memory than we have
*/
if (sysctl_overcommit_memory == 1)
return 1;
if (sysctl_overcommit_memory == 0) {
free = get_page_cache_size();
free += nr_free_pages();
free += nr_swap_pages;
/*
* This double-counts: the nrpages are both in the
* page-cache and in the swapper space. At the same time,
* this compensates for the swap-space over-allocation
* (ie "nr_swap_pages" being too small).
*/
free += total_swapcache_pages;
/*
* The code below doesn't account for free space in the
* inode and dentry slab cache, slab cache fragmentation,
* inodes and dentries which will become freeable under
* VM load, etc. Let's just hope all these (complex)
* factors balance out...
*/
free += (dentry_stat.nr_unused * sizeof(struct dentry)) >>
PAGE_SHIFT;
free += (inodes_stat.nr_unused * sizeof(struct inode)) >>
PAGE_SHIFT;
if (free > pages)
return 1;
vm_unacct_memory(pages);
return 0;
}
allowed = totalram_pages * sysctl_overcommit_ratio / 100;
allowed += total_swap_pages;
if (atomic_read(&vm_committed_space) < allowed)
return 1;
vm_unacct_memory(pages);
return 0;
}
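/*
* Caller pattern (illustrative): a successful vm_enough_memory(pages)
* call has already charged 'pages' to vm_committed_space, so a caller
* whose operation fails later on must give the charge back:
*
*     charged = len >> PAGE_SHIFT;
*     if (!vm_enough_memory(charged))
*         return -ENOMEM;
*     ...
*     if (error)
*         vm_unacct_memory(charged);
*
* This is the pattern do_mmap_pgoff() below uses for private writable
* mappings.
*/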
/*
* Requires inode->i_mapping->i_shared_sem
*/
static inline void
__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
{
if (inode) {
if (vma->vm_flags & VM_DENYWRITE)
atomic_inc(&inode->i_writecount);
list_del_init(&vma->shared);
}
}
/*
* Remove one vm structure from the inode's i_mapping address space.
*/
static void remove_shared_vm_struct(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
if (file) {
struct inode *inode = file->f_dentry->d_inode;
down(&inode->i_mapping->i_shared_sem);
__remove_shared_vm_struct(vma, inode);
up(&inode->i_mapping->i_shared_sem);
}
}
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
* like trying to un-brk an area that has already been mapped
* to a regular file. in this case, the unmapping will need
* to invoke file system routines that need the global lock.
*/
asmlinkage unsigned long sys_brk(unsigned long brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
down_write(&mm->mmap_sem);
if (brk < mm->end_code)
goto out;
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk)
goto set_brk;
/* Always allow shrinking brk. */
if (brk <= mm->brk) {
if (!do_munmap(mm, newbrk, oldbrk-newbrk))
goto set_brk;
goto out;
}
/* Check against rlimit.. */
rlim = current->rlim[RLIMIT_DATA].rlim_cur;
if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
goto out;
/* Check against existing mmap mappings. */
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
goto out;
/* Ok, looks good - let it rip. */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
set_brk:
mm->brk = brk;
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
return retval;
}
/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
* internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
* into "VM_xxx".
*/
static inline unsigned long
calc_vm_flags(unsigned long prot, unsigned long flags)
{
#define _trans(x,bit1,bit2) \
((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
unsigned long prot_bits, flag_bits;
prot_bits =
_trans(prot, PROT_READ, VM_READ) |
_trans(prot, PROT_WRITE, VM_WRITE) |
_trans(prot, PROT_EXEC, VM_EXEC);
flag_bits =
_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
_trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
return prot_bits | flag_bits;
#undef _trans
}
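/*
* Illustrative example: calc_vm_flags(PROT_READ|PROT_EXEC, MAP_GROWSDOWN)
* yields VM_READ | VM_EXEC | VM_GROWSDOWN.  PROT_NONE contributes no
* bits, and MAP_ flags other than GROWSDOWN, DENYWRITE and EXECUTABLE
* (MAP_SHARED, MAP_LOCKED, ...) are handled separately in do_mmap_pgoff().
*/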
#ifdef DEBUG_MM_RB
static int browse_rb(struct rb_node * rb_node) {
int i = 0;
if (rb_node) {
i++;
i += browse_rb(rb_node->rb_left);
i += browse_rb(rb_node->rb_right);
}
return i;
}
static void validate_mm(struct mm_struct * mm) {
int bug = 0;
int i = 0;
struct vm_area_struct * tmp = mm->mmap;
while (tmp) {
tmp = tmp->vm_next;
i++;
}
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
i = browse_rb(mm->mm_rb.rb_node);
if (i != mm->map_count)
printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
if (bug)
BUG();
}
#else
#define validate_mm(mm) do { } while (0)
#endif
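/*
* Find the first vma which ends above addr (NULL if there is none).
* When addr is not inside an existing vma, this also fills in what
* vma_link() needs to insert a new vma there: *pprev is the vma
* preceding addr (or NULL), and *rb_link / *rb_parent describe the
* rbtree slot the new node would occupy.  If addr does fall inside an
* existing vma, that vma is returned immediately and the callers below
* unmap it and retry (see their munmap_back loops).
*/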
static struct vm_area_struct *
find_vma_prepare(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev, struct rb_node ***rb_link,
struct rb_node ** rb_parent)
{
struct vm_area_struct * vma;
struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
vma = NULL;
while (*__rb_link) {
struct vm_area_struct *vma_tmp;
__rb_parent = *__rb_link;
vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
vma = vma_tmp;
if (vma_tmp->vm_start <= addr)
return vma;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
__rb_link = &__rb_parent->rb_right;
}
}
*pprev = NULL;
if (rb_prev)
*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
*rb_link = __rb_link;
*rb_parent = __rb_parent;
return vma;
}
static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
if (prev) {
vma->vm_next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
vma->vm_next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
vma->vm_next = NULL;
}
}
static void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
static inline void __vma_link_file(struct vm_area_struct *vma)
{
struct file * file;
file = vma->vm_file;
if (file) {
struct inode * inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
list_add_tail(&vma->shared, &mapping->i_mmap_shared);
else
list_add_tail(&vma->shared, &mapping->i_mmap);
}
}
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
__vma_link_file(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
struct address_space *mapping = NULL;
if (vma->vm_file)
mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);
__vma_link(mm, vma, prev, rb_link, rb_parent);
spin_unlock(&mm->page_table_lock);
if (mapping)
up(&mapping->i_shared_sem);
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
}
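/*
* Note the lock ordering vma_link() establishes for file-backed vmas:
* the mapping's i_shared_sem is taken before mm->page_table_lock.
* vma_merge() below follows the same ordering when it needs both.
*/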
/*
* Return true if we can merge this (vm_flags,file,vm_pgoff,size)
* in front of (at a lower virtual address and file offset than) the vma.
*
* We don't check here for the merged mmap wrapping around the end of pagecache
* indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct file *file, unsigned long vm_pgoff, unsigned long size)
{
if (vma->vm_file == file && vma->vm_flags == vm_flags) {
if (!file)
return 1; /* anon mapping */
if (vma->vm_pgoff == vm_pgoff + size)
return 1;
}
return 0;
}
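/*
* Illustrative example: an existing file-backed vma covering file pages
* [10, 14) (vm_pgoff == 10) can have a 4-page request with vm_pgoff == 6
* merged in front of it, since 6 + 4 == 10; the virtual-address
* adjacency (new end == vma->vm_start) is checked by the callers.
*/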
/*
* Return true if we can merge this (vm_flags,file,vm_pgoff)
* beyond (at a higher virtual address and file offset than) the vma.
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct file *file, unsigned long vm_pgoff)
{
if (vma->vm_file == file && vma->vm_flags == vm_flags) {
unsigned long vma_size;
if (!file)
return 1; /* anon mapping */
vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (vma->vm_pgoff + vma_size == vm_pgoff)
return 1;
}
return 0;
}
/*
* Given a new mapping request (addr,end,vm_flags,file,pgoff), figure out
* whether that can be merged with its predecessor or its successor. Or
* both (it neatly fills a hole).
*/
static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
struct rb_node *rb_parent, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct file *file, unsigned long pgoff)
{
spinlock_t * lock = &mm->page_table_lock;
if (!prev) {
prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
goto merge_next;
}
/*
* Can it merge with the predecessor?
*/
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
int need_up = 0;
if (unlikely(file && prev->vm_next &&
prev->vm_next->vm_file == file)) {
down(&inode->i_mapping->i_shared_sem);
need_up = 1;
}
spin_lock(lock);
prev->vm_end = end;
/*
* OK, it did. Can we now merge in the successor as well?
*/
next = prev->vm_next;
if (next && prev->vm_end == next->vm_start &&
can_vma_merge_before(next, vm_flags, file,
pgoff, (end - addr) >> PAGE_SHIFT)) {
prev->vm_end = next->vm_end;
__vma_unlink(mm, next, prev);
__remove_shared_vm_struct(next, inode);
spin_unlock(lock);
if (need_up)
up(&inode->i_mapping->i_shared_sem);
mm->map_count--;
kmem_cache_free(vm_area_cachep, next);
return 1;
}
spin_unlock(lock);
if (need_up)
up(&inode->i_mapping->i_shared_sem);
return 1;
}
/*
* Can this new request be merged in front of prev->vm_next?
*/
prev = prev->vm_next;
if (prev) {
merge_next:
if (!can_vma_merge_before(prev, vm_flags, file,
pgoff, (end - addr) >> PAGE_SHIFT))
return 0;
if (end == prev->vm_start) {
spin_lock(lock);
prev->vm_start = addr;
prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
spin_unlock(lock);
return 1;
}
}
return 0;
}
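/*
* Worked example (illustrative): with an anonymous prev covering
* [0x1000, 0x2000) and its vm_next covering [0x3000, 0x4000), both with
* vm_flags equal to the request, mapping [0x2000, 0x3000) anonymously
* first extends prev->vm_end to 0x3000 and then, since prev now abuts
* the successor, swallows it too: prev ends up covering
* [0x1000, 0x4000) and the successor vma is unlinked and freed.
*/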
/*
* The caller must hold down_write(current->mm->mmap_sem).
*/
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
struct inode *inode;
unsigned int vm_flags;
int correct_wcount = 0;
int error;
struct rb_node ** rb_link, * rb_parent;
unsigned long charged = 0;
if (file && (!file->f_op || !file->f_op->mmap))
return -ENODEV;
if (!len)
return addr;
if (len > TASK_SIZE)
return -EINVAL;
len = PAGE_ALIGN(len);
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EINVAL;
/* Too many mappings? */
if (mm->map_count > MAX_MAP_COUNT)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (addr & ~PAGE_MASK)
return addr;
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags = calc_vm_flags(prot,flags) | mm->def_flags |
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED) {
if (!capable(CAP_IPC_LOCK))
return -EPERM;
vm_flags |= VM_LOCKED;
}
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
locked += len;
if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
return -EAGAIN;
}
inode = file ? file->f_dentry->d_inode : NULL;
if (file) {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
/*
* Make sure we don't allow writing to an append-only
* file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
/*
* Make sure there are no mandatory locks on the file.
*/
if (locks_verify_locked(inode))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
break;
default:
return -EINVAL;
}
} else {
vm_flags |= VM_SHARED | VM_MAYSHARE;
switch (flags & MAP_TYPE) {
default:
return -EINVAL;
case MAP_PRIVATE:
vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
/* fall through */
case MAP_SHARED:
break;
}
}
error = security_file_mmap(file, prot, flags);
if (error)
return error;
/* Clear old maps */
error = -ENOMEM;
munmap_back:
vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur)
return -ENOMEM;
if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) {
if (vm_flags & VM_SHARED) {
/* Check memory availability in shmem_file_setup? */
vm_flags |= VM_ACCOUNT;
} else if (vm_flags & VM_WRITE) {
/*
* Private writable mapping: check memory availability
*/
charged = len >> PAGE_SHIFT;
if (!vm_enough_memory(charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
}
/* Can we just expand an old anonymous mapping? */
if (!file && !(vm_flags & VM_SHARED) && rb_parent)
if (vma_merge(mm, prev, rb_parent, addr, addr + len,
vm_flags, NULL, 0))
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper.  The address has already been validated and any old
* mappings covering the range have been removed from the list.
*/
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
error = -ENOMEM;
if (!vma)
goto unacct_error;
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = protection_map[vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_pgoff = pgoff;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
INIT_LIST_HEAD(&vma->shared);
if (file) {
error = -EINVAL;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
correct_wcount = 1;
}
vma->vm_file = file;
get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
* shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
* that memory reservation must be checked; but that reservation
* belongs to the shared memory object, not to the vma: so now clear it.
*/
if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
vma->vm_flags &= ~VM_ACCOUNT;
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
*/
addr = vma->vm_start;
if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
addr + len, vma->vm_flags, file, pgoff)) {
vma_link(mm, vma, prev, rb_link, rb_parent);
if (correct_wcount)
atomic_inc(&inode->i_writecount);
} else {
if (file) {
if (correct_wcount)
atomic_inc(&inode->i_writecount);
fput(file);
}
kmem_cache_free(vm_area_cachep, vma);
}
out:
mm->total_vm += len >> PAGE_SHIFT;
if (vm_flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
if (flags & MAP_POPULATE) {
up_write(&mm->mmap_sem);
sys_remap_file_pages(addr, len, prot,
pgoff, flags & MAP_NONBLOCK);
down_write(&mm->mmap_sem);
}
return addr;
unmap_and_free_vma:
if (correct_wcount)
atomic_inc(&inode->i_writecount);
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
* Ugly calling convention alert:
* Return value with the low bits set means error value,
* ie
* if (ret & ~PAGE_MASK)
* error = ret;
*
* This function "knows" that -ENOMEM has the bits set.
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA
static inline unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int found_hole = 0;
if (len > TASK_SIZE)
return -ENOMEM;
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
addr = mm->free_area_cache;
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
/*
* Record the first available hole.
*/
if (!found_hole && (!vma || addr < vma->vm_start)) {
mm->free_area_cache = addr;
found_hole = 1;
}
if (!vma || addr + len <= vma->vm_start)
return addr;
addr = vma->vm_end;
}
}
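/*
* In the generic version above, mm->free_area_cache remembers the lowest
* hole seen during the scan so that the next search can start there
* rather than walking every vma from the bottom again; unmap_vma()
* moves it back down when unmapping opens a lower hole.
*/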
#else
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
#endif
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
if (flags & MAP_FIXED) {
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (addr & ~PAGE_MASK)
return -EINVAL;
if (file && is_file_hugepages(file)) {
unsigned long ret;
ret = is_aligned_hugepage_range(addr, len);
if (ret)
return ret;
}
return addr;
}
if (file && file->f_op && file->f_op->get_unmapped_area)
return file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
if (mm) {
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
vma = mm->mmap_cache;
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
struct rb_node * rb_node;
rb_node = mm->mm_rb.rb_node;
vma = NULL;
while (rb_node) {
struct vm_area_struct * vma_tmp;
vma_tmp = rb_entry(rb_node,
struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
vma = vma_tmp;
if (vma_tmp->vm_start <= addr)
break;
rb_node = rb_node->rb_left;
} else
rb_node = rb_node->rb_right;
}
if (vma)
mm->mmap_cache = vma;
}
}
return vma;
}
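/*
* Note that the vma returned by find_vma() may start above addr, i.e.
* addr may sit in the hole just below it.  Illustrative example: with
* vmas at [0x1000, 0x2000) and [0x3000, 0x4000), find_vma(mm, 0x2800)
* returns the [0x3000, 0x4000) vma.  Callers that need containment must
* also check vma->vm_start <= addr, as find_extend_vma() does.
*/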
/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma = NULL, *prev = NULL;
struct rb_node * rb_node;
if (!mm)
goto out;
/* Guard against addr being lower than the first VMA */
vma = mm->mmap;
/* Go through the RB tree quickly. */
rb_node = mm->mm_rb.rb_node;
while (rb_node) {
struct vm_area_struct *vma_tmp;
vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
if (addr < vma_tmp->vm_end) {
rb_node = rb_node->rb_left;
} else {
prev = vma_tmp;
if (!prev->vm_next || (addr < prev->vm_next->vm_end))
break;
rb_node = rb_node->rb_right;
}
}
out:
*pprev = prev;
return prev ? prev->vm_next : vma;
}
#ifdef CONFIG_STACK_GROWSUP
/*
* vma is the first one with address > vma->vm_end. Have to extend vma.
*/
int expand_stack(struct vm_area_struct * vma, unsigned long address)
{
unsigned long grow;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need to get
* the spinlock only before relocating the vma range ourself.
*/
address += 4 + PAGE_SIZE - 1;
address &= PAGE_MASK;
spin_lock(&vma->vm_mm->page_table_lock);
grow = (address - vma->vm_end) >> PAGE_SHIFT;
/* Overcommit.. */
if (!vm_enough_memory(grow)) {
spin_unlock(&vma->vm_mm->page_table_lock);
return -ENOMEM;
}
if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
current->rlim[RLIMIT_AS].rlim_cur) {
spin_unlock(&vma->vm_mm->page_table_lock);
vm_unacct_memory(grow);
return -ENOMEM;
}
vma->vm_end = address;
vma->vm_mm->total_vm += grow;
if (vma->vm_flags & VM_LOCKED)
vma->vm_mm->locked_vm += grow;
spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma, *prev;
addr &= PAGE_MASK;
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED) {
make_pages_present(addr, prev->vm_end);
}
return prev;
}
#else
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
unsigned long grow;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need to get
* the spinlock only before relocating the vma range ourself.
*/
address &= PAGE_MASK;
spin_lock(&vma->vm_mm->page_table_lock);
grow = (vma->vm_start - address) >> PAGE_SHIFT;
/* Overcommit.. */
if (!vm_enough_memory(grow)) {
spin_unlock(&vma->vm_mm->page_table_lock);
return -ENOMEM;
}
if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
current->rlim[RLIMIT_AS].rlim_cur) {
spin_unlock(&vma->vm_mm->page_table_lock);
vm_unacct_memory(grow);
return -ENOMEM;
}
vma->vm_start = address;
vma->vm_pgoff -= grow;
vma->vm_mm->total_vm += grow;
if (vma->vm_flags & VM_LOCKED)
vma->vm_mm->locked_vm += grow;
spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
struct vm_area_struct * vma;
unsigned long start;
addr &= PAGE_MASK;
vma = find_vma(mm,addr);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED) {
make_pages_present(addr, start);
}
return vma;
}
#endif
/*
* Try to free as many page directory entries as we can,
* without having to work very hard at actually scanning
* the page tables themselves.
*
* Right now we try to free page tables if we have a nice
* PGDIR-aligned area that got free'd up. We could be more
* granular if we want to, but this is fast and simple,
* and covers the bad cases.
*
* "prev", if it exists, points to a vma before the one
* we just free'd - but there's no telling how much before.
*/
static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
unsigned long first = start & PGDIR_MASK;
unsigned long last = end + PGDIR_SIZE - 1;
unsigned long start_index, end_index;
struct mm_struct *mm = tlb->mm;
if (!prev) {
prev = mm->mmap;
if (!prev)
goto no_mmaps;
if (prev->vm_end > start) {
if (last > prev->vm_start)
last = prev->vm_start;
goto no_mmaps;
}
}
for (;;) {
struct vm_area_struct *next = prev->vm_next;
if (next) {
if (next->vm_start < start) {
prev = next;
continue;
}
if (last > next->vm_start)
last = next->vm_start;
}
if (prev->vm_end > first)
first = prev->vm_end + PGDIR_SIZE - 1;
break;
}
no_mmaps:
if (last < first) /* for arches with discontiguous pgd indices */
return;
/*
* If the PGD bits are not consecutive in the virtual address, the
* old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
*/
start_index = pgd_index(first);
if (start_index < FIRST_USER_PGD_NR)
start_index = FIRST_USER_PGD_NR;
end_index = pgd_index(last);
if (end_index > start_index) {
clear_page_tables(tlb, start_index, end_index - start_index);
flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
}
}
/* Normal function to fix up a mapping
* This function is the default for when an area has no specific
* function. This may be used as part of a more specific routine.
*
* By the time this function is called, the area struct has been
* removed from the process mapping list.
*/
static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
{
size_t len = area->vm_end - area->vm_start;
area->vm_mm->total_vm -= len >> PAGE_SHIFT;
if (area->vm_flags & VM_LOCKED)
area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
/*
* Is this a new hole at the lowest possible address?
*/
if (area->vm_start >= TASK_UNMAPPED_BASE &&
area->vm_start < area->vm_mm->free_area_cache)
area->vm_mm->free_area_cache = area->vm_start;
remove_shared_vm_struct(area);
if (area->vm_ops && area->vm_ops->close)
area->vm_ops->close(area);
if (area->vm_file)
fput(area->vm_file);
kmem_cache_free(vm_area_cachep, area);
}
/*
* Update the VMA and inode share lists.
*
* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and do the vma updates.
*/
static void unmap_vma_list(struct mm_struct *mm,
struct vm_area_struct *mpnt)
{
do {
struct vm_area_struct *next = mpnt->vm_next;
unmap_vma(mm, mpnt);
mpnt = next;
} while (mpnt != NULL);
validate_mm(mm);
}
/*
* Get rid of page table information in the indicated region.
*
* Called with the page table lock held.
*/
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma,
struct vm_area_struct *prev,
unsigned long start,
unsigned long end)
{
struct mmu_gather *tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
vm_unacct_memory(nr_accounted);
free_pgtables(tlb, prev, start, end);
tlb_finish_mmu(tlb, start, end);
}
/*
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*
* Called with the page_table_lock held.
*/
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
tail_vma->vm_next = NULL;
mm->mmap_cache = NULL; /* Kill the cache. */
}
/*
* Split a vma into two pieces at address 'addr'; a new vma is allocated
* either for the first part or for the tail.
*/
int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
if (mm->map_count >= MAX_MAP_COUNT)
return -ENOMEM;
new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!new)
return -ENOMEM;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
INIT_LIST_HEAD(&new->shared);
if (new_below) {
new->vm_end = addr;
vma->vm_start = addr;
vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
} else {
vma->vm_end = addr;
new->vm_start = addr;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
if (new->vm_file)
get_file(new->vm_file);
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
insert_vm_struct(mm, new);
return 0;
}
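/*
* Illustrative example: split_vma(mm, vma, 0x2000, 0) on a vma covering
* [0x1000, 0x4000) trims vma to [0x1000, 0x2000) and inserts a new vma
* for [0x2000, 0x4000), with its vm_pgoff advanced so the file offset
* stays in sync; passing new_below == 1 instead makes the new vma take
* the [0x1000, 0x2000) part and vma itself start at 0x2000.
*/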
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
unsigned long end;
struct vm_area_struct *mpnt, *prev, *last;
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
if ((len = PAGE_ALIGN(len)) == 0)
return -EINVAL;
/* Find the first overlapping VMA */
mpnt = find_vma_prev(mm, start, &prev);
if (!mpnt)
return 0;
/* we have start < mpnt->vm_end */
if (is_vm_hugetlb_page(mpnt)) {
int ret = is_aligned_hugepage_range(start, len);
if (ret)
return ret;
}
/* if it doesn't overlap, we have nothing.. */
end = start + len;
if (mpnt->vm_start >= end)
return 0;
/* Something will probably happen, so notify. */
if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
profile_exec_unmap(mm);
/*
* If we need to split any vma, do it now to save pain later.
*/
if (start > mpnt->vm_start) {
if (split_vma(mm, mpnt, start, 0))
return -ENOMEM;
prev = mpnt;
mpnt = mpnt->vm_next;
}
/* Does it split the last one? */
last = find_vma(mm, end);
if (last && end > last->vm_start) {
if (split_vma(mm, last, end, 0))
return -ENOMEM;
}
/*
* Remove the vma's, and unmap the actual pages
*/
spin_lock(&mm->page_table_lock);
detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
unmap_region(mm, mpnt, prev, start, end);
spin_unlock(&mm->page_table_lock);
/* Fix up all other VM information */
unmap_vma_list(mm, mpnt);
return 0;
}
asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
down_write(&mm->mmap_sem);
ret = do_munmap(mm, addr, len);
up_write(&mm->mmap_sem);
return ret;
}
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
unsigned long flags;
struct rb_node ** rb_link, * rb_parent;
len = PAGE_ALIGN(len);
if (!len)
return addr;
/*
* mlock MCL_FUTURE?
*/
if (mm->def_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
locked += len;
if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
return -EAGAIN;
}
/*
* Clear old maps. this also does some error checking for us
*/
munmap_back:
vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
/* Check against address space limits *after* clearing old maps... */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur)
return -ENOMEM;
if (mm->map_count > MAX_MAP_COUNT)
return -ENOMEM;
if (!vm_enough_memory(len >> PAGE_SHIFT))
return -ENOMEM;
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
/* Can we just expand an old anonymous mapping? */
if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len,
flags, NULL, 0))
goto out;
/*
* create a vma struct for an anonymous mapping
*/
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = flags;
vma->vm_page_prot = protection_map[flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_pgoff = 0;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
INIT_LIST_HEAD(&vma->shared);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
return addr;
}
/* Build the RB tree corresponding to the VMA list. */
void build_mmap_rb(struct mm_struct * mm)
{
struct vm_area_struct * vma;
struct rb_node ** rb_link, * rb_parent;
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
__vma_link_rb(mm, vma, rb_link, rb_parent);
rb_parent = &vma->vm_rb;
rb_link = &rb_parent->rb_right;
}
}
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
profile_exit_mmap(mm);
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 1);
flush_cache_mm(mm);
/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
~0UL, &nr_accounted);
vm_unacct_memory(nr_accounted);
BUG_ON(mm->map_count); /* This is just debugging */
clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
tlb_finish_mmu(tlb, 0, TASK_SIZE);
vma = mm->mmap;
mm->mmap = mm->mmap_cache = NULL;
mm->mm_rb = RB_ROOT;
mm->rss = 0;
mm->total_vm = 0;
mm->locked_vm = 0;
spin_unlock(&mm->page_table_lock);
/*
* Walk the list again, actually closing and freeing it
* without holding any MM locks.
*/
while (vma) {
struct vm_area_struct *next = vma->vm_next;
remove_shared_vm_struct(vma);
if (vma->vm_ops) {
if (vma->vm_ops->close)
vma->vm_ops->close(vma);
}
if (vma->vm_file)
fput(vma->vm_file);
kmem_cache_free(vm_area_cachep, vma);
vma = next;
}
}
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap ring. If vm_file is non-NULL
* then i_shared_sem is taken here.
*/
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
struct vm_area_struct * __vma, * prev;
struct rb_node ** rb_link, * rb_parent;
__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
if (__vma && __vma->vm_start < vma->vm_end)
BUG();
vma_link(mm, vma, prev, rb_link, rb_parent);
validate_mm(mm);
}