releases/3.17.3/vfs-fix-data-corruption-when-blocksize-pagesize-for-mmaped-data.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001
 From: Jan Kara <jack@suse.cz>
 Date: Wed, 1 Oct 2014 21:49:18 -0400
 Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data

 From: Jan Kara <jack@suse.cz>

 commit 90a8020278c1598fafd071736a0846b38510309c upstream.

 ->page_mkwrite() is used by filesystems to allocate blocks under a page
 which is becoming writeably mmapped in some process' address space. This
 allows a filesystem to return a page fault if there is not enough space
 available, user exceeds quota or similar problem happens, rather than
 silently discarding data later when writepage is called.

 However VFS fails to call ->page_mkwrite() in all the cases where
 filesystems need it when blocksize < pagesize. For example when
 blocksize = 1024, pagesize = 4096 the following is problematic:
   ftruncate(fd, 0);
   pwrite(fd, buf, 1024, 0);
   map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0);
   map[0] = 'a';       ----> page_mkwrite() for index 0 is called
   ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
   mremap(map, 1024, 10000, 0);
   map[4095] = 'a';    ----> no page_mkwrite() called

 At the moment ->page_mkwrite() is called, filesystem can allocate only
 one block for the page because i_size == 1024. Otherwise it would create
 blocks beyond i_size which is generally undesirable. But later at
 ->writepage() time, we also need to store data at offset 4095 but we
 don't have block allocated for it.

 This patch introduces a helper function filesystems can use to have
 ->page_mkwrite() called at all the necessary moments.

 Signed-off-by: Jan Kara <jack@suse.cz>
 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

 ---
  fs/buffer.c        |    3 ++
  include/linux/mm.h |    1
  mm/truncate.c      |   57 +++++++++++++++++++++++++++++++++++++++++++++++++++++
  3 files changed, 61 insertions(+)

 --- a/fs/buffer.c
 +++ b/fs/buffer.c
 @@ -2082,6 +2082,7 @@ int generic_write_end(struct file *file,
  			struct page *page, void *fsdata)
  {
  	struct inode *inode = mapping->host;
 +	loff_t old_size = inode->i_size;
  	int i_size_changed = 0;

  	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 @@ -2101,6 +2102,8 @@ int generic_write_end(struct file *file,
  	unlock_page(page);
  	page_cache_release(page);

 +	if (old_size < pos)
 +		pagecache_isize_extended(inode, old_size, pos);
  	/*
  	 * Don't mark the inode dirty under page lock. First, it unnecessarily
  	 * makes the holding time of page lock longer. Second, it forces lock
 --- a/include/linux/mm.h
 +++ b/include/linux/mm.h
 @@ -1174,6 +1174,7 @@ static inline void unmap_shared_mapping_

  extern void truncate_pagecache(struct inode *inode, loff_t new);
  extern void truncate_setsize(struct inode *inode, loff_t newsize);
 +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
  void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
  int truncate_inode_page(struct address_space *mapping, struct page *page);
  int generic_error_remove_page(struct address_space *mapping, struct page *page);
 --- a/mm/truncate.c
 +++ b/mm/truncate.c
 @@ -20,6 +20,7 @@
  #include <linux/buffer_head.h>	/* grr. try_to_release_page,
  				   do_invalidatepage */
  #include <linux/cleancache.h>
 +#include <linux/rmap.h>
  #include "internal.h"

  static void clear_exceptional_entry(struct address_space *mapping,
 @@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache);
   */
  void truncate_setsize(struct inode *inode, loff_t newsize)
  {
 +	loff_t oldsize = inode->i_size;
 +
  	i_size_write(inode, newsize);
 +	if (newsize > oldsize)
 +		pagecache_isize_extended(inode, oldsize, newsize);
  	truncate_pagecache(inode, newsize);
  }
  EXPORT_SYMBOL(truncate_setsize);

  /**
 + * pagecache_isize_extended - update pagecache after extension of i_size
 + * @inode:	inode for which i_size was extended
 + * @from:	original inode size
 + * @to:		new inode size
 + *
 + * Handle extension of inode size either caused by extending truncate or by
 + * write starting after current i_size. We mark the page straddling current
 + * i_size RO so that page_mkwrite() is called on the nearest write access to
 + * the page.  This way filesystem can be sure that page_mkwrite() is called on
 + * the page before user writes to the page via mmap after the i_size has been
 + * changed.
 + *
 + * The function must be called after i_size is updated so that page fault
 + * coming after we unlock the page will already see the new i_size.
 + * The function must be called while we still hold i_mutex - this not only
 + * makes sure i_size is stable but also that userspace cannot observe new
 + * i_size value before we are prepared to store mmap writes at new inode size.
 + */
 +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
 +{
 +	int bsize = 1 << inode->i_blkbits;
 +	loff_t rounded_from;
 +	struct page *page;
 +	pgoff_t index;
 +
 +	WARN_ON(!mutex_is_locked(&inode->i_mutex));
 +	WARN_ON(to > inode->i_size);
 +
 +	if (from >= to || bsize == PAGE_CACHE_SIZE)
 +		return;
 +	/* Page straddling @from will not have any hole block created? */
 +	rounded_from = round_up(from, bsize);
 +	if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
 +		return;
 +
 +	index = from >> PAGE_CACHE_SHIFT;
 +	page = find_lock_page(inode->i_mapping, index);
 +	/* Page not cached? Nothing to do */
 +	if (!page)
 +		return;
 +	/*
 +	 * See clear_page_dirty_for_io() for details why set_page_dirty()
 +	 * is needed.
 +	 */
 +	if (page_mkclean(page))
 +		set_page_dirty(page);
 +	unlock_page(page);
 +	page_cache_release(page);
 +}
 +EXPORT_SYMBOL(pagecache_isize_extended);
 +
 +/**
   * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
   * @inode: inode
   * @lstart: offset of beginning of hole
	From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001
	From: Jan Kara <jack@suse.cz>
	Date: Wed, 1 Oct 2014 21:49:18 -0400
	Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data

	From: Jan Kara <jack@suse.cz>

	commit 90a8020278c1598fafd071736a0846b38510309c upstream.

	->page_mkwrite() is used by filesystems to allocate blocks under a page
	which is becoming writeably mmapped in some process' address space. This
	allows a filesystem to return a page fault if there is not enough space
	available, user exceeds quota or similar problem happens, rather than
	silently discarding data later when writepage is called.

	However VFS fails to call ->page_mkwrite() in all the cases where
	filesystems need it when blocksize < pagesize. For example when
	blocksize = 1024, pagesize = 4096 the following is problematic:
	ftruncate(fd, 0);
	pwrite(fd, buf, 1024, 0);
	map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0);
	map[0] = 'a'; ----> page_mkwrite() for index 0 is called
	ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
	mremap(map, 1024, 10000, 0);
	map[4095] = 'a'; ----> no page_mkwrite() called

	At the moment ->page_mkwrite() is called, filesystem can allocate only
	one block for the page because i_size == 1024. Otherwise it would create
	blocks beyond i_size which is generally undesirable. But later at
	->writepage() time, we also need to store data at offset 4095 but we
	don't have block allocated for it.

	This patch introduces a helper function filesystems can use to have
	->page_mkwrite() called at all the necessary moments.

	Signed-off-by: Jan Kara <jack@suse.cz>
	Signed-off-by: Theodore Ts'o <tytso@mit.edu>
	Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

	---
	fs/buffer.c \| 3 ++
	include/linux/mm.h \| 1
	mm/truncate.c \| 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++
	3 files changed, 61 insertions(+)

	--- a/fs/buffer.c
	+++ b/fs/buffer.c
	@@ -2082,6 +2082,7 @@ int generic_write_end(struct file *file,
	struct page page, void fsdata)
	{
	struct inode *inode = mapping->host;
	+ loff_t old_size = inode->i_size;
	int i_size_changed = 0;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	@@ -2101,6 +2102,8 @@ int generic_write_end(struct file *file,
	unlock_page(page);
	page_cache_release(page);

	+ if (old_size < pos)
	+ pagecache_isize_extended(inode, old_size, pos);
	/*
	* Don't mark the inode dirty under page lock. First, it unnecessarily
	* makes the holding time of page lock longer. Second, it forces lock
	--- a/include/linux/mm.h
	+++ b/include/linux/mm.h
	@@ -1174,6 +1174,7 @@ static inline void unmap_shared_mapping_

	extern void truncate_pagecache(struct inode *inode, loff_t new);
	extern void truncate_setsize(struct inode *inode, loff_t newsize);
	+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
	void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
	int truncate_inode_page(struct address_space mapping, struct page page);
	int generic_error_remove_page(struct address_space mapping, struct page page);
	--- a/mm/truncate.c
	+++ b/mm/truncate.c
	@@ -20,6 +20,7 @@
	#include <linux/buffer_head.h> /* grr. try_to_release_page,
	do_invalidatepage */
	#include <linux/cleancache.h>
	+#include <linux/rmap.h>
	#include "internal.h"

	static void clear_exceptional_entry(struct address_space *mapping,
	@@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache);
	*/
	void truncate_setsize(struct inode *inode, loff_t newsize)
	{
	+ loff_t oldsize = inode->i_size;
	+
	i_size_write(inode, newsize);
	+ if (newsize > oldsize)
	+ pagecache_isize_extended(inode, oldsize, newsize);
	truncate_pagecache(inode, newsize);
	}
	EXPORT_SYMBOL(truncate_setsize);

	/**
	+ * pagecache_isize_extended - update pagecache after extension of i_size
	+ * @inode: inode for which i_size was extended
	+ * @from: original inode size
	+ * @to: new inode size
	+ *
	+ * Handle extension of inode size either caused by extending truncate or by
	+ * write starting after current i_size. We mark the page straddling current
	+ * i_size RO so that page_mkwrite() is called on the nearest write access to
	+ * the page. This way filesystem can be sure that page_mkwrite() is called on
	+ * the page before user writes to the page via mmap after the i_size has been
	+ * changed.
	+ *
	+ * The function must be called after i_size is updated so that page fault
	+ * coming after we unlock the page will already see the new i_size.
	+ * The function must be called while we still hold i_mutex - this not only
	+ * makes sure i_size is stable but also that userspace cannot observe new
	+ * i_size value before we are prepared to store mmap writes at new inode size.
	+ */
	+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
	+{
	+ int bsize = 1 << inode->i_blkbits;
	+ loff_t rounded_from;
	+ struct page *page;
	+ pgoff_t index;
	+
	+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
	+ WARN_ON(to > inode->i_size);
	+
	+ if (from >= to \|\| bsize == PAGE_CACHE_SIZE)
	+ return;
	+ /* Page straddling @from will not have any hole block created? */
	+ rounded_from = round_up(from, bsize);
	+ if (to <= rounded_from \|\| !(rounded_from & (PAGE_CACHE_SIZE - 1)))
	+ return;
	+
	+ index = from >> PAGE_CACHE_SHIFT;
	+ page = find_lock_page(inode->i_mapping, index);
	+ /* Page not cached? Nothing to do */
	+ if (!page)
	+ return;
	+ /*
	+ * See clear_page_dirty_for_io() for details why set_page_dirty()
	+ * is needed.
	+ */
	+ if (page_mkclean(page))
	+ set_page_dirty(page);
	+ unlock_page(page);
	+ page_cache_release(page);
	+}
	+EXPORT_SYMBOL(pagecache_isize_extended);
	+
	+/**
	* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
	* @inode: inode
	* @lstart: offset of beginning of hole