| /* |
| * Copied some block library functions, to replace mark_buffer_dirty() |
| * by pagefork_for_blockdirty() and __tux3_mark_buffer_dirty(), |
| * to replace discard_buffer() by tux3_invalidate_buffer(), and add |
| * tux3_iattrdirty(). |
| * |
| * We should check the update of original functions, and sync with it. |
| */ |
| |
| #include <linux/pagevec.h> |
| #include <linux/cleancache.h> |
| |
| /* |
| * Copy of page_zero_new_buffers() |
| * (changed to call __tux3_mark_buffer_dirty()) |
| */ |
| static void tux3_page_zero_new_buffers(struct page *page, unsigned from, |
| unsigned to) |
| { |
| unsigned int block_start, block_end; |
| struct buffer_head *head, *bh; |
| |
| BUG_ON(!PageLocked(page)); |
| if (!page_has_buffers(page)) |
| return; |
| |
| bh = head = page_buffers(page); |
| block_start = 0; |
| do { |
| block_end = block_start + bh->b_size; |
| |
| if (buffer_new(bh)) { |
| if (block_end > from && block_start < to) { |
| unsigned delta = tux3_get_current_delta(); |
| |
| if (!PageUptodate(page)) { |
| unsigned start, size; |
| |
| start = max(from, block_start); |
| size = min(to, block_end) - start; |
| |
| zero_user(page, start, size); |
| set_buffer_uptodate(bh); |
| } |
| |
| clear_buffer_new(bh); |
| __tux3_mark_buffer_dirty(bh, delta); |
| } |
| } |
| |
| block_start = block_end; |
| bh = bh->b_this_page; |
| } while (bh != head); |
| } |
| |
| /* |
| * Copy of __block_write_begin() (changed to call __tux3_mark_buffer_dirty(), |
| * and to remove unmap_underlying_metadata()) |
| */ |
| static int __tux3_write_begin(struct page *page, loff_t pos, unsigned len, |
| get_block_t *get_block) |
| { |
| unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
| unsigned to = from + len; |
| struct inode *inode = page->mapping->host; |
| struct sb *sb = tux_sb(inode->i_sb); |
| unsigned block_start, block_end; |
| sector_t block; |
| int err = 0; |
| unsigned blocksize, bbits; |
	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
| |
| BUG_ON(!PageLocked(page)); |
| BUG_ON(from > PAGE_CACHE_SIZE); |
| BUG_ON(to > PAGE_CACHE_SIZE); |
| BUG_ON(from > to); |
| |
| /* Use blocksize/blockbits in sb, instead of inode->i_blkbits */ |
| blocksize = sb->blocksize; |
| bbits = sb->blockbits; |
| if (!page_has_buffers(page)) |
| create_empty_buffers(page, blocksize, 0); |
| head = page_buffers(page); |
| |
| block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); |
| |
	for (bh = head, block_start = 0; bh != head || !block_start;
	     block++, block_start = block_end, bh = bh->b_this_page) {
| block_end = block_start + blocksize; |
| if (block_end <= from || block_start >= to) { |
| if (PageUptodate(page)) { |
| if (!buffer_uptodate(bh)) |
| set_buffer_uptodate(bh); |
| } |
| continue; |
| } |
| if (buffer_new(bh)) |
| clear_buffer_new(bh); |
| if (!buffer_mapped(bh)) { |
| /* |
| * FIXME: If user overwrites block fully, we |
| * don't need get_block(). Since we know it is |
| * delayed allocation, so, we can use SEG_HOLE |
| * as delayed allocation. |
| */ |
| WARN_ON(bh->b_size != blocksize); |
| err = get_block(inode, block, bh, 1); |
| if (err) |
| break; |
| if (buffer_new(bh)) { |
| #if 0 |
| unmap_underlying_metadata(bh->b_bdev, |
| bh->b_blocknr); |
| #endif |
| if (PageUptodate(page)) { |
| /* FIXME: do we have to mark this dirty? |
| * re-think after mmap support */ |
| //clear_buffer_new(bh); |
| set_buffer_uptodate(bh); |
| //__tux3_mark_buffer_dirty(bh, delta); |
| continue; |
| } |
| if (block_end > to || block_start < from) |
| zero_user_segments(page, |
| to, block_end, |
| block_start, from); |
| continue; |
| } |
| } |
| if (PageUptodate(page)) { |
| if (!buffer_uptodate(bh)) |
| set_buffer_uptodate(bh); |
| continue; |
| } |
| if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
| !buffer_unwritten(bh) && |
| (block_start < from || block_end > to)) { |
| ll_rw_block(READ, 1, &bh); |
			*wait_bh++ = bh;
| } |
| } |
| /* |
| * If we issued read requests - let them complete. |
| */ |
	while (wait_bh > wait) {
| wait_on_buffer(*--wait_bh); |
| if (!buffer_uptodate(*wait_bh)) |
| err = -EIO; |
| } |
| if (unlikely(err)) |
| tux3_page_zero_new_buffers(page, from, to); |
| return err; |
| } |
| |
| /* |
| * Copy of block_write_begin() |
| * (Add to call pagefork_for_blockdirty() for buffer fork) |
| */ |
| static int tux3_write_begin(struct address_space *mapping, loff_t pos, |
| unsigned len, unsigned flags, |
| struct page **pagep, get_block_t *get_block, |
| int check_fork) |
| { |
| pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
| struct page *page; |
| int status; |
| |
| retry: |
| page = grab_cache_page_write_begin(mapping, index, flags); |
| if (!page) |
| return -ENOMEM; |
| |
| /* |
| * FIXME: If check_fork == 0, caller handle buffer fork. |
| * Unlike check_fork hack, we are better to provide the different |
| * blockget() implementation doesn't use tux3_write_begin(). |
| */ |
| if (check_fork) { |
| struct page *tmp; |
| |
| tmp = pagefork_for_blockdirty(page, tux3_get_current_delta()); |
| if (IS_ERR(tmp)) { |
| int err; |
| unlock_page(page); |
| page_cache_release(page); |
| |
| err = PTR_ERR(tmp); |
| if (err == -EAGAIN) |
| goto retry; |
| return err; |
| } |
| page = tmp; |
| } |
| |
| status = __tux3_write_begin(page, pos, len, get_block); |
| if (unlikely(status)) { |
| unlock_page(page); |
| page_cache_release(page); |
| page = NULL; |
| } |
| |
| *pagep = page; |
| return status; |
| } |
| |
| /* |
| * Copy of __block_commit_write() |
| * (changed to call __tux3_mark_buffer_dirty()) |
| */ |
| static int __tux3_commit_write(struct inode *inode, struct page *page, |
| unsigned from, unsigned to) |
| { |
| unsigned block_start, block_end; |
| int partial = 0; |
| unsigned blocksize; |
| struct buffer_head *bh, *head; |
| |
| bh = head = page_buffers(page); |
| blocksize = bh->b_size; |
| |
| block_start = 0; |
| do { |
| block_end = block_start + blocksize; |
| if (block_end <= from || block_start >= to) { |
| if (!buffer_uptodate(bh)) |
| partial = 1; |
| } else { |
| set_buffer_uptodate(bh); |
| __tux3_mark_buffer_dirty(bh, tux3_get_current_delta()); |
| } |
| clear_buffer_new(bh); |
| |
| block_start = block_end; |
| bh = bh->b_this_page; |
| } while (bh != head); |
| |
| /* |
| * If this is a partial write which happened to make all buffers |
| * uptodate then we can optimize away a bogus readpage() for |
| * the next read(). Here we 'discover' whether the page went |
| * uptodate as a result of this (potentially partial) write. |
| */ |
| if (!partial) |
| SetPageUptodate(page); |
| return 0; |
| } |
| |
| /* Copy of block_write_end() */ |
| static int __tux3_write_end(struct file *file, struct address_space *mapping, |
| loff_t pos, unsigned len, unsigned copied, |
| struct page *page, void *fsdata) |
| { |
| struct inode *inode = mapping->host; |
| unsigned start; |
| |
| start = pos & (PAGE_CACHE_SIZE - 1); |
| |
| if (unlikely(copied < len)) { |
| /* |
| * The buffers that were written will now be uptodate, so we |
| * don't have to worry about a readpage reading them and |
| * overwriting a partial write. However if we have encountered |
| * a short write and only partially written into a buffer, it |
| * will not be marked uptodate, so a readpage might come in and |
| * destroy our partial write. |
| * |
| * Do the simplest thing, and just treat any short write to a |
| * non uptodate page as a zero-length write, and force the |
| * caller to redo the whole thing. |
| */ |
| if (!PageUptodate(page)) |
| copied = 0; |
| |
| tux3_page_zero_new_buffers(page, start+copied, start+len); |
| } |
| flush_dcache_page(page); |
| |
| /* This could be a short (even 0-length) commit */ |
| __tux3_commit_write(inode, page, start, start+copied); |
| |
| return copied; |
| } |
| |
| /* Copy of generic_write_end() (added tux3_iattrdirty()) */ |
| static int tux3_write_end(struct file *file, struct address_space *mapping, |
| loff_t pos, unsigned len, unsigned copied, |
| struct page *page, void *fsdata) |
| { |
| struct inode *inode = mapping->host; |
| int i_size_changed = 0; |
| |
| copied = __tux3_write_end(file, mapping, pos, len, copied, page, fsdata); |
| |
| /* |
| * No need to use i_size_read() here, the i_size |
| * cannot change under us because we hold i_mutex. |
| * |
| * But it's important to update i_size while still holding page lock: |
| * page writeout could otherwise come in and zero beyond i_size. |
| */ |
| if (pos+copied > inode->i_size) { |
| tux3_iattrdirty(inode); |
| i_size_write(inode, pos+copied); |
| i_size_changed = 1; |
| } |
| |
| unlock_page(page); |
| page_cache_release(page); |
| |
| /* |
| * Don't mark the inode dirty under page lock. First, it unnecessarily |
| * makes the holding time of page lock longer. Second, it forces lock |
| * ordering of page lock and transaction start for journaling |
| * filesystems. |
| */ |
| if (i_size_changed) |
| tux3_mark_inode_dirty(inode); |
| |
| return copied; |
| } |
| |
| /* |
| * Check if we can cancel the dirty of page. This is called after |
| * clear dirty of buffers on this page. |
| * |
| * This would be called for similar purpose to tux3_invalidatepage(), |
| * but caller care to change buffer state. |
| * |
| * FIXME: this traverse buffers on page for each clear dirty |
| * buffer. We may want to clear dirty page as batch job (like |
| * ->writepages()) |
| * FIXME: cancel dirty is untested for mmap write. |
| * |
| * Caller must care locking (e.g. volmap page in backend, hold lock_page()). |
| */ |
| void tux3_try_cancel_dirty_page(struct page *page) |
| { |
| struct buffer_head *tmp, *head; |
| |
| tmp = head = page_buffers(page); |
| do { |
| if (buffer_dirty(tmp)) |
| return; |
| |
| tmp = tmp->b_this_page; |
| } while (tmp != head); |
| |
| cancel_dirty_page(page, PAGE_CACHE_SIZE); |
| } |
| |
| /* |
| * Based on block_invalidatepage(). |
| * (changed to call tux3_invalidate_buffer(), and if no dirty buffers, |
| * cancel dirty page) |
| * |
| * This invalidate the buffers on page. Then if there is no dirty |
| * buffers, cancel dirty page. |
| * |
| * FIXME: cancel dirty is untested for mmap write. |
| * |
| * Caller must hold lock_page(). |
| */ |
| static void tux3_invalidatepage(struct page *page, unsigned long offset) |
| { |
| struct buffer_head *head, *bh, *next; |
| unsigned int curr_off = 0; |
| int has_dirty = 0; |
| |
| BUG_ON(!PageLocked(page)); |
	/* If the page has no buffers, no buffers can be dirty */
| if (!page_has_buffers(page)) |
| goto out; |
| |
| head = page_buffers(page); |
| bh = head; |
| do { |
| unsigned int next_off = curr_off + bh->b_size; |
| next = bh->b_this_page; |
| |
| /* Is this block fully invalidated? */ |
| if (offset <= curr_off) |
| tux3_invalidate_buffer(bh); |
| |
| /* If buffer is dirty, don't cancel dirty page */ |
| if (buffer_dirty(bh)) |
| has_dirty = 1; |
| |
| curr_off = next_off; |
| bh = next; |
| } while (bh != head); |
| |
| if (!has_dirty) |
| cancel_dirty_page(page, PAGE_CACHE_SIZE - offset); |
| |
| /* |
| * We release buffers only if the entire page is being invalidated. |
| * The get_block cached value has been unconditionally invalidated, |
| * so real IO is not possible anymore. |
| */ |
| if (offset == 0) |
| try_to_release_page(page, 0); |
| out: |
| return; |
| } |
| |
| /* |
| * Based on block_truncate_page() |
| * (changed to call pagefork_for_blockdirty() and __tux3_mark_buffer_dirty()()) |
| * |
| * This fills zero for whole page, and checks if buffer can be truncated. |
| * Then invalidate buffers if it is needed. |
| * |
| * Even if truncate was block boundary, we may have to fork page. If |
| * the buffers are dirtied for past delta, we can't truncate, so this |
| * forks buffer in that case. |
| */ |
| static int __tux3_truncate_partial_block(struct address_space *mapping, |
| loff_t from, get_block_t *get_block) |
| { |
| struct inode *inode = mapping->host; |
| struct sb *sb = tux_sb(inode->i_sb); |
| unsigned delta = tux3_get_current_delta(); |
| pgoff_t index = from >> PAGE_CACHE_SHIFT; |
| unsigned offset = from & (PAGE_CACHE_SIZE - 1); |
| sector_t iblock; |
| unsigned pos, invalid_from; |
| struct page *page, *tmp; |
| struct buffer_head *bh = NULL; |
| int err, forked; |
| |
| /* Page boundary? */ |
| if (!offset) |
| return 0; |
| |
| iblock = from >> sb->blockbits; |
| pos = offset >> sb->blockbits; |
| invalid_from = offset; |
| |
| /* |
| * Block boundary? Make sure the buffers can be truncated. |
| */ |
| if (!(offset & sb->blockmask)) { |
| /* |
| * If there is dirty buffers outside i_size, we have |
| * to zero fill those. To do it, we need buffer fork |
| * to make stable page on in-flight delta. |
| * |
| * NOTE: Zeroed buffers are not needed to be written |
| * though, we have to provide the data on page for |
| * frontend until data on forked page is available via |
| * dtree. So, this dirty the buffer to pin the page. |
| * |
| * FIXME: This dirty buffer outside i_size is not |
| * needed to be written, if buffer is outside i_size, |
| * buffer is not written. Although if buffer became |
| * inside i_size on this delta, zeroed buffer can be |
| * written out. This is unnecessary writeout. |
| * |
| * FIXME: If we didn't need buffer fork, we don't need |
| * to dirty buffer. |
| */ |
| retry_find: |
| page = find_lock_page(mapping, index); |
| if (page) { |
| tmp = pagefork_for_blockdirty(page, delta); |
| if (IS_ERR(tmp)) { |
| unlock_page(page); |
| page_cache_release(page); |
| |
| err = PTR_ERR(tmp); |
| if (err == -EAGAIN) |
| goto retry_find; |
| goto out; |
| } |
| forked = tmp != page; |
| page = tmp; |
| |
| dirty_buffer_outside: |
			/* If there was no buffer fork, we don't need to
			 * pin the page. */
			/* FIXME: the page might have been forked by a
			 * previous truncate, so dirty unconditionally. */
			forked = 1;
			if (forked && page_has_buffers(page)) {
| /* Dirty outside i_size to pin the page */ |
| bh = __get_buffer(page, pos); |
| __tux3_mark_buffer_dirty(bh, delta); |
| |
| invalid_from = (pos + 1) << sb->blockbits; |
| invalid_from &= PAGE_CACHE_SIZE - 1; |
| } |
| |
| goto zero_fill_page; |
| } |
| |
| /* No page, do nothing */ |
| return 0; |
| } |
| |
| retry_grab: |
| page = grab_cache_page(mapping, index); |
| err = -ENOMEM; |
| if (!page) |
| goto out; |
| |
| tmp = pagefork_for_blockdirty(page, delta); |
| if (IS_ERR(tmp)) { |
| unlock_page(page); |
| page_cache_release(page); |
| |
| err = PTR_ERR(tmp); |
| if (err == -EAGAIN) |
| goto retry_grab; |
| goto out; |
| } |
| forked = tmp != page; |
| page = tmp; |
| |
| if (!page_has_buffers(page)) |
| create_empty_buffers(page, sb->blocksize, 0); |
| |
| /* Find the buffer that contains "offset" */ |
| bh = __get_buffer(page, pos); |
| |
| err = 0; |
| /* |
| * FIXME: If this buffer is dirty, we would not need to call |
| * get_block()? |
| */ |
| if (!buffer_mapped(bh)) { |
| WARN_ON(bh->b_size != sb->blocksize); |
| err = get_block(inode, iblock, bh, 0); |
| if (err) |
| goto unlock; |
| /* unmapped? It's a hole - nothing to do */ |
| if (!buffer_mapped(bh)) { |
| /* |
| * If this is hole and partial truncate is not |
| * last block on the page, we have to check |
| * whether the page needs buffer fork or not. |
| */ |
| if (pos + 1 < PAGE_CACHE_SIZE >> sb->blockbits) { |
| pos++; |
| goto dirty_buffer_outside; |
| } |
| goto unlock; |
| } |
| } |
| |
| /* Ok, it's mapped. Make sure it's up-to-date */ |
| if (PageUptodate(page)) |
| set_buffer_uptodate(bh); |
| |
| if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { |
| err = -EIO; |
| ll_rw_block(READ, 1, &bh); |
| wait_on_buffer(bh); |
| /* Uhhuh. Read error. Complain and punt. */ |
| if (!buffer_uptodate(bh)) |
| goto unlock; |
| } |
| |
| __tux3_mark_buffer_dirty(bh, delta); |
| /* |
| * FIXME: If we did buffer fork, the other buffers should be |
| * clean, so we don't need to invalidate buffers outside |
| * i_size. |
| */ |
| |
| zero_fill_page: |
| zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
| cleancache_invalidate_page(mapping, page); |
| if (invalid_from && page_has_buffers(page)) |
| mapping->a_ops->invalidatepage(page, invalid_from); |
| |
| err = 0; |
| |
| unlock: |
| unlock_page(page); |
| page_cache_release(page); |
| out: |
| return err; |
| } |
| |
/*
 * Truncate the partial block. If newsize is not block-aligned, we have
 * to update the last block.
 */
| int tux3_truncate_partial_block(struct inode *inode, loff_t newsize) |
| { |
| return __tux3_truncate_partial_block(inode->i_mapping, newsize, |
| tux3_get_block); |
| } |
| |
| /* |
| * Copy of truncate_inode_pages_range() |
| * |
| * Changes: |
| * - to call bufferfork_to_invalidate() before invalidate buffers |
| * - remove to wait the page under I/O (we do buffer fork instead) |
| * |
| * FIXME: some functions are not exported to implement own |
| * truncate_inode_pages_page() fully. So this just do the buffer fork, |
| * without invalidate. This way is inefficient, and we would want to merge |
| * tux3_truncate_inode_pages_page() and truncate_inode_pages_range(). |
| */ |
| void tux3_truncate_inode_pages_range(struct address_space *mapping, |
| loff_t lstart, loff_t lend) |
| { |
| const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
| #if 0 |
| const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
| #endif |
| struct pagevec pvec; |
| pgoff_t index; |
| pgoff_t end; |
| int i; |
| |
| #if 0 /* FIXME */ |
| cleancache_invalidate_inode(mapping); |
| #endif |
| if (mapping->nrpages == 0) |
| return; |
| |
| BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); |
| end = (lend >> PAGE_CACHE_SHIFT); |
| |
| pagevec_init(&pvec, 0); |
| index = start; |
| while (index <= end && pagevec_lookup(&pvec, mapping, index, |
| min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
| #if 0 /* FIXME */ |
| mem_cgroup_uncharge_start(); |
| #endif |
| for (i = 0; i < pagevec_count(&pvec); i++) { |
| struct page *page = pvec.pages[i]; |
| |
| /* We rely upon deletion not changing page->index */ |
| index = page->index; |
| if (index > end) |
| break; |
| |
| if (!trylock_page(page)) |
| continue; |
| WARN_ON(page->index != index); |
| #if 0 |
| if (PageWriteback(page)) { |
| unlock_page(page); |
| continue; |
| } |
| #endif |
| bufferfork_to_invalidate(mapping, page); |
| unlock_page(page); |
| } |
| pagevec_release(&pvec); |
| #if 0 /* FIXME */ |
| mem_cgroup_uncharge_end(); |
| #endif |
| cond_resched(); |
| index++; |
| } |
| #if 0 |
	/* The partial page is handled in tux3_truncate_page() */
| if (partial) { |
| struct page *page = find_lock_page(mapping, start - 1); |
| if (page) { |
| wait_on_page_writeback(page); |
| tux3_truncate_partial_page(page, partial); |
| unlock_page(page); |
| page_cache_release(page); |
| } |
| } |
| #endif |
| index = start; |
| for ( ; ; ) { |
| cond_resched(); |
| if (!pagevec_lookup(&pvec, mapping, index, |
| min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
| #if 0 |
| if (index == start) |
| break; |
| index = start; |
| continue; |
| #else |
| /* |
| * We leave the pages as is if it can be invalidated. |
| * And we don't need check the same page repeatedly. |
| */ |
| break; |
| #endif |
| } |
| if (index == start && pvec.pages[0]->index > end) { |
| pagevec_release(&pvec); |
| break; |
| } |
| #if 0 /* FIXME */ |
| mem_cgroup_uncharge_start(); |
| #endif |
| for (i = 0; i < pagevec_count(&pvec); i++) { |
| struct page *page = pvec.pages[i]; |
| |
| /* We rely upon deletion not changing page->index */ |
| index = page->index; |
| if (index > end) |
| break; |
| |
| lock_page(page); |
| WARN_ON(page->index != index); |
| #if 0 |
| wait_on_page_writeback(page); |
| #endif |
| bufferfork_to_invalidate(mapping, page); |
| unlock_page(page); |
| } |
| pagevec_release(&pvec); |
| #if 0 /* FIXME */ |
| mem_cgroup_uncharge_end(); |
| #endif |
| index++; |
| } |
| #if 0 /* FIXME */ |
| cleancache_invalidate_inode(mapping); |
| #endif |
| } |