| /* |
| * Write back buffers |
| */ |
| |
| #include "buffer_writebacklib.c" |
| |
| /* |
| * Helper for waiting I/O |
| */ |
| |
| static void iowait_inflight_inc(struct iowait *iowait) |
| { |
| atomic_inc(&iowait->inflight); |
| } |
| |
| static void iowait_inflight_dec(struct iowait *iowait) |
| { |
| if (atomic_dec_and_test(&iowait->inflight)) |
| complete(&iowait->done); |
| } |
| |
| void tux3_iowait_init(struct iowait *iowait) |
| { |
| /* |
| * Grab 1 to prevent the partial complete until all I/O is |
| * submitted |
| */ |
| init_completion(&iowait->done); |
| atomic_set(&iowait->inflight, 1); |
| } |
| |
| void tux3_iowait_wait(struct iowait *iowait) |
| { |
	/* All I/O has been submitted; release the initial count, then wait */
| iowait_inflight_dec(iowait); |
| wait_for_completion(&iowait->done); |
| } |
| |
| /* |
| * Helper for buffer vector I/O. |
| */ |
| |
| static inline struct buffer_head *buffers_entry(struct list_head *x) |
| { |
| return list_entry(x, struct buffer_head, b_assoc_buffers); |
| } |
| |
| #define MAX_BUFVEC_COUNT UINT_MAX |
| |
| /* Initialize bufvec */ |
| static void bufvec_init(struct bufvec *bufvec, struct address_space *mapping, |
| struct list_head *head, struct tux3_iattr_data *idata) |
| { |
| INIT_LIST_HEAD(&bufvec->contig); |
| bufvec->buffers = head; |
| bufvec->contig_count = 0; |
| bufvec->idata = idata; |
| bufvec->mapping = mapping; |
| bufvec->on_page_idx = 0; |
| bufvec->bio = NULL; |
| bufvec->bio_lastbuf = NULL; |
| } |
| |
| static void bufvec_free(struct bufvec *bufvec) |
| { |
	/* FIXME: this can happen on the error path */
| assert(!bufvec->buffers || list_empty(bufvec->buffers)); |
| assert(list_empty(&bufvec->contig)); |
| assert(bufvec->bio == NULL); |
| } |
| |
| static inline void bufvec_buffer_move_to_contig(struct bufvec *bufvec, |
| struct buffer_head *buffer) |
| { |
| /* |
| * This is called by backend, it means buffer state should be |
| * stable. So, we don't need lock for buffer state list |
| * (->b_assoc_buffers). |
| * |
| * FIXME: above is true? |
| */ |
| list_move_tail(&buffer->b_assoc_buffers, &bufvec->contig); |
| bufvec->contig_count++; |
| } |
| |
| /* |
| * Special purpose single pointer list (FIFO order) for buffers on bio |
| */ |
| static void bufvec_bio_add_buffer(struct bufvec *bufvec, |
| struct buffer_head *new) |
| { |
| new->b_private = NULL; |
| |
| if (bufvec->bio_lastbuf) |
| bufvec->bio_lastbuf->b_private = new; |
| else |
| bufvec->bio->bi_private = new; |
| |
| bufvec->bio_lastbuf = new; |
| } |
| |
| static struct buffer_head *bufvec_bio_del_buffer(struct bio *bio) |
| { |
| struct buffer_head *buffer = bio->bi_private; |
| |
| if (buffer) { |
| bio->bi_private = buffer->b_private; |
| buffer->b_private = NULL; |
| } |
| |
| return buffer; |
| } |
| |
| static struct address_space *bufvec_bio_mapping(struct bio *bio) |
| { |
| struct buffer_head *buffer = bio->bi_private; |
| assert(buffer); |
| /* FIXME: we want to remove usage of b_assoc_map */ |
| return buffer->b_assoc_map; |
| } |
| |
| static struct bio *bufvec_bio_alloc(struct sb *sb, unsigned int count, |
| block_t physical, |
| void (*end_io)(struct bio *, int)) |
| { |
| gfp_t gfp_flags = GFP_NOFS; |
| struct bio *bio; |
| |
| count = min_t(unsigned int, count, bio_get_nr_vecs(vfs_sb(sb)->s_bdev)); |
| |
| bio = bio_alloc(gfp_flags, count); |
| /* This retry is from mpage_alloc() */ |
| if (bio == NULL && (current->flags & PF_MEMALLOC)) { |
| while (!bio && (count /= 2)) |
| bio = bio_alloc(gfp_flags, count); |
| } |
| assert(bio); /* GFP_NOFS shouldn't fail to allocate */ |
| |
| bio->bi_bdev = vfs_sb(sb)->s_bdev; |
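	/*
	 * Convert filesystem block to 512-byte sectors: e.g. with 4KB
	 * blocks (blockbits == 12), block N starts at sector N << 3.
	 */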
| bio->bi_sector = physical << (sb->blockbits - 9); |
| bio->bi_end_io = end_io; |
| |
| return bio; |
| } |
| |
| static void bufvec_submit_bio(int rw, struct bufvec *bufvec) |
| { |
| struct sb *sb = tux_sb(bufvec_inode(bufvec)->i_sb); |
| struct bio *bio = bufvec->bio; |
| |
| bufvec->bio = NULL; |
| bufvec->bio_lastbuf = NULL; |
| |
| trace("bio %p, physical %Lu, count %u", bio, |
| (block_t)bio->bi_sector >> (sb->blockbits - 9), |
| bio->bi_size >> sb->blockbits); |
| |
| iowait_inflight_inc(sb->iowait); |
| submit_bio(rw, bio); |
| } |
| |
| /* |
| * We flush all buffers on this page? |
| * |
| * The page may have the dirty buffer for both of "delta" and |
| * "unify", and we may flush only dirty buffers for "delta". So, if |
| * the page still has the dirty buffer, we should still keep the page |
| * dirty for "unify". |
| */ |
| static int keep_page_dirty(struct bufvec *bufvec, struct page *page) |
| { |
| struct buffer_head *first = page_buffers(page); |
| struct inode *inode = bufvec_inode(bufvec); |
| |
| if (tux_inode(inode)->inum == TUX_VOLMAP_INO) { |
| struct buffer_head *tmp = first; |
| unsigned count = 0; |
| do { |
| if (buffer_dirty(tmp)) { |
| count++; |
| /* dirty buffers > flushing buffers? */ |
| if (count > bufvec->on_page_idx) |
| return 1; |
| } |
| tmp = tmp->b_this_page; |
| } while (tmp != first); |
| } |
| |
| return 0; |
| } |
| |
/* Prepare and lock the page for I/O */
| static void |
| bufvec_prepare_and_lock_page(struct bufvec *bufvec, struct page *page) |
| { |
| struct tux3_iattr_data *idata = bufvec->idata; |
| pgoff_t last_index; |
| unsigned offset; |
| int old_flag, old_writeback; |
| |
| lock_page(page); |
| assert(PageDirty(page)); |
| assert(!PageWriteback(page)); |
| |
| /* |
| * Set "writeback" flag before clearing "dirty" flag, so, page |
| * presents either of "dirty" or "writeback" flag. With this, |
| * free_forked_buffers() can check page flags without locking |
| * page. See FIXME of forked_buffers(). |
| * |
| * And writeback flag prevents vmscan releases page. |
| */ |
| old_writeback = TestSetPageWriteback(page); |
| assert(!old_writeback); |
| |
| /* |
| * NOTE: This has the race if there is concurrent mark |
| * dirty. But we shouldn't use concurrent dirty [B] on volmap. |
| * |
| * [ A ] [ B ] |
| * if (!keep_page_dirty()) |
| * mark_buffer_dirty() |
| * TestSetPageDirty() |
| * // this lost dirty of [B] |
| * clear_dirty_for_io() |
| */ |
| if (!keep_page_dirty(bufvec, page)) { |
| old_flag = tux3_clear_page_dirty_for_io(page); |
| assert(old_flag); |
| } |
| |
| /* |
| * This fixes incoherency of page accounting and radix-tree |
| * tag by above change of dirty and writeback. |
| * |
| * NOTE: This is assuming to be called after clearing dirty |
| * (See comment of tux3_clear_page_dirty_for_io()). |
| */ |
| __tux3_test_set_page_writeback(page, old_writeback); |
| |
| /* |
	 * Zero-fill the part of the page beyond i_size for mmap, after
	 * clearing dirty.
| * |
| * The page straddles i_size. It must be zeroed out on each and every |
| * writepage invocation because it may be mmapped. "A file is mapped |
| * in multiples of the page size. For a file that is not a multiple of |
| * the page size, the remaining memory is zeroed when mapped, and |
| * writes to that region are not written out to the file." |
| */ |
| offset = idata->i_size & (PAGE_CACHE_SIZE - 1); |
| last_index = idata->i_size >> PAGE_CACHE_SHIFT; |
| if (offset && last_index == page->index) |
| zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
| } |
| |
| static void bufvec_prepare_and_unlock_page(struct page *page) |
| { |
| unlock_page(page); |
| } |
| |
| /* Completion of page for I/O */ |
| static void bufvec_page_end_io(struct page *page, int uptodate, int quiet) |
| { |
| end_page_writeback(page); |
| tux3_accout_clear_writeback(page); |
| } |
| |
| /* Completion of buffer for I/O */ |
| static void bufvec_buffer_end_io(struct buffer_head *buffer, int uptodate, |
| int quiet) |
| { |
| char b[BDEVNAME_SIZE]; |
| |
| if (uptodate) |
| set_buffer_uptodate(buffer); |
| else { |
| if (!quiet) { |
| printk(KERN_WARNING "lost page write due to " |
| "I/O error on %s\n", |
| bdevname(buffer->b_bdev, b)); |
| } |
| set_buffer_write_io_error(buffer); |
| clear_buffer_uptodate(buffer); |
| } |
| } |
| |
/*
 * Check whether the buffers collected on the page form multiple
 * (non-contiguous) ranges.
 */
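/*
 * Example: on-page entries (logical, physical) = (8, 100), (9, 101),
 * (10, 200) break physical contiguity at the third entry, so this
 * returns 1 and the page is submitted as multiple ranges.
 */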
| static int bufvec_is_multiple_ranges(struct bufvec *bufvec) |
| { |
| block_t logical, physical; |
| unsigned int i; |
| |
| logical = bufindex(bufvec->on_page[0].buffer); |
| physical = bufvec->on_page[0].block; |
| for (i = 1; i < bufvec->on_page_idx; i++) { |
| if (logical + i != bufindex(bufvec->on_page[i].buffer) || |
| physical + i != bufvec->on_page[i].block) { |
| return 1; |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * BIO completion for complex case. There are multiple ranges on the |
| * page, and those are submitted BIO for each range. So, completion of |
| * the page is only if all BIOs are done. |
| */ |
| static void bufvec_end_io_multiple(struct bio *bio, int err) |
| { |
| const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| const int quiet = test_bit(BIO_QUIET, &bio->bi_flags); |
| struct address_space *mapping; |
| struct page *page; |
| struct buffer_head *buffer, *first, *tmp; |
| unsigned long flags; |
| |
| trace("bio %p, err %d", bio, err); |
| |
	/* FIXME: is the inode still guaranteed to be available? */
| mapping = bufvec_bio_mapping(bio); |
| |
| buffer = bufvec_bio_del_buffer(bio); |
| page = buffer->b_page; |
| first = page_buffers(page); |
| |
| trace("buffer %p", buffer); |
| tux3_clear_buffer_dirty_for_io_hack(buffer); |
| bufvec_buffer_end_io(buffer, uptodate, quiet); |
| put_bh(buffer); |
| |
| iowait_inflight_dec(tux_sb(mapping->host->i_sb)->iowait); |
| bio_put(bio); |
| |
	/* Check the buffers on the page. If all were done, clear writeback */
| local_irq_save(flags); |
| bit_spin_lock(BH_Uptodate_Lock, &first->b_state); |
| |
| clear_buffer_async_write(buffer); |
| tmp = buffer->b_this_page; |
| while (tmp != buffer) { |
| if (buffer_async_write(tmp)) |
| goto still_busy; |
| tmp = tmp->b_this_page; |
| } |
| bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
| local_irq_restore(flags); |
| |
| bufvec_page_end_io(page, uptodate, quiet); |
| return; |
| |
| still_busy: |
| bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
| local_irq_restore(flags); |
| } |
| |
| /* |
| * This page across on multiple ranges. |
| * |
| * To handle I/O completion properly, this sets "buffer_async_write" |
| * to all buffers, then submits buffers with own bio. And on end_io, |
| * we check if "buffer_async_write" of all buffers was cleared. |
| * |
| * FIXME: Some buffers on the page can be contiguous, we can submit |
| * those as one bio if contiguous. |
| */ |
| static void bufvec_bio_add_multiple(int rw, struct bufvec *bufvec) |
| { |
	/* FIXME: is the inode still guaranteed to be available? */
| struct sb *sb = tux_sb(bufvec_inode(bufvec)->i_sb); |
| struct page *page; |
| unsigned int i; |
| |
	/* If there is a pending bio, submit it first */
| if (bufvec->bio) |
| bufvec_submit_bio(rw, bufvec); |
| |
| page = bufvec->on_page[0].buffer->b_page; |
| |
| /* Prepare the page and buffers on the page for I/O */ |
| bufvec_prepare_and_lock_page(bufvec, page); |
| /* Set buffer_async_write to all buffers at first, then submit */ |
| for (i = 0; i < bufvec->on_page_idx; i++) { |
| struct buffer_head *buffer = bufvec->on_page[i].buffer; |
| block_t physical = bufvec->on_page[i].block; |
| get_bh(buffer); |
| tux3_clear_buffer_dirty_for_io(buffer, sb, physical); |
		/* Buffer locking order for I/O is from lower index to
		 * higher index, grouped by inode. FIXME: is this sane? */
		/* lock_buffer(buffer); FIXME: needed? */
| set_buffer_async_write(buffer); |
| } |
| |
| for (i = 0; i < bufvec->on_page_idx; i++) { |
| struct buffer_head *buffer = bufvec->on_page[i].buffer; |
| block_t physical = bufvec->on_page[i].block; |
| unsigned int length = bufsize(buffer); |
| unsigned int offset = bh_offset(buffer); |
| |
| bufvec->bio = bufvec_bio_alloc(sb, 1, physical, |
| bufvec_end_io_multiple); |
| |
| trace("page %p, index %Lu, physical %Lu, length %u, offset %u", |
| page, bufindex(bufvec->on_page[i].buffer), physical, |
| length, offset); |
| if (!bio_add_page(bufvec->bio, page, length, offset)) |
| assert(0); /* why? */ |
| |
| bufvec_bio_add_buffer(bufvec, buffer); |
| |
| bufvec_submit_bio(rw, bufvec); |
| } |
| bufvec_prepare_and_unlock_page(page); |
| |
| bufvec->on_page_idx = 0; |
| } |
| |
| /* |
| * bio completion for bufvec based I/O |
| */ |
| static void bufvec_end_io(struct bio *bio, int err) |
| { |
| const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| const int quiet = test_bit(BIO_QUIET, &bio->bi_flags); |
| struct address_space *mapping; |
| struct page *page, *last_page; |
| |
| trace("bio %p, err %d", bio, err); |
| |
	/* FIXME: is the inode still guaranteed to be available? */
| mapping = bufvec_bio_mapping(bio); |
| |
	/* Remove each buffer from the bio, then complete it */
| last_page = NULL; |
| while (1) { |
| struct buffer_head *buffer = bufvec_bio_del_buffer(bio); |
| if (!buffer) |
| break; |
| |
| page = buffer->b_page; |
| |
| trace("buffer %p", buffer); |
| tux3_clear_buffer_dirty_for_io_hack(buffer); |
| put_bh(buffer); |
| |
| if (page != last_page) { |
| bufvec_page_end_io(page, uptodate, quiet); |
| last_page = page; |
| } |
| } |
| |
| iowait_inflight_dec(tux_sb(mapping->host->i_sb)->iowait); |
| bio_put(bio); |
| } |
| |
| /* |
| * Try to add buffers on a page to bio. If it was failed, we submit |
| * bio, then add buffers on new bio. |
| * |
| * FIXME: We can free buffers early, and avoid to use buffers in I/O |
| * completion, after prepared the page (like __mpage_writepage). |
| */ |
| static void bufvec_bio_add_page(int rw, struct bufvec *bufvec) |
| { |
	/* FIXME: is the inode still guaranteed to be available? */
| struct sb *sb = tux_sb(bufvec_inode(bufvec)->i_sb); |
| struct page *page; |
| block_t physical; |
| unsigned int i, length, offset; |
| |
| page = bufvec->on_page[0].buffer->b_page; |
| physical = bufvec->on_page[0].block; |
| offset = bh_offset(bufvec->on_page[0].buffer); |
| length = bufvec->on_page_idx << sb->blockbits; |
| |
| trace("page %p, index %Lu, physical %Lu, length %u, offset %u", |
| page, bufindex(bufvec->on_page[0].buffer), physical, |
| length, offset); |
| |
	/* Try to add the buffers to the existing bio */
	if (!bufvec->bio || !bio_add_page(bufvec->bio, page, length, offset)) {
		/* Couldn't add; submit the old bio and allocate a new one */
| if (bufvec->bio) |
| bufvec_submit_bio(rw, bufvec); |
| |
| bufvec->bio = |
| bufvec_bio_alloc(sb, bufvec_contig_count(bufvec) + 1, |
| physical, bufvec_end_io); |
| |
| if (!bio_add_page(bufvec->bio, page, length, offset)) |
| assert(0); /* why? */ |
| } |
| |
	/* Prepare the page and the buffers on the page for I/O */
| bufvec_prepare_and_lock_page(bufvec, page); |
| for (i = 0; i < bufvec->on_page_idx; i++) { |
| struct buffer_head *buffer = bufvec->on_page[i].buffer; |
| block_t physical = bufvec->on_page[i].block; |
| get_bh(buffer); |
| tux3_clear_buffer_dirty_for_io(buffer, sb, physical); |
| bufvec_bio_add_buffer(bufvec, buffer); |
| } |
| bufvec_prepare_and_unlock_page(page); |
| |
| bufvec->on_page_idx = 0; |
| } |
| |
| /* Check whether "physical" is contiguous with bio */ |
| static int bufvec_bio_is_contiguous(struct bufvec *bufvec, block_t physical) |
| { |
| struct sb *sb = tux_sb(bufvec_inode(bufvec)->i_sb); |
| struct bio *bio = bufvec->bio; |
| block_t next; |
| |
| next = (block_t)bio->bi_sector + (bio->bi_size >> 9); |
| return next == (physical << (sb->blockbits - 9)); |
| } |
| |
| /* Get the page of next candidate buffer. */ |
| static struct page *bufvec_next_buffer_page(struct bufvec *bufvec) |
| { |
| if (!list_empty(&bufvec->contig)) |
| return bufvec_contig_buf(bufvec)->b_page; |
| |
| if (bufvec->buffers && !list_empty(bufvec->buffers)) |
| return buffers_entry(bufvec->buffers->next)->b_page; |
| |
| return NULL; |
| } |
| |
| /* |
| * Prepare and submit I/O for specified range. |
| * |
| * This submits the contiguous range at once as much as possible. |
| * |
| * But if the page across on multiple ranges, we can't know when all |
| * I/O was done on the page (and when we can clear the writeback flag). |
| * So, we use different strategy. Those ranges are submitted as |
| * multiple BIOs, and use BH_Update_Lock for exclusive check if I/O was |
| * done. |
| * |
| * This doesn't guarantee all candidate buffers are submitted. E.g. if |
| * the page across on multiple ranges, the page will be pending until |
| * all physical addresses was specified. |
| * |
| * return value: |
| * < 0 - error |
| * 0 - success |
| */ |
| int bufvec_io(int rw, struct bufvec *bufvec, block_t physical, unsigned count) |
| { |
| unsigned int i; |
| int need_check = 0; |
| |
| trace("index %Lu, contig_count %u, physical %Lu, count %u", |
| bufvec_contig_index(bufvec), bufvec_contig_count(bufvec), |
| physical, count); |
| |
| assert(rw & WRITE); /* FIXME: now only support WRITE */ |
| assert(bufvec_contig_count(bufvec) >= count); |
| |
| if (bufvec->on_page_idx) { |
| /* |
| * If there is the pending buffers on the page, and buffers |
| * was not contiguous, this is the complex case. |
| */ |
| need_check = 1; |
| } else if (bufvec->bio && !bufvec_bio_is_contiguous(bufvec, physical)) { |
| /* |
| * If new range is not contiguous with the pending bio, |
| * submit the pending bio. |
| */ |
| bufvec_submit_bio(rw, bufvec); |
| } |
| |
| /* Add buffers to bio for each page */ |
| for (i = 0; i < count; i++) { |
| struct buffer_head *buffer = bufvec_contig_buf(bufvec); |
| |
| /* FIXME: need lock? (buffer is already owned by backend...) */ |
| bufvec->contig_count--; |
| list_del_init(&buffer->b_assoc_buffers); |
| |
| /* Collect buffers on the same page */ |
| bufvec->on_page[bufvec->on_page_idx].buffer = buffer; |
| bufvec->on_page[bufvec->on_page_idx].block = physical + i; |
| bufvec->on_page_idx++; |
| |
		/* If the next buffer isn't on the same page, add to a bio */
| if (buffer->b_page != bufvec_next_buffer_page(bufvec)) { |
| int multiple = 0; |
| if (need_check) { |
| need_check = 0; |
| multiple = bufvec_is_multiple_ranges(bufvec); |
| } |
| |
| if (multiple) |
| bufvec_bio_add_multiple(rw, bufvec); |
| else |
| bufvec_bio_add_page(rw, bufvec); |
| } |
| } |
| |
	/* If there are no more buffers, submit the pending bio */
| if (bufvec->bio && !bufvec_next_buffer_page(bufvec)) |
| bufvec_submit_bio(rw, bufvec); |
| |
| return 0; |
| } |
| |
| static void bufvec_cancel_and_unlock_page(struct page *page, |
| const pgoff_t outside_index) |
| { |
| /* |
| * If page is fully outside i_size, cancel dirty. |
| * |
| * If page is partially outside i_size, we have to check |
| * buffers. If all buffers aren't dirty, cancel dirty. |
| */ |
| if (page->index < outside_index) |
| tux3_try_cancel_dirty_page(page); |
| else |
| cancel_dirty_page(page, PAGE_CACHE_SIZE); |
| |
| unlock_page(page); |
| } |
| |
| /* Cancel dirty buffers fully outside i_size */ |
| static void bufvec_cancel_dirty_outside(struct bufvec *bufvec) |
| { |
| struct sb *sb = tux_sb(bufvec_inode(bufvec)->i_sb); |
| struct tux3_iattr_data *idata = bufvec->idata; |
	struct page *page;
| struct buffer_head *buffer; |
| pgoff_t outside_index; |
| |
| outside_index = (idata->i_size+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
| |
| buffer = buffers_entry(bufvec->buffers->next); |
	page = buffer->b_page;
| lock_page(page); |
| while (1) { |
| trace("cancel dirty: buffer %p, block %Lu", |
| buffer, bufindex(buffer)); |
| |
		/* Cancel the buffer's dirty state outside i_size */
| list_del_init(&buffer->b_assoc_buffers); |
| tux3_clear_buffer_dirty_for_io(buffer, sb, 0); |
| tux3_clear_buffer_dirty_for_io_hack(buffer); |
| |
| if (list_empty(bufvec->buffers)) |
| break; |
| |
| buffer = buffers_entry(bufvec->buffers->next); |
| if (buffer->b_page != prev_page) { |
| bufvec_cancel_and_unlock_page(page, outside_index); |
| |
| prev_page = page; |
| page = buffer->b_page; |
| lock_page(page); |
| } |
| } |
| bufvec_cancel_and_unlock_page(page, outside_index); |
| } |
| |
| /* |
| * Try to add buffer to bufvec as contiguous range. |
| * |
| * return value: |
| * 1 - success |
| * 0 - fail to add |
| */ |
| int bufvec_contig_add(struct bufvec *bufvec, struct buffer_head *buffer) |
| { |
| unsigned contig_count = bufvec_contig_count(bufvec); |
| |
| if (contig_count) { |
| block_t last; |
| |
| /* Check contig_count limit */ |
| if (bufvec_contig_count(bufvec) == MAX_BUFVEC_COUNT) |
| return 0; |
| |
| /* Check if buffer is logically contiguous */ |
| last = bufvec_contig_last_index(bufvec); |
| if (last != bufindex(buffer) - 1) |
| return 0; |
| } |
| |
| bufvec_buffer_move_to_contig(bufvec, buffer); |
| |
| return 1; |
| } |
| |
| /* |
| * Try to collect logically contiguous dirty range from bufvec->buffers. |
| * |
| * return value: |
| * 1 - there is buffers for I/O |
| * 0 - no buffers for I/O |
| */ |
| static int bufvec_contig_collect(struct bufvec *bufvec) |
| { |
| struct sb *sb = tux_sb(bufvec_inode(bufvec)->i_sb); |
| struct tux3_iattr_data *idata = bufvec->idata; |
| struct buffer_head *buffer; |
| block_t last_index, next_index, outside_block; |
| |
	/* If there is an in-progress contiguous range, leave it as is */
| if (bufvec_contig_count(bufvec)) |
| return 1; |
| assert(!list_empty(bufvec->buffers)); |
| |
| outside_block = (idata->i_size + sb->blockmask) >> sb->blockbits; |
| |
| buffer = buffers_entry(bufvec->buffers->next); |
| next_index = bufindex(buffer); |
| /* If next buffer is fully outside i_size, clear dirty */ |
| if (next_index >= outside_block) { |
| bufvec_cancel_dirty_outside(bufvec); |
| return 0; |
| } |
| |
| do { |
| /* Check contig_count limit */ |
| if (bufvec_contig_count(bufvec) == MAX_BUFVEC_COUNT) |
| break; |
| bufvec_buffer_move_to_contig(bufvec, buffer); |
| trace("buffer %p", buffer); |
| |
| if (list_empty(bufvec->buffers)) |
| break; |
| |
| buffer = buffers_entry(bufvec->buffers->next); |
| last_index = next_index; |
| next_index = bufindex(buffer); |
| |
| /* If next buffer is fully outside i_size, clear dirty */ |
| if (next_index >= outside_block) { |
| bufvec_cancel_dirty_outside(bufvec); |
| break; |
| } |
| } while (last_index == next_index - 1); |
| |
| return !!bufvec_contig_count(bufvec); |
| } |
| |
| static int buffer_index_cmp(void *priv, struct list_head *a, |
| struct list_head *b) |
| { |
| struct buffer_head *buf_a, *buf_b; |
| |
| buf_a = list_entry(a, struct buffer_head, b_assoc_buffers); |
| buf_b = list_entry(b, struct buffer_head, b_assoc_buffers); |
| |
| /* |
| * Optimized version of the following: |
| * |
| * if (bufindex(buf_a) < bufindex(buf_b)) |
| * return -1; |
| * else if (bufindex(buf_a) > bufindex(buf_b)) |
| * return 1; |
| */ |
| if (buf_a->b_page->index < buf_b->b_page->index) |
| return -1; |
| else if (buf_a->b_page->index > buf_b->b_page->index) |
| return 1; |
| else { |
		/* page_offset() is the same; compare the offset within the page */
| if (buf_a->b_data < buf_b->b_data) |
| return -1; |
| if (buf_a->b_data > buf_b->b_data) |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Flush buffers in head |
| */ |
| int flush_list(struct inode *inode, struct tux3_iattr_data *idata, |
| struct list_head *head, int req_flag) |
| { |
| struct bufvec bufvec; |
| int err = 0; |
| |
| /* FIXME: on error path, we have to do something for buffer state */ |
| |
| if (list_empty(head)) |
| return 0; |
| |
| bufvec_init(&bufvec, mapping(inode), head, idata); |
| |
| /* Sort by bufindex() */ |
| list_sort(NULL, head, buffer_index_cmp); |
| |
| while (bufvec_next_buffer_page(&bufvec)) { |
| /* Collect contiguous buffer range */ |
| if (bufvec_contig_collect(&bufvec)) { |
| /* Start I/O */ |
| err = tux_inode(inode)->io(WRITE | req_flag, &bufvec); |
| if (err) |
| break; |
| } |
| } |
| |
| bufvec_free(&bufvec); |
| |
| return err; |
| } |
| |
| /* |
| * I/O helper for physical index buffers (e.g. buffers on volmap) |
| */ |
| int __tux3_volmap_io(int rw, struct bufvec *bufvec, block_t physical, |
| unsigned count) |
| { |
| return blockio_vec(rw, bufvec, physical, count); |
| } |
| |
| int tux3_volmap_io(int rw, struct bufvec *bufvec) |
| { |
| block_t physical = bufvec_contig_index(bufvec); |
| unsigned count = bufvec_contig_count(bufvec); |
| |
| /* FIXME: For now, this is only for write */ |
| assert(rw & WRITE); |
| |
| return __tux3_volmap_io(rw, bufvec, physical, count); |
| } |