/*
* Block Fork (Copy-On-Write of logically addressed block).
*
* Copyright (c) 2008-2014 OGAWA Hirofumi
*/
#include <linux/hugetlb.h> /* for PageHuge() */
#include <linux/swap.h> /* for __lru_cache_add() */
#include <linux/cleancache.h>
/*
 * Scanning for freeable forked pages.
 *
 * We would like to free a forked page at an early stage (e.g. in
 * blockdirty()). But to free the page, we have to set page->mapping to
 * NULL and free the buffers on the page, while a reader may still be
 * holding the forked page and using ->mapping or the buffers. So we
 * have to keep the forked page as-is until it can safely be freed.
 *
 * Therefore we check the forked pages periodically, and once all
 * referencers are gone (checked via page_count()), we free the forked
 * buffers and the page.
 */
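/*
 * Note on the link storage below: a forked buffer's ->b_end_io is no
 * longer needed once its I/O has completed, so that field is reused
 * as link storage for sb->forked_buffers. buffer_link() casts the
 * field to a list link, and buffer_link_entry() maps a link back to
 * its buffer_head.
 */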
#define buffer_link(x) ((struct link *)&(x)->b_end_io)
#define buffer_link_entry(x) __link_entry(x, struct buffer_head, b_end_io)
/*
 * Register a forked buffer, to free its page later.
 * FIXME: we should replace the hack of linking through ->b_end_io
 * with something proper
 */
static void forked_buffer_add(struct sb *sb, struct buffer_head *buffer)
{
/* Pin buffer. This prevents try_to_free_buffers(). */
get_bh(buffer);
spin_lock(&sb->forked_buffers_lock);
link_add(buffer_link(buffer), &sb->forked_buffers);
spin_unlock(&sb->forked_buffers_lock);
}
static void forked_buffer_del(struct link *prev, struct buffer_head *buffer)
{
link_del_next(prev);
/* Unpin buffer */
put_bh(buffer);
}
/* Clean up and free a forked page */
static void free_forked_page(struct page *page)
{
struct address_space *mapping = page->mapping;
assert(PageForked(page));
lock_page(page);
if (page_has_buffers(page)) {
int ret = try_to_free_buffers(page);
assert(ret);
}
/* The lock makes sure end_page_writeback() has completed fully */
spin_lock_irq(&mapping->tree_lock);
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
unlock_page(page);
/* Drop the radix-tree reference */
page_cache_release(page);
/* Drop the final reference */
trace_on("page %p, count %u", page, page_count(page));
page_cache_release(page);
}
/* Reuse the same bit as bufdelta; this buffer can never be dirty, so there is no conflict */
#define buffer_freeable(x) test_bit(BH_PrivateStart, &(x)->b_state)
#define set_buffer_freeable(x) set_bit(BH_PrivateStart, &(x)->b_state)
#define clear_buffer_freeable(x) clear_bit(BH_PrivateStart, &(x)->b_state)
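/*
 * Once "freeable" is set, every per-buffer busy check in
 * is_freeable_forked() has passed; later scans then only need to wait
 * for page_count() to drop.
 */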
static inline int buffer_busy(struct buffer_head *buffer, int refcount)
{
/*
 * The page is neither dirty nor under writeback, so this buffer
 * should already have been flushed. Check whether a reader is
 * still using it.
 */
assert(!buffer_dirty(buffer));
assert(!buffer_async_write(buffer));
assert(!buffer_async_read(buffer));
return atomic_read(&buffer->b_count) > refcount ||
buffer_locked(buffer);
}
/* Have all referencers gone? */
static int is_freeable_forked(struct buffer_head *buffer, struct page *page)
{
/*
 * Are all buffer references gone? Once a reader has released a
 * buffer, it never grabs it again, so we don't need to recheck it.
 */
if (!buffer_freeable(buffer)) {
struct buffer_head *tmp = buffer->b_this_page;
while (tmp != buffer) {
if (buffer_busy(tmp, 0))
return 0;
tmp = tmp->b_this_page;
}
/* we hold one refcount on this buffer to pin it */
if (buffer_busy(buffer, 1))
return 0;
set_buffer_freeable(buffer);
}
/* Is the page freeable? (refs: radix-tree + ->private + ours) */
return page_count(page) == 3;
}
/*
 * Try to free forked pages. (If this is called from the umount or
 * evict_inode path, there should be no referencers, so we free the
 * forked pages forcefully.)
 *
 * inode: free only pages belonging to this inode.
 * force: if true, try to free even if the refcount is still held.
 *
 * FIXME: we need a better way, instead of periodically polling for
 * freeable forked pages.
 */
void free_forked_buffers(struct sb *sb, struct inode *inode, int force)
{
struct link free_list, *node, *prev, *n;
init_link_circular(&free_list);
/* Move freeable forked pages to free_list */
spin_lock(&sb->forked_buffers_lock);
link_for_each_safe(node, prev, n, &sb->forked_buffers) {
struct buffer_head *buffer = buffer_link_entry(node);
struct page *page = buffer->b_page;
trace_on("buffer %p, page %p, count %u",
buffer, page, page_count(page));
if (inode) {
/* Free only if page is related to inode */
if (page->mapping != inode->i_mapping)
continue;
}
#if TUX3_FLUSHER == TUX3_FLUSHER_SYNC
/* With no async frontend, the page should already have been submitted */
assert(!PageDirty(page));
#endif
assert(!force || (!PageDirty(page) && !PageWriteback(page)));
/*
 * Was I/O submitted, and has it completed?
 *
 * NOTE: the order of checking the flags is important.
 *
 * free_forked_buffers              bufvec_prepare_and_lock_page
 * PageWriteback()
 *                                  TestSetPageWriteback()
 *                                  TestClearPageDirty()
 * PageDirty()
 * [missed both flags]
 *
 * The above ordering races. So we have to check "dirty" first,
 * then check "writeback".
 *
 * FIXME: we would rather not depend on this fragile scheme, and
 * would prefer to simply use a refcount to free the forked
 * page.
 */
if (!PageDirty(page) && !PageWriteback(page)) {
/* All users were gone or force=1? */
if (force || is_freeable_forked(buffer, page)) {
clear_buffer_freeable(buffer);
link_del_next(prev);
link_add(buffer_link(buffer), &free_list);
}
}
}
spin_unlock(&sb->forked_buffers_lock);
/* Free forked pages */
while (!link_empty(&free_list)) {
struct buffer_head *buffer = buffer_link_entry(free_list.next);
struct page *page = buffer->b_page;
forked_buffer_del(&free_list, buffer);
free_forked_page(page);
}
}
/*
* Block fork core
*/
#include "mmap_builtin_hack.h"
/*
 * Clear writable to protect oldpage from the following mmap write race:
 *
 *    cpu0                    cpu1                    cpu2
 *                            [mmap write]
 *                            mmap write(old)
 *                            page fault
 *    [backend]               dirty old
 *    delta++
 *    [page_fault]
 *    page fork
 *                                                    mmap write(old)
 *                                                    no page fault
 *    copy_page(new, old)                             modify page
 *    replace_pte(new, old)
 *    flusher
 *    page_mkclean(old)
 *
 * There is a delay between delta++ and page_mkclean() for I/O. So,
 * while cpu0 is copying data from the page during page fork, another
 * cpu (cpu2) can change data on the same page. If this race happens,
 * the new and old pages can end up with different data.
 */
static void prepare_clone_page(struct page *page)
{
assert(PageLocked(page));
/*
 * The backend flusher may not yet have cleared the dirty flag
 * (i.e. not yet called page_mkclean()) for I/O. Call it here
 * instead, to prevent the race above.
 */
if (PageDirty(page))
page_mkclean(page);
}
/*
* This replaces the oldpage on radix-tree with newpage atomically.
*
* Similar to migrate_pages(), but the oldpage is for writeout.
* FIXME: we would have to add mmap handling (e.g. replace PTE)
*/
static int tux3_replace_page_cache(struct page *oldpage, struct page *newpage)
{
struct address_space *mapping = oldpage->mapping;
void **pslot;
/* Get refcount for radix-tree */
page_cache_get(newpage);
/* Replace page in radix tree. */
spin_lock_irq(&mapping->tree_lock);
/* PAGECACHE_TAG_DIRTY represents the frontend's view. Clear it. */
if (PageDirty(oldpage))
radix_tree_tag_clear(&mapping->page_tree, page_index(oldpage),
PAGECACHE_TAG_DIRTY);
/* The refcount to newpage is used for radix tree. */
pslot = radix_tree_lookup_slot(&mapping->page_tree, oldpage->index);
radix_tree_replace_slot(pslot, newpage);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
__dec_zone_page_state(oldpage, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
#if 0 /* FIXME */
/* mem_cgroup codes must not be called under tree_lock */
mem_cgroup_replace_page_cache(oldpage, newpage);
#endif
/* Release refcount for radix-tree */
page_cache_release(oldpage);
return 0;
}
/*
* This deletes the page from the radix-tree, but leaves page->mapping as-is.
*
* Similar to truncate_inode_page(), but the oldpage is for writeout.
* FIXME: we would have to add mmap handling (e.g. replace PTE)
*/
static void tux3_delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
/* Delete page from radix tree. */
spin_lock_irq(&mapping->tree_lock);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
* stale data around in the cleancache once our page is gone
*/
if (PageUptodate(page) && PageMappedToDisk(page))
cleancache_put_page(page);
else
cleancache_invalidate_page(mapping, page);
radix_tree_delete(&mapping->page_tree, page->index);
#if 0 /* FIXME: backend is assuming page->mapping is available */
page->mapping = NULL;
#endif
/* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
/*
* The following dirty accounting is done by the writeback
* path, so we don't need to do it here.
*
* dec_zone_page_state(page, NR_FILE_DIRTY);
* dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
*/
spin_unlock_irq(&mapping->tree_lock);
#if 0 /* FIXME */
mem_cgroup_uncharge_cache_page(page);
#endif
page_cache_release(page);
}
/*
* Clone the buffers. The cloned buffers represent the buffer state
* after the buffers have been flushed.
*/
static void clone_buffers(struct page *oldpage, struct page *newpage)
{
struct sb *sb = tux_sb(oldpage->mapping->host->i_sb);
struct buffer_head *head, *newbuf, *oldbuf;
#if 1 /* For now, writeback doesn't use BH_Lock */
#define USE_FOR_IO \
((1UL << BH_Uptodate_Lock) | (1UL << BH_Async_Write))
#else
#define USE_FOR_IO \
((1UL << BH_Lock) | (1UL << BH_Uptodate_Lock) | (1UL << BH_Async_Write))
#endif
oldbuf = page_buffers(oldpage);
newbuf = page_buffers(newpage);
head = newbuf;
do {
assert(!buffer_locked(oldbuf));
assert(!buffer_async_read(oldbuf));
newbuf->b_state = oldbuf->b_state;
/* Adjust ->b_state to the state after I/O completion */
newbuf->b_state &= ~USE_FOR_IO;
if (buffer_dirty(newbuf))
tux3_clear_buffer_dirty_for_io(newbuf, sb, 0);
oldbuf = oldbuf->b_this_page;
newbuf = newbuf->b_this_page;
} while (newbuf != head);
}
/* Based on migrate_page_copy() */
static struct page *clone_page(struct page *oldpage, unsigned blocksize)
{
struct address_space *mapping = oldpage->mapping;
gfp_t gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
struct page *newpage = __page_cache_alloc(gfp_mask);
/* __page_cache_alloc() can fail; callers check IS_ERR() */
if (!newpage)
return ERR_PTR(-ENOMEM);
newpage->mapping = oldpage->mapping;
newpage->index = oldpage->index;
copy_highpage(newpage, oldpage);
/* oldpage must not itself be a forked page */
BUG_ON(PageForked(oldpage));
/* FIXME: right? */
BUG_ON(PageSwapCache(oldpage));
BUG_ON(PageSwapBacked(oldpage));
BUG_ON(PageHuge(oldpage));
if (PageError(oldpage))
SetPageError(newpage);
if (PageReferenced(oldpage))
SetPageReferenced(newpage);
if (PageUptodate(oldpage))
SetPageUptodate(newpage);
if (PageActive(oldpage))
SetPageActive(newpage);
if (PageMappedToDisk(oldpage))
SetPageMappedToDisk(newpage);
#if 0 /* FIXME: need? */
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
*/
cpupid = page_cpupid_xchg_last(oldpage, -1);
page_cpupid_xchg_last(newpage, cpupid);
#endif
mlock_migrate_page(newpage, oldpage);
#if 0
ksm_migrate_page(newpage, oldpage);
#endif
/* Lock newpage before it becomes visible via the radix-tree */
assert(!PageLocked(newpage));
__set_page_locked(newpage);
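/* Attach empty buffers, then copy per-buffer state from oldpage */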
create_empty_buffers(newpage, blocksize, 0);
clone_buffers(oldpage, newpage);
return newpage;
}
/* Try to remove from LRU list */
static void oldpage_try_remove_from_lru(struct page *page)
{
/* The required functions are not exported as of kernel 3.4.4 */
}
/* Schedule addition to the LRU list (based on putback_lru_page()) */
static void newpage_add_lru(struct page *page)
{
__lru_cache_add(page);
}
enum ret_needfork {
RET_FORKED = 1, /* Someone already forked */
RET_NEED_FORK, /* Need to fork to dirty */
RET_CAN_DIRTY, /* Can dirty without fork */
RET_ALREADY_DIRTY, /* Buffer is already dirtied for delta */
};
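/*
 * need_fork() classifies the page/buffer state under the page lock:
 * callers dirty the buffer directly on RET_CAN_DIRTY, clone the page
 * on RET_NEED_FORK, and retry the page lookup on RET_FORKED.
 */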
static enum ret_needfork
need_fork(struct page *page, struct buffer_head *buffer, unsigned delta)
{
struct buffer_head *tmp;
int bufdelta;
/* Someone already forked this page. */
if (PageForked(page))
return RET_FORKED;
/* Page is under I/O, needs buffer fork */
if (PageWriteback(page))
return RET_NEED_FORK;
/*
* If the page isn't dirty (and isn't under writeback), it is a
* clean page (and all buffers on it should be clean). So we can
* just dirty the buffer for the current delta.
*/
if (!PageDirty(page)) {
assert(!buffer || !buffer_dirty(buffer));
return RET_CAN_DIRTY;
}
if (buffer == NULL) {
/* If the page is dirty, it should have buffers */
assert(page_has_buffers(page));
buffer = page_buffers(page);
}
/*
* (Re-)check the buffer and page under lock_page. (We don't
* allow buffers on the same page to have different delta states.)
*/
bufdelta = buffer_check_dirty_delta(buffer->b_state);
if (bufdelta >= 0) {
/* Buffer is dirtied by delta, just modify this buffer */
if (bufdelta == tux3_delta(delta))
return RET_ALREADY_DIRTY;
/* Buffer was dirtied by different delta, we need buffer fork */
return RET_NEED_FORK;
}
/*
* Check the other buffers sharing the same page.
*/
tmp = buffer->b_this_page;
while (tmp != buffer) {
if (!buffer_can_modify(tmp, delta)) {
/* The buffer can't be modified for delta */
return RET_NEED_FORK;
}
tmp = tmp->b_this_page;
}
/* This page can be modified, dirty this buffer */
return RET_CAN_DIRTY;
}
struct buffer_head *blockdirty(struct buffer_head *buffer, unsigned newdelta)
{
struct page *newpage, *oldpage = buffer->b_page;
struct sb *sb;
struct buffer_head *newbuf;
enum ret_needfork ret_needfork;
int err;
trace("buffer %p, page %p, index %lx, count %u",
buffer, oldpage, oldpage->index, page_count(oldpage));
trace("forked %u, dirty %u, writeback %u",
PageForked(oldpage), PageDirty(oldpage), PageWriteback(oldpage));
/* The simple case: redirty on same delta */
if (buffer_already_dirty(buffer, newdelta))
return buffer;
/* Take the page lock to protect the buffer list and to serialize concurrent block forks */
lock_page(oldpage);
/* Disabled: PageUptodate() can be false on a partially dirty page. */
// assert(PageUptodate(oldpage));
assert(!page_mapped(oldpage));
switch ((ret_needfork = need_fork(oldpage, buffer, newdelta))) {
case RET_FORKED:
/* This page was already forked. Retry from the page lookup. */
buffer = ERR_PTR(-EAGAIN);
WARN_ON(1);
/* FALLTHRU */
case RET_ALREADY_DIRTY:
/* This buffer was already dirtied. Done. */
goto out;
case RET_CAN_DIRTY:
case RET_NEED_FORK:
break;
default:
BUG();
break;
}
/* The buffer and oldpage are checked; oldpage->mapping should now be valid. */
sb = tux_sb(oldpage->mapping->host->i_sb);
if (ret_needfork == RET_CAN_DIRTY) {
/* We can dirty this buffer. */
goto dirty_buffer;
}
/*
 * We need a buffer fork. Start by cloning the oldpage.
 */
newpage = clone_page(oldpage, sb->blocksize);
if (IS_ERR(newpage)) {
buffer = ERR_CAST(newpage);
goto out;
}
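/* Find the buffer on newpage at the same offset as the original buffer */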
newbuf = __get_buffer(newpage, bh_offset(buffer) >> sb->blockbits);
/* Grab the buffer to pin the page, then drop the page refcount */
get_bh(newbuf);
page_cache_release(newpage);
/* We keep page->mapping as is, so get refcount for radix-tree. */
page_cache_get(oldpage);
/* Replace oldpage on radix-tree with newpage */
err = tux3_replace_page_cache(oldpage, newpage);
newpage_add_lru(newpage);
/*
 * The referencers are the dummy radix-tree entry + ->private
 * (plus other users and the lru_cache).
 *
 * FIXME: We can't remove the page from the LRU, because it may
 * still be on a per-cpu lru cache here, so vmscan may try to
 * free oldpage. We take a refcount to pin oldpage and prevent
 * vmscan from releasing it.
 */
trace("oldpage count %u", page_count(oldpage));
assert(page_count(oldpage) >= 2);
page_cache_get(oldpage);
oldpage_try_remove_from_lru(oldpage);
/*
 * This prevents re-forking of the oldpage, and guarantees that
 * the newpage is now visible in the radix-tree.
 */
SetPageForked(oldpage);
unlock_page(oldpage);
/* Register forked buffer to free forked page later */
forked_buffer_add(sb, buffer);
brelse(buffer);
trace("cloned page %p, buffer %p", newpage, newbuf);
buffer = newbuf;
oldpage = newpage;
dirty_buffer:
assert(!buffer_dirty(buffer));
__tux3_mark_buffer_dirty(buffer, newdelta);
out:
unlock_page(oldpage);
return buffer;
}
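/*
 * Callers of blockdirty() are expected to retry from the page-cache
 * lookup when it returns -EAGAIN (the page was forked under us). A
 * sketch of the expected calling pattern (hypothetical caller, for
 * illustration only):
 *
 *	retry:
 *		buffer = <look up the buffer in the page cache>;
 *		buffer = blockdirty(buffer, delta);
 *		if (IS_ERR(buffer)) {
 *			if (PTR_ERR(buffer) == -EAGAIN)
 *				goto retry;
 *			return PTR_ERR(buffer);
 *		}
 */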
/*
 * Do a buffer fork for oldpage if needed, then return the page locked.
 * While the page stays locked, the caller can call
 * __tux3_mark_buffer_dirty() (without checking for buffer fork) to
 * dirty buffers on the returned page, until it unlocks the page.
 *
 * The caller must hold a refcount on oldpage and hold lock_page(oldpage).
 */
struct page *pagefork_for_blockdirty(struct page *oldpage, unsigned newdelta)
{
struct page *newpage = oldpage;
struct sb *sb;
enum ret_needfork ret_needfork;
int err;
/* The page lock protects the buffer list and serializes concurrent block forks */
assert(PageLocked(oldpage));
trace("page %p, index %lx, count %u",
oldpage, oldpage->index, page_count(oldpage));
trace("forked %u, dirty %u, writeback %u",
PageForked(oldpage), PageDirty(oldpage), PageWriteback(oldpage));
/* Disabled: PageUptodate() can be false on a partially dirty page. */
// assert(PageUptodate(page));
switch ((ret_needfork = need_fork(oldpage, NULL, newdelta))) {
case RET_FORKED:
/* This page was already forked. Retry from the page lookup. */
newpage = ERR_PTR(-EAGAIN);
WARN_ON(1);
/* FALLTHRU */
case RET_ALREADY_DIRTY:
/* This buffer was already dirtied. Done. */
goto out;
case RET_CAN_DIRTY:
case RET_NEED_FORK:
break;
default:
BUG();
break;
}
/* The buffer and oldpage are checked; oldpage->mapping should now be valid. */
sb = tux_sb(oldpage->mapping->host->i_sb);
if (ret_needfork == RET_CAN_DIRTY) {
/* We can dirty this buffer. */
goto out;
}
/* Clear writable to protect oldpage from mmap write race */
prepare_clone_page(oldpage);
/*
 * We need a buffer fork. Start by cloning the oldpage.
 */
newpage = clone_page(oldpage, sb->blocksize);
if (IS_ERR(newpage))
goto out;
/*
 * We keep page->mapping as-is, so the radix-tree inherits the
 * caller's refcount.
 */
/*page_cache_get(oldpage);*/
/* Replace oldpage on radix-tree with newpage */
err = tux3_replace_page_cache(oldpage, newpage);
newpage_add_lru(newpage);
/*
 * The referencers are the dummy radix-tree entry + ->private
 * (plus other users and the lru_cache).
 *
 * FIXME: We can't remove the page from the LRU, because it may
 * still be on a per-cpu lru cache here, so vmscan may try to
 * free oldpage. We take a refcount to pin oldpage and prevent
 * vmscan from releasing it.
 */
trace("oldpage count %u", page_count(oldpage));
assert(page_count(oldpage) >= 2);
page_cache_get(oldpage);
oldpage_try_remove_from_lru(oldpage);
/*
 * This prevents re-forking of the oldpage, and guarantees that
 * the newpage is now visible in the radix-tree.
 */
SetPageForked(oldpage);
/*
* Update PTEs for forked page.
*/
page_cow_file(oldpage, newpage);
unlock_page(oldpage);
/* Register forked buffer to free forked page later */
forked_buffer_add(sb, page_buffers(oldpage));
trace("cloned page %p", newpage);
out:
return newpage;
}
/*
 * This checks whether we can invalidate the page. If the page is held
 * stable (e.g. under writeback, or dirty for a different delta), we
 * can't invalidate the buffers on it. So this forks the page without
 * making a clone page.
 *
 * 1 - fork was done to invalidate (i.e. page was removed from radix-tree)
 * 0 - fork was not done (i.e. buffers on page can be invalidated)
 */
int bufferfork_to_invalidate(struct address_space *mapping, struct page *page)
{
struct sb *sb = tux_sb(mapping->host->i_sb);
unsigned delta = tux3_inode_delta(mapping->host);
assert(PageLocked(page));
assert(!page_mapped(page));
switch (need_fork(page, NULL, delta)) {
case RET_NEED_FORK:
/* Need to fork, then delete from radix-tree */
break;
case RET_ALREADY_DIRTY:
case RET_CAN_DIRTY:
/* We can invalidate the page */
return 0;
case RET_FORKED:
trace_on("mapping %p, page %p", mapping, page);
/* FALLTHRU */
default:
BUG();
break;
}
/* We keep page->mapping as is, so get refcount for radix-tree. */
page_cache_get(page);
/* FIXME: need this? */
ClearPageMappedToDisk(page);
/* Delete page from radix-tree */
tux3_delete_from_page_cache(page);
/*
 * The referencers are the dummy radix-tree entry + ->private
 * (plus other users and the lru_cache).
 *
 * FIXME: We can't remove the page from the LRU, because it may
 * still be on a per-cpu lru cache here, so vmscan may try to
 * free the page. We take a refcount to pin the page and prevent
 * vmscan from releasing it.
 */
trace("page count %u", page_count(page));
assert(page_count(page) >= 2);
page_cache_get(page);
oldpage_try_remove_from_lru(page);
/*
 * This prevents re-forking of the page. (The page itself was
 * already deleted from the radix-tree above.)
 */
SetPageForked(page);
/* Register forked buffer to free forked page later */
forked_buffer_add(sb, page_buffers(page));
return 1;
}