| /* |
| * mm/page-writeback.c |
| * |
| * Copyright (C) 2002, Linus Torvalds. |
| * |
| * Contains functions related to writing back dirty pages at the |
| * address_space level. |
| * |
| * 10Apr2002 akpm@zip.com.au |
| * Initial version |
| */ |
| |
| #include <linux/kernel.h> |
| #include <linux/module.h> |
| #include <linux/spinlock.h> |
| #include <linux/fs.h> |
| #include <linux/mm.h> |
| #include <linux/slab.h> |
| #include <linux/pagemap.h> |
| #include <linux/writeback.h> |
| #include <linux/init.h> |
| #include <linux/sysrq.h> |
| #include <linux/backing-dev.h> |
| |
| /* |
| * The maximum number of pages to write out in a single bdflush/kupdate |
| * operation. We do this so we don't hold I_LOCK against an inode for |
| * enormous amounts of time, which would block a userspace task that has |
| * been forced to throttle against that inode. |
| */ |
| #define MAX_WRITEBACK_PAGES 1024 |
| |
| /* |
| * Memory thresholds, in percentages |
| * FIXME: expose these via /proc or whatever. |
| */ |
| |
| /* |
| * Start background writeback (via pdflush) at this level |
| */ |
| static int dirty_background_ratio = 40; |
| |
| /* |
| * The generator of dirty data starts async writeback at this level |
| */ |
| static int dirty_async_ratio = 50; |
| |
| /* |
| * The generator of dirty data performs sync writeout at this level |
| */ |
| static int dirty_sync_ratio = 60; |
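| |
| /* |
| * Worked example (illustrative numbers only): if nr_free_pagecache_pages() |
| * were to report 100000 pages, background writeback via pdflush would start |
| * once nr_dirty exceeds 40000 pages, generators of dirty data would begin |
| * async writeback above 50000 dirty+writeback pages, and would be throttled |
| * into synchronous writeout above 60000. |
| */ |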
| |
| static void background_writeout(unsigned long unused); |
| |
| /* |
| * balance_dirty_pages() must be called by processes which are |
| * generating dirty data. It looks at the number of dirty pages |
| * in the machine and either: |
| * |
| * - Starts background writeback or |
| * - Causes the caller to perform async writeback or |
| * - Causes the caller to perform synchronous writeback, then |
| * tells a pdflush thread to perform more writeback or |
| * - Does nothing at all. |
| * |
| * balance_dirty_pages() can sleep. |
| * |
| * FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty |
| * inode on the superblock list. It should wait when nr_to_write is |
| * exhausted. Doesn't seem to matter. |
| */ |
| void balance_dirty_pages(struct address_space *mapping) |
| { |
| const int tot = nr_free_pagecache_pages(); |
| struct page_state ps; |
| int background_thresh, async_thresh, sync_thresh; |
| unsigned long dirty_and_writeback; |
| |
| get_page_state(&ps); |
| dirty_and_writeback = ps.nr_dirty + ps.nr_writeback; |
| |
| background_thresh = (dirty_background_ratio * tot) / 100; |
| async_thresh = (dirty_async_ratio * tot) / 100; |
| sync_thresh = (dirty_sync_ratio * tot) / 100; |
| |
| if (dirty_and_writeback > sync_thresh) { |
| int nr_to_write = 1500; |
| |
| writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL); |
| get_page_state(&ps); |
| } else if (dirty_and_writeback > async_thresh) { |
| int nr_to_write = 1500; |
| |
| writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); |
| get_page_state(&ps); |
| } |
| |
| if (!writeback_in_progress(mapping->backing_dev_info) && |
| ps.nr_dirty > background_thresh) |
| pdflush_operation(background_writeout, 0); |
| } |
| |
| /** |
| * balance_dirty_pages_ratelimited - balance dirty memory state |
| * @mapping: address_space which was dirtied |
| * |
| * Processes which are dirtying memory should call in here once for each page |
| * which was newly dirtied. The function will periodically check the system's |
| * dirty state and will initiate writeback if needed. |
| * |
| * balance_dirty_pages_ratelimited() may sleep. |
| */ |
| void balance_dirty_pages_ratelimited(struct address_space *mapping) |
| { |
| static struct rate_limit_struct { |
| int count; |
| } ____cacheline_aligned ratelimits[NR_CPUS]; |
| int cpu; |
| |
| cpu = get_cpu(); |
| if (ratelimits[cpu].count++ >= 1000) { |
| ratelimits[cpu].count = 0; |
| put_cpu(); |
| balance_dirty_pages(mapping); |
| return; |
| } |
| put_cpu(); |
| } |
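| |
| /* |
| * Illustrative sketch only: a filesystem's buffered-write path would call |
| * balance_dirty_pages_ratelimited() once per newly-dirtied page, roughly as |
| * below. example_dirty_one_page() is a hypothetical stand-in for the real |
| * copy-and-dirty work; it is not part of this file's API. |
| */ |
| #if 0 |
| static void example_write_pages(struct address_space *mapping, int nr_pages) |
| { |
| 	int i; |
| |
| 	for (i = 0; i < nr_pages; i++) { |
| 		example_dirty_one_page(mapping, i);	/* hypothetical */ |
| 		balance_dirty_pages_ratelimited(mapping); |
| 	} |
| } |
| #endif |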
| |
| /* |
| * Write back at least _min_pages, and keep writing until the amount of dirty |
| * memory is less than the background threshold, or until we're all clean. |
| */ |
| static void background_writeout(unsigned long _min_pages) |
| { |
| const int tot = nr_free_pagecache_pages(); |
| const int background_thresh = (dirty_background_ratio * tot) / 100; |
| long min_pages = _min_pages; |
| int nr_to_write; |
| |
| do { |
| struct page_state ps; |
| |
| get_page_state(&ps); |
| if (ps.nr_dirty < background_thresh && min_pages <= 0) |
| break; |
| nr_to_write = MAX_WRITEBACK_PAGES; |
| writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); |
| min_pages -= MAX_WRITEBACK_PAGES - nr_to_write; |
| } while (nr_to_write <= 0); |
| run_task_queue(&tq_disk); |
| } |
| |
| /* |
| * Start heavy writeback of everything. |
| */ |
| void wakeup_bdflush(void) |
| { |
| struct page_state ps; |
| |
| get_page_state(&ps); |
| pdflush_operation(background_writeout, ps.nr_dirty); |
| } |
| |
| /* |
| * The interval between `kupdate'-style writebacks. |
| * |
| * Traditional kupdate writes back data which is 30-35 seconds old. |
| * This one does that, but it also writes back just 1/6th of the dirty |
| * data. This is to avoid great I/O storms. |
| * |
| * We chunk the writes up and yield, to permit any throttled page-allocators |
| * to perform their I/O against a large file. |
| */ |
| static int wb_writeback_jifs = 5 * HZ; |
| static struct timer_list wb_timer; |
| |
| /* |
| * Periodic writeback of "old" data. |
| * |
| * Define "old": the first time one of an inode's pages is dirtied, we mark the |
| * dirtying-time in the inode's address_space. So this periodic writeback code |
| * just walks the superblock inode list, writing back any inodes which are |
| * older than a specific point in time. |
| * |
| * Try to run once per wb_writeback_jifs jiffies. But if a writeback event |
| * takes longer than a wb_writeback_jifs interval, then leave a one-second |
| * gap. |
| * |
| * older_than_this takes precedence over nr_to_write. So we'll only write back |
| * all dirty pages if they are all attached to "old" mappings. |
| */ |
| static void wb_kupdate(unsigned long arg) |
| { |
| unsigned long oldest_jif; |
| unsigned long start_jif; |
| unsigned long next_jif; |
| struct page_state ps; |
| int nr_to_write; |
| |
| sync_supers(); |
| get_page_state(&ps); |
| |
| oldest_jif = jiffies - 30*HZ; |
| start_jif = jiffies; |
| next_jif = start_jif + wb_writeback_jifs; |
| nr_to_write = ps.nr_dirty; |
| writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif); |
| run_task_queue(&tq_disk); |
| yield(); |
| |
| if (time_before(next_jif, jiffies + HZ)) |
| next_jif = jiffies + HZ; |
| mod_timer(&wb_timer, next_jif); |
| } |
| |
| static void wb_timer_fn(unsigned long unused) |
| { |
| if (pdflush_operation(wb_kupdate, 0) < 0) |
| mod_timer(&wb_timer, jiffies + HZ); |
| } |
| |
| static int __init wb_timer_init(void) |
| { |
| init_timer(&wb_timer); |
| wb_timer.expires = jiffies + wb_writeback_jifs; |
| wb_timer.data = 0; |
| wb_timer.function = wb_timer_fn; |
| add_timer(&wb_timer); |
| return 0; |
| } |
| module_init(wb_timer_init); |
| |
| /* |
| * A library function, which implements the vm_writeback a_op. It's fairly |
| * lame at this time. The idea is: the VM wants to liberate this page, |
| * so we pass the page to the address_space and give the fs the opportunity |
| * to write out lots of pages around this one. It allows extent-based |
| * filesystems to do intelligent things. It lets delayed-allocate filesystems |
| * perform better file layout. It lets the address_space opportunistically |
| * write back disk-contiguous pages which are in other zones. |
| * |
| * FIXME: the VM wants to start I/O against *this* page. Because its zone |
| * is under pressure. But this function may start writeout against a |
| * totally different set of pages. Unlikely to be a huge problem, but if it |
| * is, we could just writepage the page if it is still (PageDirty && |
| * !PageWriteback) (See below). |
| * |
| * Another option is to just reposition page->mapping->dirty_pages so we |
| * *know* that the page will be written. That will work fine, but seems |
| * unpleasant. (If the page is not for-sure on ->dirty_pages we're dead). |
| * Plus it assumes that the address_space is performing writeback in |
| * ->dirty_pages order. |
| * |
| * So. The proper fix is to leave the page locked-and-dirty and to pass |
| * it all the way down. |
| */ |
| int generic_vm_writeback(struct page *page, int *nr_to_write) |
| { |
| struct inode *inode = page->mapping->host; |
| |
| /* |
| * We don't own this inode, and we don't want the address_space |
| * vanishing while writeback is walking its pages. |
| */ |
| inode = igrab(inode); |
| unlock_page(page); |
| |
| if (inode) { |
| writeback_mapping(inode->i_mapping, nr_to_write); |
| |
| /* |
| * This iput() will internally call ext2_discard_prealloc(), |
| * which is rather bogus. But there is no other way of |
| * dropping our ref to the inode. However, there's no harm |
| * in dropping the prealloc, because there probably isn't any. |
| * Just a waste of cycles. |
| */ |
| iput(inode); |
| #if 0 |
| if (!PageWriteback(page) && PageDirty(page)) { |
| lock_page(page); |
| if (!PageWriteback(page) && TestClearPageDirty(page)) |
| page->mapping->a_ops->writepage(page); |
| else |
| unlock_page(page); |
| } |
| #endif |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL(generic_vm_writeback); |
| |
| /** |
| * generic_writeback_mapping - walk the list of dirty pages of the given |
| * address space and writepage() all of them. |
| * |
| * @mapping: address space structure to write |
| * @nr_to_write: subtract the number of written pages from *@nr_to_write |
| * |
| * This is a library function, which implements the writeback_mapping() |
| * address_space_operation. |
| * |
| * (The next two paragraphs refer to code which isn't here yet, but they |
| * explain the presence of address_space.io_pages) |
| * |
| * Pages can be moved from clean_pages or locked_pages onto dirty_pages |
| * at any time - it's not possible to lock against that. So pages which |
| * have already been added to a BIO may magically reappear on the dirty_pages |
| * list. And generic_writeback_mapping() will again try to lock those pages. |
| * But I/O has not yet been started against the page. Thus deadlock. |
| * |
| * To avoid this, the entire contents of the dirty_pages list are moved |
| * onto io_pages up-front. We then walk io_pages, locking the |
| * pages and submitting them for I/O, moving them to locked_pages. |
| * |
| * This has the added benefit of preventing a livelock which would otherwise |
| * occur if pages are being dirtied faster than we can write them out. |
| * |
| * If a page is already under I/O, generic_writeback_mapping() skips it, even |
| * if it's dirty. This is desirable behaviour for memory-cleaning writeback, |
| * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() |
| * and msync() need to guarantee that all the data which was dirty at the time |
| * the call was made gets new I/O started against it. The way to do this is |
| * to run filemap_fdatawait() before calling filemap_fdatawrite(). |
| * |
| * It's fairly rare for PageWriteback pages to be on ->dirty_pages. It |
| * means that someone redirtied the page while it was under I/O. |
| */ |
| int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write) |
| { |
| int (*writepage)(struct page *) = mapping->a_ops->writepage; |
| int ret = 0; |
| int done = 0; |
| int err; |
| |
| write_lock(&mapping->page_lock); |
| |
| list_splice(&mapping->dirty_pages, &mapping->io_pages); |
| INIT_LIST_HEAD(&mapping->dirty_pages); |
| |
| while (!list_empty(&mapping->io_pages) && !done) { |
| struct page *page = list_entry(mapping->io_pages.prev, |
| struct page, list); |
| list_del(&page->list); |
| if (PageWriteback(page)) { |
| if (PageDirty(page)) { |
| list_add(&page->list, &mapping->dirty_pages); |
| continue; |
| } |
| list_add(&page->list, &mapping->locked_pages); |
| continue; |
| } |
| if (!PageDirty(page)) { |
| list_add(&page->list, &mapping->clean_pages); |
| continue; |
| } |
| list_add(&page->list, &mapping->locked_pages); |
| page_cache_get(page); |
| write_unlock(&mapping->page_lock); |
| lock_page(page); |
| |
| /* |
| * It may have been removed from swapcache: check ->mapping. Test |
| * PageWriteback() before clearing the dirty bit, so that a page which |
| * went into writeback after we dropped page_lock cannot have its |
| * dirtiness cleared without being written. |
| */ |
| if (page->mapping && !PageWriteback(page) && |
| TestClearPageDirty(page)) { |
| /* FIXME: batch this up */ |
| if (!PageActive(page) && PageLRU(page)) { |
| spin_lock(&pagemap_lru_lock); |
| if (!PageActive(page) && PageLRU(page)) { |
| list_del(&page->lru); |
| list_add(&page->lru, &inactive_list); |
| } |
| spin_unlock(&pagemap_lru_lock); |
| } |
| err = writepage(page); |
| if (!ret) |
| ret = err; |
| if (nr_to_write && --(*nr_to_write) <= 0) |
| done = 1; |
| } else { |
| unlock_page(page); |
| } |
| |
| page_cache_release(page); |
| write_lock(&mapping->page_lock); |
| } |
| if (!list_empty(&mapping->io_pages)) { |
| /* |
| * Put the rest back, in the correct order. |
| */ |
| list_splice(&mapping->io_pages, mapping->dirty_pages.prev); |
| INIT_LIST_HEAD(&mapping->io_pages); |
| } |
| write_unlock(&mapping->page_lock); |
| return ret; |
| } |
| EXPORT_SYMBOL(generic_writeback_mapping); |
| |
| int writeback_mapping(struct address_space *mapping, int *nr_to_write) |
| { |
| if (mapping->a_ops->writeback_mapping) |
| return mapping->a_ops->writeback_mapping(mapping, nr_to_write); |
| return generic_writeback_mapping(mapping, nr_to_write); |
| } |
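| |
| /* |
| * Illustrative sketch only: how a filesystem of this vintage might plug the |
| * above library functions into its address_space_operations. The example_* |
| * names are hypothetical; readpage and writepage would be the filesystem's |
| * own implementations. |
| */ |
| #if 0 |
| static struct address_space_operations example_aops = { |
| 	.readpage		= example_readpage,	/* hypothetical */ |
| 	.writepage		= example_writepage,	/* hypothetical */ |
| 	.vm_writeback		= generic_vm_writeback, |
| 	.writeback_mapping	= generic_writeback_mapping, |
| }; |
| #endif |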
| |
| /** |
| * write_one_page - write out a single page and optionally wait on I/O |
| * |
| * @page: the page to write |
| * @wait: if true, wait on writeout |
| * |
| * The page must be locked by the caller and will be unlocked upon return. |
| * |
| * write_one_page() returns a negative error code if I/O failed. |
| */ |
| int write_one_page(struct page *page, int wait) |
| { |
| struct address_space *mapping = page->mapping; |
| int ret = 0; |
| |
| BUG_ON(!PageLocked(page)); |
| |
| if (wait && PageWriteback(page)) |
| wait_on_page_writeback(page); |
| |
| write_lock(&mapping->page_lock); |
| list_del(&page->list); |
| if (TestClearPageDirty(page)) { |
| list_add(&page->list, &mapping->locked_pages); |
| page_cache_get(page); |
| write_unlock(&mapping->page_lock); |
| ret = mapping->a_ops->writepage(page); |
| if (ret == 0 && wait) { |
| wait_on_page_writeback(page); |
| if (PageError(page)) |
| ret = -EIO; |
| } |
| page_cache_release(page); |
| } else { |
| list_add(&page->list, &mapping->clean_pages); |
| write_unlock(&mapping->page_lock); |
| unlock_page(page); |
| } |
| return ret; |
| } |
| EXPORT_SYMBOL(write_one_page); |
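| |
| /* |
| * Illustrative sketch only: a simple filesystem might commit a freshly |
| * modified directory page with write_one_page(). The page arrives locked |
| * and dirty; write_one_page() unlocks it, and with wait != 0 the return |
| * value reports any I/O error. example_commit_dir_page() is hypothetical. |
| */ |
| #if 0 |
| static int example_commit_dir_page(struct page *page) |
| { |
| 	/* caller locked the page and marked it dirty */ |
| 	return write_one_page(page, 1); |
| } |
| #endif |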
| |
| /* |
| * Add a page to the dirty page list. |
| * |
| * It is a sad fact of life that this function is called from several places |
| * deeply under spinlocking. It may not sleep. |
| * |
| * If the page has buffers, the uptodate buffers are set dirty, to preserve |
| * dirty-state coherency between the page and the buffers. If the page does |
| * not have buffers then when they are later attached they will all be set |
| * dirty. |
| * |
| * The buffers are dirtied before the page is dirtied. There's a small race |
| * window in which a writepage caller may see the page cleanness but not the |
| * buffer dirtiness. That's fine. If this code were to set the page dirty |
| * before the buffers, a concurrent writepage caller could clear the page dirty |
| * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean |
| * page on the dirty page list. |
| * |
| * There is also a small window in which the page is dirty but not yet on |
| * dirty_pages, and a possibility that by the time the page is added to |
| * dirty_pages, it has been set clean. The page lists are somewhat approximate |
| * in this regard. |
| * It's better to have clean pages accidentally attached to dirty_pages than to |
| * leave dirty pages attached to clean_pages. |
| * |
| * We use private_lock to lock against try_to_free_buffers while using the |
| * page's buffer list. Also use this to protect against clean buffers being |
| * added to the page after it was set dirty. |
| * |
| * FIXME: may need to call ->reservepage here as well. That's rather up to the |
| * address_space though. |
| * |
| * For now, we treat swapper_space specially. It doesn't use the normal |
| * block a_ops. |
| * |
| * FIXME: this should move over to fs/buffer.c - buffer_heads have no business in mm/ |
| */ |
| #include <linux/buffer_head.h> |
| int __set_page_dirty_buffers(struct page *page) |
| { |
| struct address_space * const mapping = page->mapping; |
| int ret = 0; |
| |
| if (mapping == NULL) { |
| SetPageDirty(page); |
| goto out; |
| } |
| |
| spin_lock(&mapping->private_lock); |
| |
| if (page_has_buffers(page) && !PageSwapCache(page)) { |
| struct buffer_head *head = page_buffers(page); |
| struct buffer_head *bh = head; |
| |
| do { |
| if (buffer_uptodate(bh)) |
| set_buffer_dirty(bh); |
| else |
| buffer_error(); |
| bh = bh->b_this_page; |
| } while (bh != head); |
| } |
| |
| if (!TestSetPageDirty(page)) { |
| write_lock(&mapping->page_lock); |
| list_del(&page->list); |
| list_add(&page->list, &mapping->dirty_pages); |
| write_unlock(&mapping->page_lock); |
| __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| } |
| |
| spin_unlock(&mapping->private_lock); |
| out: |
| return ret; |
| } |
| EXPORT_SYMBOL(__set_page_dirty_buffers); |
| |
| /* |
| * For address_spaces which do not use buffers. Just set the page's dirty bit |
| * and move it to the dirty_pages list. Also perform space reservation if |
| * required. |
| * |
| * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page |
| * is still safe, as long as it actually manages to find some blocks at |
| * writeback time. |
| * |
| * This is also used when a single buffer is being dirtied: we want to set the |
| * page dirty in that case, but not all the buffers. This is a "bottom-up" |
| * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. |
| */ |
| int __set_page_dirty_nobuffers(struct page *page) |
| { |
| int ret = 0; |
| |
| if (!TestSetPageDirty(page)) { |
| struct address_space *mapping = page->mapping; |
| |
| if (mapping) { |
| write_lock(&mapping->page_lock); |
| list_del(&page->list); |
| list_add(&page->list, &mapping->dirty_pages); |
| write_unlock(&mapping->page_lock); |
| __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| } |
| } |
| return ret; |
| } |
| EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
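| |
| /* |
| * Illustrative sketch only: the "bottom-up" dirtying described above. A |
| * mark_buffer_dirty()-style helper dirties a single buffer and then |
| * propagates the dirtiness to the page without touching the page's other |
| * buffers. This sketches the idea; it is not the fs/buffer.c implementation. |
| */ |
| #if 0 |
| static void example_mark_buffer_dirty(struct buffer_head *bh) |
| { |
| 	if (!test_set_buffer_dirty(bh)) |
| 		__set_page_dirty_nobuffers(bh->b_page); |
| } |
| #endif |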