Btrfs: use a list for dirty log pages
Setting bits in an io tree for the dirty log pages is really expensive,
taking somewhere around 7us on my box. So replace this with a basic list,
which makes this operation take .2 us per dirty eb. This makes my sync test
go from 4.4 mb/s to 5.8 mb/s. Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f4255db..37b8c72 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1552,7 +1552,8 @@
struct btrfs_root_item root_item;
struct btrfs_key root_key;
struct btrfs_fs_info *fs_info;
- struct extent_io_tree dirty_log_pages;
+ spinlock_t log_pages_lock;
+ struct list_head dirty_log_pages[2];
struct kobject root_kobj;
struct completion kobj_unregister;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 31f4ca6..604678dd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1197,8 +1197,9 @@
atomic_set(&root->orphan_inodes, 0);
root->log_transid = 0;
root->last_log_commit = 0;
- extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping);
+ spin_lock_init(&root->log_pages_lock);
+ INIT_LIST_HEAD(&root->dirty_log_pages[0]);
+ INIT_LIST_HEAD(&root->dirty_log_pages[1]);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 98af837..6f2d283 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6249,12 +6249,11 @@
* we allow two log transactions at a time, use different
* EXENT bit to differentiate dirty pages.
*/
- if (root->log_transid % 2 == 0)
- set_extent_dirty(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
- else
- set_extent_new(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
+ spin_lock(&root->log_pages_lock);
+ list_add_tail(&buf->dirty_list,
+ &root->dirty_log_pages[root->log_transid % 2]);
+ spin_unlock(&root->log_pages_lock);
+ extent_buffer_get(buf);
} else {
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df..e2c0f69 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -12,6 +12,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
+#include <linux/list_sort.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
@@ -2788,6 +2789,8 @@
struct writeback_control *wbc,
unsigned long nr_written)
{
+ if (!wbc)
+ return;
wbc->nr_to_write -= nr_written;
if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
@@ -3247,6 +3250,61 @@
return ret;
}
+static int eb_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct extent_buffer *eb1, *eb2;
+
+ eb1 = list_entry(a, struct extent_buffer, dirty_list);
+ eb2 = list_entry(b, struct extent_buffer, dirty_list);
+
+ if (eb1->start < eb2->start)
+ return -1;
+ else if (eb1->start > eb2->start)
+ return 1;
+ return 0;
+}
+
+int btrfs_sync_eb_list(struct btrfs_fs_info *fs_info, struct list_head *list)
+{
+ struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ struct extent_buffer *eb;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .extent_locked = 0,
+ .sync_io = 1,
+ .bio_flags = 0,
+ };
+ int ret = 0;
+ int err = 0;
+
+ list_sort(NULL, list, eb_cmp);
+
+ list_for_each_entry(eb, list, dirty_list) {
+ if (!lock_extent_buffer_for_io(eb, fs_info, &epd))
+ continue;
+
+ ret = write_one_eb(eb, fs_info, NULL, &epd);
+ if (ret && !err)
+ err = ret;
+ }
+ flush_write_bio(&epd);
+
+ while(!list_empty(list)) {
+ eb = list_first_entry(list, struct extent_buffer, dirty_list);
+ list_del_init(&eb->dirty_list);
+ if (err) {
+ free_extent_buffer(eb);
+ continue;
+ }
+
+ wait_on_extent_buffer_writeback(eb);
+ free_extent_buffer(eb);
+ }
+
+ return err;
+}
+
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -4024,6 +4082,7 @@
atomic_set(&eb->blocking_writers, 0);
atomic_set(&eb->spinning_readers, 0);
atomic_set(&eb->spinning_writers, 0);
+ INIT_LIST_HEAD(&eb->dirty_list);
eb->lock_nested = 0;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfab..cc01dce 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -136,6 +136,7 @@
atomic_t io_pages;
int read_mirror;
struct list_head leak_list;
+ struct list_head dirty_list;
struct rcu_head rcu_head;
pid_t lock_owner;
@@ -345,4 +346,5 @@
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
+int btrfs_sync_eb_list(struct btrfs_fs_info *fs_info, struct list_head *list);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d73baf..909bd1e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2278,8 +2278,10 @@
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ struct list_head list;
unsigned long log_transid = 0;
+ INIT_LIST_HEAD(&list);
mutex_lock(&root->log_mutex);
log_transid = root->log_transid;
index1 = root->log_transid % 2;
@@ -2319,17 +2321,6 @@
else
mark = EXTENT_NEW;
- /* we start IO on all the marked extents here, but we don't actually
- * wait for them until later.
- */
- ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_free_logged_extents(log, log_transid);
- mutex_unlock(&root->log_mutex);
- goto out;
- }
-
btrfs_set_root_node(&log->root_item, log->node);
root->log_transid++;
@@ -2364,7 +2355,6 @@
goto out;
}
root->fs_info->last_trans_log_full_commit = trans->transid;
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
@@ -2373,7 +2363,6 @@
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
btrfs_free_logged_extents(log, log_transid);
@@ -2395,23 +2384,28 @@
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
}
- ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ spin_lock(&log->log_pages_lock);
+ list_splice_init(&log->dirty_log_pages[log_transid % 2], &list);
+ spin_unlock(&log->log_pages_lock);
+
+ spin_lock(&log_root_tree->log_pages_lock);
+ list_splice_init(&log_root_tree->dirty_log_pages[0], &list);
+ list_splice_init(&log_root_tree->dirty_log_pages[1], &list);
+ spin_unlock(&log_root_tree->log_pages_lock);
+
+ ret = btrfs_sync_eb_list(log_root_tree->fs_info, &list);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_wait_logged_extents(log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -2461,25 +2455,27 @@
struct btrfs_root *log)
{
int ret;
- u64 start;
- u64 end;
+ struct list_head list;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
};
+ INIT_LIST_HEAD(&list);
ret = walk_log_tree(trans, log, &wc);
BUG_ON(ret);
- while (1) {
- ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
- NULL);
- if (ret)
- break;
+ spin_lock(&log->log_pages_lock);
+ list_splice_init(&log->dirty_log_pages[0], &list);
+ list_splice_init(&log->dirty_log_pages[1], &list);
+ spin_unlock(&log->log_pages_lock);
- clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ while (!list_empty(&list)) {
+ struct extent_buffer *eb;
+
+ eb = list_first_entry(&list, struct extent_buffer, dirty_list);
+ list_del_init(&eb->dirty_list);
+ free_extent_buffer(eb);
}
free_extent_buffer(log->node);