/*
* Generic btree operations
*
* Original copyright (c) 2008 Daniel Phillips <phillips@phunq.net>
* Portions copyright (c) 2006-2008 Google Inc.
* Licensed under the GPL version 2
*
* By contributing changes to this file you grant the original copyright holder
* the right to distribute those changes under any license.
*/
#include "tux3.h"
#ifndef trace
#define trace trace_off
#endif
/* This value is a special case meaning the btree doesn't have a root yet. */
struct root no_root = {
.block = 0,
.depth = 0,
};
struct bnode {
__be16 magic;
__be16 unused;
__be32 count;
struct index_entry {
__be64 key;
__be64 block;
} entries[];
};
/*
* Note that the first key of an index block is never accessed. This is
* because for a btree, there is always one more node pointer than keys in
* each index node. In other words, keys lie between node pointers. I
* micro-optimize by placing the node count in the first key, which allows
* a node to contain an esthetically pleasing binary number of pointers.
* (Not done yet.)
*/
unsigned calc_entries_per_node(unsigned blocksize)
{
return (blocksize - sizeof(struct bnode)) / sizeof(struct index_entry);
}
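/*
* Worked example (a sketch, assuming common architectures with no struct
* padding): with a 4096-byte block, sizeof(struct bnode) is 8 and
* sizeof(struct index_entry) is 16, so one bnode holds
* (4096 - 8) / 16 = 255 index entries.
*/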
static inline unsigned bcount(struct bnode *node)
{
return be32_to_cpu(node->count);
}
static struct buffer_head *new_block(struct btree *btree)
{
block_t block;
block = balloc_one(btree->sb);
if (block < 0)
return ERR_PTR(block);
struct buffer_head *buffer = vol_getblk(btree->sb, block);
if (!buffer)
return ERR_PTR(-ENOMEM); // ERR_PTR me!!! and bfree?
return buffer;
}
struct buffer_head *new_leaf(struct btree *btree)
{
struct buffer_head *buffer = new_block(btree);
if (!IS_ERR(buffer)) {
memset(bufdata(buffer), 0, bufsize(buffer));
(btree->ops->leaf_init)(btree, bufdata(buffer));
mark_buffer_dirty_atomic(buffer);
}
return buffer;
}
static inline void bnode_buffer_init(struct buffer_head *buffer)
{
struct bnode *bnode = bufdata(buffer);
memset(bnode, 0, bufsize(buffer));
bnode->magic = cpu_to_be16(TUX3_MAGIC_BNODE);
}
static inline int bnode_sniff(struct bnode *bnode)
{
if (bnode->magic != cpu_to_be16(TUX3_MAGIC_BNODE))
return -1;
return 0;
}
static struct buffer_head *new_node(struct btree *btree)
{
struct buffer_head *buffer = new_block(btree);
if (!IS_ERR(buffer)) {
bnode_buffer_init(buffer);
mark_buffer_unify_atomic(buffer);
}
return buffer;
}
/*
* A btree cursor has n + 1 entries for a btree of depth n, with the first n
* entries pointing at internal nodes and entry n + 1 pointing at a leaf.
* The next field points at the next index entry that will be loaded in a left
* to right tree traversal, not the current entry. The next pointer is null
* for the leaf, which has its own specialized traversal algorithms.
*/
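/*
* For example, for a btree of depth 2 a fully loaded cursor path looks
* roughly like this (a sketch of the layout described above):
*
*	path[0]: root bnode buffer,   next points into its entries[]
*	path[1]: inner bnode buffer,  next points into its entries[]
*	path[2]: leaf buffer,         next == NULL
*/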
static inline struct bnode *level_node(struct cursor *cursor, int level)
{
return bufdata(cursor->path[level].buffer);
}
struct buffer_head *cursor_leafbuf(struct cursor *cursor)
{
assert(cursor->level == cursor->btree->root.depth);
return cursor->path[cursor->level].buffer;
}
static void cursor_root_add(struct cursor *cursor, struct buffer_head *buffer,
struct index_entry *next)
{
#ifdef CURSOR_DEBUG
assert(cursor->level < cursor->maxlevel);
assert(cursor->path[cursor->level + 1].buffer == FREE_BUFFER);
assert(cursor->path[cursor->level + 1].next == FREE_NEXT);
#endif
vecmove(cursor->path + 1, cursor->path, cursor->level + 1);
cursor->level++;
cursor->path[0].buffer = buffer;
cursor->path[0].next = next;
}
static void level_replace_blockput(struct cursor *cursor, int level,
struct buffer_head *buffer,
struct index_entry *next)
{
#ifdef CURSOR_DEBUG
assert(buffer);
assert(level <= cursor->level);
assert(cursor->path[level].buffer != FREE_BUFFER);
assert(cursor->path[level].next != FREE_NEXT);
#endif
blockput(cursor->path[level].buffer);
cursor->path[level].buffer = buffer;
cursor->path[level].next = next;
}
static void cursor_push(struct cursor *cursor, struct buffer_head *buffer,
struct index_entry *next)
{
cursor->level++;
#ifdef CURSOR_DEBUG
assert(cursor->level <= cursor->maxlevel);
assert(cursor->path[cursor->level].buffer == FREE_BUFFER);
assert(cursor->path[cursor->level].next == FREE_NEXT);
#endif
cursor->path[cursor->level].buffer = buffer;
cursor->path[cursor->level].next = next;
}
static struct buffer_head *cursor_pop(struct cursor *cursor)
{
struct buffer_head *buffer;
#ifdef CURSOR_DEBUG
assert(cursor->level >= 0);
#endif
buffer = cursor->path[cursor->level].buffer;
#ifdef CURSOR_DEBUG
cursor->path[cursor->level].buffer = FREE_BUFFER;
cursor->path[cursor->level].next = FREE_NEXT;
#endif
cursor->level--;
return buffer;
}
static inline void cursor_pop_blockput(struct cursor *cursor)
{
blockput(cursor_pop(cursor));
}
/* Is there no next entry at this level? */
static inline int level_finished(struct cursor *cursor, int level)
{
struct bnode *node = level_node(cursor, level);
return cursor->path[level].next == node->entries + bcount(node);
}
// also write level_beginning!!!
void release_cursor(struct cursor *cursor)
{
while (cursor->level >= 0)
cursor_pop_blockput(cursor);
}
/* unused */
void show_cursor(struct cursor *cursor, int depth)
{
__tux3_dbg(">>> cursor %p/%i:", cursor, depth);
for (int i = 0; i < depth; i++) {
__tux3_dbg(" [%Lx/%i]",
bufindex(cursor->path[i].buffer),
bufcount(cursor->path[i].buffer));
}
__tux3_dbg("\n");
}
static void cursor_check(struct cursor *cursor)
{
if (cursor->level == -1)
return;
tuxkey_t key = 0;
block_t block = cursor->btree->root.block;
for (int i = 0; i <= cursor->level; i++) {
assert(bufindex(cursor->path[i].buffer) == block);
if (i == cursor->level)
break;
struct bnode *bnode = level_node(cursor, i);
struct index_entry *entry = cursor->path[i].next - 1;
assert(bnode->entries <= entry);
assert(entry < bnode->entries + bcount(bnode));
/*
* If this entry is the leftmost one, it should have the same key as its
* parent. Otherwise, the leftmost key may not be correct as the next key.
*/
if (bnode->entries == entry)
assert(be64_to_cpu(entry->key) == key);
else
assert(be64_to_cpu(entry->key) > key);
block = be64_to_cpu(entry->block);
key = be64_to_cpu(entry->key);
}
}
static inline int alloc_cursor_size(int count)
{
return sizeof(struct cursor) + sizeof(struct path_level) * count;
}
struct cursor *alloc_cursor(struct btree *btree, int extra)
{
int maxlevel = btree->root.depth + extra;
struct cursor *cursor;
cursor = kmalloc(alloc_cursor_size(maxlevel + 1), GFP_NOFS);
if (cursor) {
cursor->btree = btree;
cursor->level = -1;
#ifdef CURSOR_DEBUG
cursor->maxlevel = maxlevel;
for (int i = 0; i <= maxlevel; i++) {
cursor->path[i].buffer = FREE_BUFFER; /* for debug */
cursor->path[i].next = FREE_NEXT; /* for debug */
}
#endif
}
return cursor;
}
void free_cursor(struct cursor *cursor)
{
#ifdef CURSOR_DEBUG
assert(cursor->level == -1);
#endif
kfree(cursor);
}
/* Look up the index entry that contains key */
static struct index_entry *bnode_lookup(struct bnode *node, tuxkey_t key)
{
struct index_entry *next = node->entries, *top = next + bcount(node);
assert(bcount(node) > 0);
/* binary search goes here */
while (++next < top) {
if (be64_to_cpu(next->key) > key)
break;
}
return next - 1;
}
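/*
* The binary search mentioned above might look like the following (an
* untested sketch only; the linear scan above is what is actually used).
* Like the linear scan, it never inspects entries[0].key and returns the
* last entry whose key is <= key:
*
*	unsigned lo = 1, hi = bcount(node);
*	while (lo < hi) {
*		unsigned mid = lo + (hi - lo) / 2;
*		if (be64_to_cpu(node->entries[mid].key) > key)
*			hi = mid;
*		else
*			lo = mid + 1;
*	}
*	return node->entries + lo - 1;
*/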
static int cursor_level_finished(struct cursor *cursor)
{
/* must not be leaf */
assert(cursor->level < cursor->btree->root.depth);
return level_finished(cursor, cursor->level);
}
/*
* Climb up the cursor until we find the first level where we have not yet
* read all the way to the end of the index block; there we find the key
* that separates the subtree we are in (a leaf) from the next subtree to
* the right.
*/
tuxkey_t cursor_next_key(struct cursor *cursor)
{
int level = cursor->level;
assert(cursor->level == cursor->btree->root.depth);
while (level--) {
if (!level_finished(cursor, level))
return be64_to_cpu(cursor->path[level].next->key);
}
return TUXKEY_LIMIT;
}
static tuxkey_t cursor_level_next_key(struct cursor *cursor)
{
int level = cursor->level;
assert(cursor->level < cursor->btree->root.depth);
while (level >= 0) {
if (!level_finished(cursor, level))
return be64_to_cpu(cursor->path[level].next->key);
level--;
}
return TUXKEY_LIMIT;
}
/* Return key of this leaf */
tuxkey_t cursor_this_key(struct cursor *cursor)
{
assert(cursor->level == cursor->btree->root.depth);
return be64_to_cpu((cursor->path[cursor->level - 1].next - 1)->key);
}
static tuxkey_t cursor_level_this_key(struct cursor *cursor)
{
assert(cursor->level < cursor->btree->root.depth);
return be64_to_cpu((cursor->path[cursor->level].next - 1)->key);
}
/*
* Read the root node and push it onto the cursor.
* < 0 - error
* 0 - success
*/
static int cursor_read_root(struct cursor *cursor)
{
struct btree *btree = cursor->btree;
struct buffer_head *buffer;
assert(has_root(btree));
buffer = vol_bread(btree->sb, btree->root.block);
if (!buffer)
return -EIO; /* FIXME: stupid, it might have been NOMEM */
assert(!bnode_sniff(bufdata(buffer)));
cursor_push(cursor, buffer, ((struct bnode *)bufdata(buffer))->entries);
return 0;
}
/*
* Move the cursor up to the parent node.
* 0 - there is no further parent (root was popped)
* 1 - there is a parent
*/
static int cursor_advance_up(struct cursor *cursor)
{
assert(cursor->level >= 0);
cursor_pop_blockput(cursor);
return cursor->level >= 0;
}
/*
* Move the cursor down to the child node or leaf, and update ->next.
* < 0 - error
* 0 - there is no further child (a leaf was pushed)
* 1 - there is a child
*/
static int cursor_advance_down(struct cursor *cursor)
{
struct btree *btree = cursor->btree;
struct buffer_head *buffer;
block_t child;
assert(cursor->level < btree->root.depth);
child = be64_to_cpu(cursor->path[cursor->level].next->block);
buffer = vol_bread(btree->sb, child);
if (!buffer)
return -EIO; /* FIXME: stupid, it might have been NOMEM */
cursor->path[cursor->level].next++;
if (cursor->level < btree->root.depth - 1) {
struct bnode *node = bufdata(buffer);
assert(!bnode_sniff(node));
cursor_push(cursor, buffer, node->entries);
cursor_check(cursor);
return 1;
}
assert(!btree->ops->leaf_sniff(btree, bufdata(buffer)));
cursor_push(cursor, buffer, NULL);
cursor_check(cursor);
return 0;
}
/*
* Advance the cursor for btree traversal.
* < 0 - error
* 0 - finished traversal
* 1 - reached the next leaf
*/
static int cursor_advance(struct cursor *cursor)
{
int ret;
do {
if (!cursor_advance_up(cursor))
return 0;
} while (cursor_level_finished(cursor));
do {
ret = cursor_advance_down(cursor);
if (ret < 0)
return ret;
} while (ret);
return 1;
}
/* Look up the index entry and set it as the next step down the path */
static void cursor_bnode_lookup(struct cursor *cursor, tuxkey_t key)
{
struct path_level *at = &cursor->path[cursor->level];
at->next = bnode_lookup(bufdata(at->buffer), key);
}
int btree_probe(struct cursor *cursor, tuxkey_t key)
{
int ret;
ret = cursor_read_root(cursor);
if (ret < 0)
return ret;
do {
cursor_bnode_lookup(cursor, key);
ret = cursor_advance_down(cursor);
if (ret < 0)
goto error;
} while (ret);
return 0;
error:
release_cursor(cursor);
return ret;
}
/*
* Traverse the btree over the specified range.
* key: start of traversal (the cursor should point at the leaf that includes key)
* len: length to traverse
*
* return value:
* < 0 - error
* 0 - traversed the whole range
* > 0 - traversal was stopped by func; this is func's return value
*/
int btree_traverse(struct cursor *cursor, tuxkey_t key, u64 len,
btree_traverse_func_t func, void *data)
{
struct btree *btree = cursor->btree;
int ret;
do {
tuxkey_t bottom = cursor_this_key(cursor);
tuxkey_t limit = cursor_next_key(cursor);
void *leaf = bufdata(cursor_leafbuf(cursor));
assert(!btree->ops->leaf_sniff(btree, leaf));
if (key < bottom) {
len -= min_t(u64, len, bottom - key);
if (len == 0)
break;
key = bottom;
}
ret = func(btree, bottom, limit, leaf, key, len, data);
/* Stop traverse if ret >= 1, or error */
if (ret)
goto out;
/* If next key is out of range, done */
if (key + len <= limit)
break;
ret = cursor_advance(cursor);
if (ret < 0)
goto out;
} while (ret);
ret = 0;
out:
return ret;
}
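/*
* A caller might use btree_traverse() roughly like this (a sketch;
* count_leaves is a hypothetical callback matching the func() call above,
* and error handling is omitted):
*
*	static int count_leaves(struct btree *btree, tuxkey_t bottom,
*				tuxkey_t limit, void *leaf,
*				tuxkey_t key, u64 len, void *data)
*	{
*		(*(unsigned *)data)++;
*		return 0;	// 0 => keep traversing
*	}
*
*	unsigned leaves = 0;
*	err = btree_probe(cursor, start);
*	if (!err) {
*		err = btree_traverse(cursor, start, len, count_leaves, &leaves);
*		release_cursor(cursor);
*	}
*/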
void show_tree_range(struct btree *btree, tuxkey_t start, unsigned count)
{
__tux3_dbg("%i level btree at %Li:\n",
btree->root.depth, btree->root.block);
if (!has_root(btree))
return;
struct cursor *cursor = alloc_cursor(btree, 0);
if (!cursor) {
tux3_err(btree->sb, "out of memory");
return;
}
if (btree_probe(cursor, start)) {
tux3_fs_error(btree->sb, "tell me why!!!");
goto out;
}
struct buffer_head *buffer;
do {
buffer = cursor_leafbuf(cursor);
assert(!btree->ops->leaf_sniff(btree, bufdata(buffer)));
(btree->ops->leaf_dump)(btree, bufdata(buffer));
} while (--count && cursor_advance(cursor));
out:
free_cursor(cursor);
}
void show_tree(struct btree *btree)
{
show_tree_range(btree, 0, -1);
}
static void level_redirect_blockput(struct cursor *cursor, int level, struct buffer_head *clone)
{
struct buffer_head *buffer = cursor->path[level].buffer;
struct index_entry *next = cursor->path[level].next;
/* If this level has ->next, update ->next to the clone buffer */
if (next)
next = ptr_redirect(next, bufdata(buffer), bufdata(clone));
memcpy(bufdata(clone), bufdata(buffer), bufsize(clone));
level_replace_blockput(cursor, level, clone, next);
}
static int leaf_need_redirect(struct sb *sb, struct buffer_head *buffer)
{
/* FIXME: a leaf doesn't have a delta number; we might want to
* remove this exception for leaves */
/* If this is not re-dirty, we need to redirect */
return !buffer_dirty(buffer);
}
static int bnode_need_redirect(struct sb *sb, struct buffer_head *buffer)
{
/* If this is not re-dirty for sb->unify, we need to redirect */
return !buffer_already_dirty(buffer, sb->unify);
}
/*
* Recursively redirect non-dirty buffers on the path in order to modify
* the leaf.
*
* The redirect order is from root to leaf. Otherwise, the blocks on the
* path would be allocated in reverse order.
*
* FIXME: We could allocate/copy blocks before changing the common ancestor
* (before the common ancestor is changed, the changes are not visible to
* readers). With this, we may be able to reduce locking time.
*/
int cursor_redirect(struct cursor *cursor)
{
struct btree *btree = cursor->btree;
struct sb *sb = btree->sb;
int level;
for (level = 0; level <= btree->root.depth; level++) {
struct buffer_head *buffer, *clone;
block_t parent, oldblock, newblock;
struct index_entry *entry;
int redirect, is_leaf = (level == btree->root.depth);
buffer = cursor->path[level].buffer;
/* If the buffer needs to be redirected to be dirtied, redirect it */
if (is_leaf)
redirect = leaf_need_redirect(sb, buffer);
else
redirect = bnode_need_redirect(sb, buffer);
/* No need to redirect */
if (!redirect)
continue;
/* Redirect buffer before changing */
clone = new_block(btree);
if (IS_ERR(clone))
return PTR_ERR(clone);
oldblock = bufindex(buffer);
newblock = bufindex(clone);
trace("redirect %Lx to %Lx", oldblock, newblock);
level_redirect_blockput(cursor, level, clone);
if (is_leaf) {
/* This is leaf buffer */
mark_buffer_dirty_atomic(clone);
log_leaf_redirect(sb, oldblock, newblock);
defer_bfree(sb, &sb->defree, oldblock, 1);
} else {
/* This is bnode buffer */
mark_buffer_unify_atomic(clone);
log_bnode_redirect(sb, oldblock, newblock);
defer_bfree(sb, &sb->deunify, oldblock, 1);
}
trace("update parent");
if (!level) {
/* Update pointer in btree->root */
trace("redirect root");
assert(oldblock == btree->root.block);
btree->root.block = newblock;
tux3_mark_btree_dirty(btree);
continue;
}
/* Update entry on parent for the redirected block */
parent = bufindex(cursor->path[level - 1].buffer);
entry = cursor->path[level - 1].next - 1;
entry->block = cpu_to_be64(newblock);
log_bnode_update(sb, parent, newblock, be64_to_cpu(entry->key));
}
cursor_check(cursor);
return 0;
}
/* Deletion */
static void bnode_remove_index(struct bnode *node, struct index_entry *p,
int count)
{
unsigned total = bcount(node);
void *end = node->entries + total;
memmove(p, p + count, end - (void *)(p + count));
node->count = cpu_to_be32(total - count);
}
static int bnode_merge_nodes(struct sb *sb, struct bnode *into,
struct bnode *from)
{
unsigned into_count = bcount(into), from_count = bcount(from);
if (from_count + into_count > sb->entries_per_node)
return 0;
veccopy(&into->entries[into_count], from->entries, from_count);
into->count = cpu_to_be32(into_count + from_count);
return 1;
}
static void adjust_parent_sep(struct cursor *cursor, int level, __be64 newsep)
{
/* Update the separating key up to the nearest common parent */
while (level >= 0) {
struct path_level *parent_at = &cursor->path[level];
struct index_entry *parent = parent_at->next - 1;
assert(0 < be64_to_cpu(parent->key));
assert(be64_to_cpu(parent->key) < be64_to_cpu(newsep));
log_bnode_adjust(cursor->btree->sb,
bufindex(parent_at->buffer),
be64_to_cpu(parent->key),
be64_to_cpu(newsep));
parent->key = newsep;
mark_buffer_unify_non(parent_at->buffer);
if (parent != level_node(cursor, level)->entries)
break;
level--;
}
}
/* Tracking info for chopped bnode indexes */
struct chopped_index_info {
tuxkey_t start;
int count;
};
static void remove_index(struct cursor *cursor, struct chopped_index_info *cii)
{
int level = cursor->level;
struct bnode *node = level_node(cursor, level);
struct chopped_index_info *ciil = &cii[level];
/* Collect chopped index in this node for logging later */
if (!ciil->count)
ciil->start = be64_to_cpu((cursor->path[level].next - 1)->key);
ciil->count++;
/* Remove an index */
bnode_remove_index(node, cursor->path[level].next - 1, 1);
--(cursor->path[level].next);
mark_buffer_unify_non(cursor->path[level].buffer);
/*
* Climb up to the common parent and update the separating key.
*
* What if the index is now empty? (no deleted key)
*
* Then some key above is going to be deleted and used to set the
* separator: climb the cursor while at the first entry, bail out at the
* root, find the node with the old separator, and set it to the deleted
* key.
*/
/* There is no separator for last entry or root node */
if (!level || cursor_level_finished(cursor))
return;
/* If removed index was not first entry, no change to separator */
if (cursor->path[level].next != node->entries)
return;
adjust_parent_sep(cursor, level - 1, cursor->path[level].next->key);
}
static int try_leaf_merge(struct btree *btree, struct buffer_head *intobuf,
struct buffer_head *frombuf)
{
struct vleaf *from = bufdata(frombuf);
struct vleaf *into = bufdata(intobuf);
/* Try to merge leaves */
if (btree->ops->leaf_merge(btree, into, from)) {
struct sb *sb = btree->sb;
/*
* We know frombuf is redirected and dirty. So, here, we can just
* cancel the leaf redirect by bfree(), instead of defer_bfree().
* FIXME: we could optimize freeing the leaf without redirecting it
* first, and if we did, this would no longer be true.
*/
bfree(sb, bufindex(frombuf), 1);
log_leaf_free(sb, bufindex(frombuf));
return 1;
}
return 0;
}
static int try_bnode_merge(struct sb *sb, struct buffer_head *intobuf,
struct buffer_head *frombuf)
{
struct bnode *into = bufdata(intobuf);
struct bnode *from = bufdata(frombuf);
/* Try to merge nodes */
if (bnode_merge_nodes(sb, into, from)) {
/*
* We know frombuf is redirected and dirty. So, here, we can just
* cancel the bnode redirect by bfree(), instead of defer_bfree().
* FIXME: we could optimize freeing the bnode without redirecting it
* first, and if we did, this would no longer be true.
*/
bfree(sb, bufindex(frombuf), 1);
log_bnode_merge(sb, bufindex(frombuf), bufindex(intobuf));
return 1;
}
return 0;
}
/*
* This is range deletion. So, instead of adjusting the balance of the
* space on sibling nodes for each change, this just removes the range
* and merges from right to left, even across nodes that do not share
* the same parent.
*
*              +------------------(A, B, C)-------------------+
*              |                         |                    |
*     +---(AA, AB, AC)--+         +(BA, BB, BC)-+       +(CA, CB, CC)+
*     |        |        |         |      |      |       |      |     |
* (AAA,AAB)(ABA,ABB)(ACA,ACB) (BAA,BAB)(BBA)(BCA,BCB) (CAA)(CBA,CBB)(CCA)
*
* [less : A, AA, AAA, AAB, AB, ABA, ABB, AC, ACA, ACB, B, BA ... : greater]
*
* If we merged from a cousin (or re-distributed), we may have to update
* the index up to the common parent. (E.g. if we removed (ACB), then
* merged (BAA,BAB) into (ACA), we have to adjust B in the root node to BB.)
*
* See adjust_parent_sep().
*
* FIXME: no re-distribution, so we don't guarantee better than 50% space
* efficiency. And if the range runs to the end of the keyspace (the
* truncate() case), we don't need to merge or call adjust_parent_sep().
*
* FIXME2: we may want to split the chop work into steps, instead of
* blocking for a long time.
*/
int btree_chop(struct btree *btree, tuxkey_t start, u64 len)
{
struct sb *sb = btree->sb;
struct btree_ops *ops = btree->ops;
struct buffer_head **prev, *leafprev = NULL;
struct chopped_index_info *cii;
struct cursor *cursor;
tuxkey_t limit;
int ret, done = 0;
if (!has_root(btree))
return 0;
/* Chop all range if len >= TUXKEY_LIMIT */
limit = (len >= TUXKEY_LIMIT) ? TUXKEY_LIMIT : start + len;
prev = kzalloc(sizeof(*prev) * btree->root.depth, GFP_NOFS);
if (prev == NULL)
return -ENOMEM;
cii = kzalloc(sizeof(*cii) * btree->root.depth, GFP_NOFS);
if (cii == NULL) {
ret = -ENOMEM;
goto error_cii;
}
cursor = alloc_cursor(btree, 0);
if (!cursor) {
ret = -ENOMEM;
goto error_alloc_cursor;
}
down_write(&btree->lock);
ret = btree_probe(cursor, start);
if (ret)
goto error_btree_probe;
/* Walk leaves */
while (1) {
struct buffer_head *leafbuf;
tuxkey_t this_key;
/*
* FIXME: If the leaf will be merged and freed later, we don't
* need to redirect it or call leaf_chop()
*/
if ((ret = cursor_redirect(cursor)))
goto out;
leafbuf = cursor_pop(cursor);
/* Adjust start and len for this leaf */
this_key = cursor_level_this_key(cursor);
if (start < this_key) {
if (limit < TUXKEY_LIMIT)
len -= this_key - start;
start = this_key;
}
ret = ops->leaf_chop(btree, start, len, bufdata(leafbuf));
if (ret) {
if (ret < 0) {
blockput(leafbuf);
goto out;
}
mark_buffer_dirty_non(leafbuf);
}
/* Try to merge this leaf with prev */
if (leafprev) {
if (try_leaf_merge(btree, leafprev, leafbuf)) {
trace(">>> can merge leaf %p into leaf %p", leafbuf, leafprev);
remove_index(cursor, cii);
mark_buffer_dirty_non(leafprev);
blockput_free(sb, leafbuf);
goto keep_prev_leaf;
}
blockput(leafprev);
}
leafprev = leafbuf;
keep_prev_leaf:
if (cursor_level_next_key(cursor) >= limit)
done = 1;
/* Pop and try to merge finished nodes */
while (done || cursor_level_finished(cursor)) {
struct buffer_head *buf;
int level = cursor->level;
struct chopped_index_info *ciil = &cii[level];
/* Get the merge source buffer, and go to the parent level */
buf = cursor_pop(cursor);
/*
* Log chopped indexes.
* FIXME: If the node is freed later (e.g. merged),
* we don't need to log this
*/
if (ciil->count) {
log_bnode_del(sb, bufindex(buf), ciil->start,
ciil->count);
}
memset(ciil, 0, sizeof(*ciil));
/* Try to merge node with prev */
if (prev[level]) {
assert(level);
if (try_bnode_merge(sb, prev[level], buf)) {
trace(">>> can merge node %p into node %p", buf, prev[level]);
remove_index(cursor, cii);
mark_buffer_unify_non(prev[level]);
blockput_free_unify(sb, buf);
goto keep_prev_node;
}
blockput(prev[level]);
}
prev[level] = buf;
keep_prev_node:
if (!level)
goto chop_root;
}
/* Push back down to leaf level */
do {
ret = cursor_advance_down(cursor);
if (ret < 0)
goto out;
} while (ret);
}
chop_root:
/* Remove depth if possible */
while (btree->root.depth > 1 && bcount(bufdata(prev[0])) == 1) {
trace("drop btree level");
btree->root.block = bufindex(prev[1]);
btree->root.depth--;
tux3_mark_btree_dirty(btree);
/*
* We know prev[0] is redirected and dirty. So, here, we can just
* cancel the bnode redirect by bfree(), instead of defer_bfree().
* FIXME: we could optimize freeing the bnode without redirecting it
* first, and if we did, this would no longer be true.
*/
bfree(sb, bufindex(prev[0]), 1);
log_bnode_free(sb, bufindex(prev[0]));
blockput_free_unify(sb, prev[0]);
vecmove(prev, prev + 1, btree->root.depth);
}
ret = 0;
out:
if (leafprev)
blockput(leafprev);
for (int i = 0; i < btree->root.depth; i++) {
if (prev[i])
blockput(prev[i]);
}
release_cursor(cursor);
error_btree_probe:
up_write(&btree->lock);
free_cursor(cursor);
error_alloc_cursor:
kfree(cii);
error_cii:
kfree(prev);
return ret;
}
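/*
* Usage sketch: since len >= TUXKEY_LIMIT means "chop everything from start
* onward", truncating the tail of a tree and dropping a whole tree both
* reduce to single calls (hole is a hypothetical starting key):
*
*	err = btree_chop(btree, hole, TUXKEY_LIMIT);	// chop the tail
*	err = btree_chop(btree, 0, TUXKEY_LIMIT);	// chop the whole tree
*/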
/* root must be zero-initialized */
static void bnode_init_root(struct bnode *root, unsigned count, block_t left,
block_t right, tuxkey_t rkey)
{
root->count = cpu_to_be32(count);
root->entries[0].block = cpu_to_be64(left);
root->entries[1].block = cpu_to_be64(right);
root->entries[1].key = cpu_to_be64(rkey);
}
/* Insertion */
static void bnode_add_index(struct bnode *node, struct index_entry *p,
block_t child, u64 childkey)
{
unsigned count = bcount(node);
vecmove(p + 1, p, node->entries + count - p);
p->block = cpu_to_be64(child);
p->key = cpu_to_be64(childkey);
node->count = cpu_to_be32(count + 1);
}
static void bnode_split(struct bnode *src, unsigned pos, struct bnode *dst)
{
dst->count = cpu_to_be32(bcount(src) - pos);
src->count = cpu_to_be32(pos);
memcpy(&dst->entries[0], &src->entries[pos],
bcount(dst) * sizeof(struct index_entry));
}
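/*
* For example, splitting a node holding entries [E0 E1 E2 E3] at pos = 2
* leaves [E0 E1] in src (count 2) and copies [E2 E3] into dst (count 2);
* dst->entries[0].key then becomes the separating key in the parent.
*/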
/*
* Insert a new leaf at the next cursor position.
* keep == 1: keep the current cursor position.
* keep == 0: set the cursor position to the new leaf.
*/
static int insert_leaf(struct cursor *cursor, tuxkey_t childkey, struct buffer_head *leafbuf, int keep)
{
struct btree *btree = cursor->btree;
struct sb *sb = btree->sb;
int level = btree->root.depth;
block_t childblock = bufindex(leafbuf);
if (keep)
blockput(leafbuf);
else {
cursor_pop_blockput(cursor);
cursor_push(cursor, leafbuf, NULL);
}
while (level--) {
struct path_level *at = &cursor->path[level];
struct buffer_head *parentbuf = at->buffer;
struct bnode *parent = bufdata(parentbuf);
/* insert and exit if not full */
if (bcount(parent) < btree->sb->entries_per_node) {
bnode_add_index(parent, at->next, childblock, childkey);
if (!keep)
at->next++;
log_bnode_add(sb, bufindex(parentbuf), childblock, childkey);
mark_buffer_unify_non(parentbuf);
cursor_check(cursor);
return 0;
}
/* split a full index node */
struct buffer_head *newbuf = new_node(btree);
if (IS_ERR(newbuf))
return PTR_ERR(newbuf);
struct bnode *newnode = bufdata(newbuf);
unsigned half = bcount(parent) / 2;
u64 newkey = be64_to_cpu(parent->entries[half].key);
bnode_split(parent, half, newnode);
log_bnode_split(sb, bufindex(parentbuf), half, bufindex(newbuf));
/* if the cursor is in the new node, use that as the parent */
int child_is_left = at->next <= parent->entries + half;
if (!child_is_left) {
struct index_entry *newnext;
mark_buffer_unify_non(parentbuf);
newnext = newnode->entries + (at->next - &parent->entries[half]);
get_bh(newbuf);
level_replace_blockput(cursor, level, newbuf, newnext);
parentbuf = newbuf;
parent = newnode;
} else
mark_buffer_unify_non(newbuf);
bnode_add_index(parent, at->next, childblock, childkey);
if (!keep)
at->next++;
log_bnode_add(sb, bufindex(parentbuf), childblock, childkey);
mark_buffer_unify_non(parentbuf);
childkey = newkey;
childblock = bufindex(newbuf);
blockput(newbuf);
/*
* If the child is in the left bnode, we keep the cursor position at
* the child; otherwise we adjust the cursor to the new bnode.
*/
keep = child_is_left;
}
/* Make new root bnode */
trace("add tree level");
struct buffer_head *newbuf = new_node(btree);
if (IS_ERR(newbuf))
return PTR_ERR(newbuf);
struct bnode *newroot = bufdata(newbuf);
block_t newrootblock = bufindex(newbuf);
block_t oldrootblock = btree->root.block;
int left_node = bufindex(cursor->path[0].buffer) != childblock;
bnode_init_root(newroot, 2, oldrootblock, childblock, childkey);
cursor_root_add(cursor, newbuf, newroot->entries + 1 + !left_node);
log_bnode_root(sb, newrootblock, 2, oldrootblock, childblock, childkey);
/* Change the btree to point to the new root */
btree->root.block = newrootblock;
btree->root.depth++;
mark_buffer_unify_non(newbuf);
tux3_mark_btree_dirty(btree);
cursor_check(cursor);
return 0;
}
/* Insert a new leaf at the next cursor position, then set the cursor to the new leaf */
int btree_insert_leaf(struct cursor *cursor, tuxkey_t key, struct buffer_head *leafbuf)
{
return insert_leaf(cursor, key, leafbuf, 0);
}
/*
* Split the leaf, then insert into the parent.
* key: key to add after the split (the cursor will point at the leaf that includes key)
* hint: hint for the split
*
* return value:
* 0 - success
* < 0 - error
*/
static int btree_leaf_split(struct cursor *cursor, tuxkey_t key, tuxkey_t hint)
{
trace("split leaf");
struct btree *btree = cursor->btree;
struct buffer_head *newbuf;
newbuf = new_leaf(btree);
if (IS_ERR(newbuf))
return PTR_ERR(newbuf);
log_balloc(btree->sb, bufindex(newbuf), 1);
struct buffer_head *leafbuf = cursor_leafbuf(cursor);
tuxkey_t newkey = btree->ops->leaf_split(btree, hint, bufdata(leafbuf),
bufdata(newbuf));
assert(cursor_this_key(cursor) < newkey);
assert(newkey < cursor_next_key(cursor));
if (key < newkey)
mark_buffer_dirty_non(newbuf);
else
mark_buffer_dirty_non(leafbuf);
return insert_leaf(cursor, newkey, newbuf, key < newkey);
}
static int btree_advance(struct cursor *cursor, struct btree_key_range *key)
{
tuxkey_t limit = cursor_next_key(cursor);
int skip = 0;
while (key->start >= limit) {
int ret = cursor_advance(cursor);
assert(ret != 0); /* wrong key range? */
if (ret < 0)
return ret;
limit = cursor_next_key(cursor);
skip++;
}
if (skip > 1) {
/* key should be on the next leaf */
tux3_dbg("skipped more than 1 leaf: why, and probe is better");
assert(0);
}
return 0;
}
int noop_pre_write(struct btree *btree, tuxkey_t key_bottom, tuxkey_t key_limit,
void *leaf, struct btree_key_range *key)
{
return BTREE_DO_DIRTY;
}
int btree_write(struct cursor *cursor, struct btree_key_range *key)
{
struct btree *btree = cursor->btree;
struct btree_ops *ops = btree->ops;
tuxkey_t split_hint;
int err;
while (key->len > 0) {
tuxkey_t bottom, limit;
void *leaf;
int ret;
err = btree_advance(cursor, key);
if (err)
return err; /* FIXME: error handling */
bottom = cursor_this_key(cursor);
limit = cursor_next_key(cursor);
assert(bottom <= key->start && key->start < limit);
leaf = bufdata(cursor_leafbuf(cursor));
ret = ops->leaf_pre_write(btree, bottom, limit, leaf, key);
assert(ret >= 0);
if (ret == BTREE_DO_RETRY)
continue;
if (ret == BTREE_DO_DIRTY) {
err = cursor_redirect(cursor);
if (err)
return err; /* FIXME: error handling */
/* Reread leaf after redirect */
leaf = bufdata(cursor_leafbuf(cursor));
assert(!ops->leaf_sniff(btree, leaf));
ret = ops->leaf_write(btree, bottom, limit, leaf, key,
&split_hint);
if (ret < 0)
return ret;
if (ret == BTREE_DO_RETRY) {
mark_buffer_dirty_non(cursor_leafbuf(cursor));
continue;
}
}
if (ret == BTREE_DO_SPLIT) {
err = btree_leaf_split(cursor, key->start, split_hint);
if (err)
return err; /* FIXME: error handling */
}
}
return 0;
}
int btree_read(struct cursor *cursor, struct btree_key_range *key)
{
struct btree *btree = cursor->btree;
struct btree_ops *ops = btree->ops;
void *leaf = bufdata(cursor_leafbuf(cursor));
tuxkey_t bottom = cursor_this_key(cursor);
tuxkey_t limit = cursor_next_key(cursor);
/* FIXME: it might be better to support multiple leaves */
assert(bottom <= key->start && key->start < limit);
assert(!ops->leaf_sniff(btree, leaf));
return ops->leaf_read(btree, bottom, limit, leaf, key);
}
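/*
* The expected calling pattern is roughly the following (a sketch only;
* the real callers and their locking live in the per-btree code, and key
* stands for whatever structure embeds the btree_key_range):
*
*	struct cursor *cursor = alloc_cursor(btree, 0);
*	if (!cursor)
*		return -ENOMEM;
*	down_read(&btree->lock);
*	err = btree_probe(cursor, key->start);
*	if (!err) {
*		err = btree_read(cursor, key);
*		release_cursor(cursor);
*	}
*	up_read(&btree->lock);
*	free_cursor(cursor);
*/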
void init_btree(struct btree *btree, struct sb *sb, struct root root, struct btree_ops *ops)
{
btree->sb = sb;
btree->ops = ops;
btree->root = root;
init_rwsem(&btree->lock);
ops->btree_init(btree);
}
int alloc_empty_btree(struct btree *btree)
{
struct sb *sb = btree->sb;
struct buffer_head *rootbuf = new_node(btree);
if (IS_ERR(rootbuf))
goto error;
struct buffer_head *leafbuf = new_leaf(btree);
if (IS_ERR(leafbuf))
goto error_leafbuf;
assert(!has_root(btree));
struct bnode *rootnode = bufdata(rootbuf);
block_t rootblock = bufindex(rootbuf);
block_t leafblock = bufindex(leafbuf);
trace("root at %Lx", rootblock);
trace("leaf at %Lx", leafblock);
bnode_init_root(rootnode, 1, leafblock, 0, 0);
log_bnode_root(sb, rootblock, 1, leafblock, 0, 0);
log_balloc(sb, leafblock, 1);
mark_buffer_unify_non(rootbuf);
blockput(rootbuf);
mark_buffer_dirty_non(leafbuf);
blockput(leafbuf);
btree->root = (struct root){ .block = rootblock, .depth = 1 };
tux3_mark_btree_dirty(btree);
return 0;
error_leafbuf:
bfree(sb, bufindex(rootbuf), 1);
blockput(rootbuf);
rootbuf = leafbuf;
error:
return PTR_ERR(rootbuf);
}
/* FIXME: right? and this should be done by btree_chop()? */
int free_empty_btree(struct btree *btree)
{
struct btree_ops *ops = btree->ops;
if (!has_root(btree))
return 0;
assert(btree->root.depth == 1);
struct sb *sb = btree->sb;
struct buffer_head *rootbuf = vol_bread(sb, btree->root.block);
if (!rootbuf)
return -EIO;
assert(!bnode_sniff(bufdata(rootbuf)));
/* Make the btree have no root */
btree->root = no_root;
tux3_mark_btree_dirty(btree);
struct bnode *rootnode = bufdata(rootbuf);
assert(bcount(rootnode) == 1);
block_t leaf = be64_to_cpu(rootnode->entries[0].block);
struct buffer_head *leafbuf = vol_find_get_block(sb, leaf);
if (leafbuf && !leaf_need_redirect(sb, leafbuf)) {
/*
* This is a redirected leaf. So, here, we can just cancel the
* leaf redirect by bfree(), instead of defer_bfree().
*/
bfree(sb, leaf, 1);
log_leaf_free(sb, leaf);
assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
blockput_free(sb, leafbuf);
} else {
defer_bfree(sb, &sb->defree, leaf, 1);
log_bfree(sb, leaf, 1);
if (leafbuf) {
assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
blockput(leafbuf);
}
}
if (!bnode_need_redirect(sb, rootbuf)) {
/*
* This is a redirected bnode. So, here, we can just cancel the
* bnode redirect by bfree(), instead of defer_bfree().
*/
bfree(sb, bufindex(rootbuf), 1);
log_bnode_free(sb, bufindex(rootbuf));
blockput_free_unify(sb, rootbuf);
} else {
defer_bfree(sb, &sb->deunify, bufindex(rootbuf), 1);
log_bfree_on_unify(sb, bufindex(rootbuf), 1);
blockput(rootbuf);
}
return 0;
}
int replay_bnode_redirect(struct replay *rp, block_t oldblock, block_t newblock)
{
struct sb *sb = rp->sb;
struct buffer_head *newbuf, *oldbuf;
int err = 0;
newbuf = vol_getblk(sb, newblock);
if (!newbuf) {
err = -ENOMEM; /* FIXME: error code */
goto error;
}
oldbuf = vol_bread(sb, oldblock);
if (!oldbuf) {
err = -EIO; /* FIXME: error code */
goto error_put_newbuf;
}
assert(!bnode_sniff(bufdata(oldbuf)));
memcpy(bufdata(newbuf), bufdata(oldbuf), bufsize(newbuf));
mark_buffer_unify_atomic(newbuf);
blockput(oldbuf);
error_put_newbuf:
blockput(newbuf);
error:
return err;
}
int replay_bnode_root(struct replay *rp, block_t root, unsigned count,
block_t left, block_t right, tuxkey_t rkey)
{
struct sb *sb = rp->sb;
struct buffer_head *rootbuf;
rootbuf = vol_getblk(sb, root);
if (!rootbuf)
return -ENOMEM;
bnode_buffer_init(rootbuf);
bnode_init_root(bufdata(rootbuf), count, left, right, rkey);
mark_buffer_unify_atomic(rootbuf);
blockput(rootbuf);
return 0;
}
/*
* Before this replay, the replay process should already have dirtied
* the buffer of src (e.g. by redirect).
*/
int replay_bnode_split(struct replay *rp, block_t src, unsigned pos,
block_t dst)
{
struct sb *sb = rp->sb;
struct buffer_head *srcbuf, *dstbuf;
int err = 0;
srcbuf = vol_getblk(sb, src);
if (!srcbuf) {
err = -ENOMEM; /* FIXME: error code */
goto error;
}
dstbuf = vol_getblk(sb, dst);
if (!dstbuf) {
err = -ENOMEM; /* FIXME: error code */
goto error_put_srcbuf;
}
bnode_buffer_init(dstbuf);
bnode_split(bufdata(srcbuf), pos, bufdata(dstbuf));
mark_buffer_unify_non(srcbuf);
mark_buffer_unify_atomic(dstbuf);
blockput(dstbuf);
error_put_srcbuf:
blockput(srcbuf);
error:
return err;
}
/*
* Before this replay, the replay process should already have dirtied
* the buffer of bnodeblock (e.g. by redirect).
*/
static int replay_bnode_change(struct sb *sb, block_t bnodeblock,
u64 val1, u64 val2,
void (*change)(struct bnode *, u64, u64))
{
struct buffer_head *bnodebuf;
bnodebuf = vol_getblk(sb, bnodeblock);
if (!bnodebuf)
return -ENOMEM; /* FIXME: error code */
struct bnode *bnode = bufdata(bnodebuf);
change(bnode, val1, val2);
mark_buffer_unify_non(bnodebuf);
blockput(bnodebuf);
return 0;
}
static void add_func(struct bnode *bnode, u64 child, u64 key)
{
struct index_entry *entry = bnode_lookup(bnode, key) + 1;
bnode_add_index(bnode, entry, child, key);
}
int replay_bnode_add(struct replay *rp, block_t parent, block_t child,
tuxkey_t key)
{
return replay_bnode_change(rp->sb, parent, child, key, add_func);
}
static void update_func(struct bnode *bnode, u64 child, u64 key)
{
struct index_entry *entry = bnode_lookup(bnode, key);
assert(be64_to_cpu(entry->key) == key);
entry->block = cpu_to_be64(child);
}
int replay_bnode_update(struct replay *rp, block_t parent, block_t child,
tuxkey_t key)
{
return replay_bnode_change(rp->sb, parent, child, key, update_func);
}
int replay_bnode_merge(struct replay *rp, block_t src, block_t dst)
{
struct sb *sb = rp->sb;
struct buffer_head *srcbuf, *dstbuf;
int err = 0, ret;
srcbuf = vol_getblk(sb, src);
if (!srcbuf) {
err = -ENOMEM; /* FIXME: error code */
goto error;
}
dstbuf = vol_getblk(sb, dst);
if (!dstbuf) {
err = -ENOMEM; /* FIXME: error code */
goto error_put_srcbuf;
}
ret = bnode_merge_nodes(sb, bufdata(dstbuf), bufdata(srcbuf));
assert(ret == 1);
mark_buffer_unify_non(dstbuf);
mark_buffer_unify_non(srcbuf);
blockput(dstbuf);
error_put_srcbuf:
blockput_free_unify(sb, srcbuf);
error:
return err;
}
static void del_func(struct bnode *bnode, u64 key, u64 count)
{
struct index_entry *entry = bnode_lookup(bnode, key);
assert(be64_to_cpu(entry->key) == key);
bnode_remove_index(bnode, entry, count);
}
int replay_bnode_del(struct replay *rp, block_t bnode, tuxkey_t key,
unsigned count)
{
return replay_bnode_change(rp->sb, bnode, key, count, del_func);
}
static void adjust_func(struct bnode *bnode, u64 from, u64 to)
{
struct index_entry *entry = bnode_lookup(bnode, from);
assert(be64_to_cpu(entry->key) == from);
entry->key = cpu_to_be64(to);
}
int replay_bnode_adjust(struct replay *rp, block_t bnode, tuxkey_t from,
tuxkey_t to)
{
return replay_bnode_change(rp->sb, bnode, from, to, adjust_func);
}