/*
* Inode btree leaf operations.
*
* Copyright (c) 2008-2014 Daniel Phillips
* Copyright (c) 2008-2014 OGAWA Hirofumi
*/
#include "tux3.h"
#include "ileaf.h"
#ifndef trace
#define trace trace_on
#endif
struct ileaf {
__be16 magic; /* Magic number */
	__be16 count;		/* Count of used offset info entries */
u32 pad;
__be64 ibase; /* Base inode number */
char table[]; /* ileaf data: inode attrs ... offset info */
};
/*
* inode leaf format
*
* A leaf has a small header followed by a table of attributes. A vector of
* offsets within the block grows down from the top of the leaf towards the
* top of the attribute table, indexed by the difference between inum and
* leaf->ibase, the base inum of the table block.
*/
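/*
 * For illustration, a leaf at ibase holding three inodes whose encoded
 * attributes take 24, 0 and 40 bytes would look like:
 *
 *	table:	[24 bytes for ibase][40 bytes for ibase + 2] ...free... [dict]
 *	dict:	dict[-1] = 24, dict[-2] = 24, dict[-3] = 64
 *
 * Entry "at" starts at atdict(dict, at) and ends at __atdict(dict, at + 1);
 * two equal neighbouring offsets (here dict[-1] and dict[-2]) mark an inum
 * with no attributes stored in this leaf.
 */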
static inline __be16 *ileaf_dict(struct btree *btree, struct ileaf *ileaf)
{
return (void *)ileaf + btree->sb->blocksize;
}
static inline unsigned __atdict(__be16 *dict, unsigned at)
{
assert(at);
return be16_to_cpu(*(dict - at));
}
static inline unsigned atdict(__be16 *dict, unsigned at)
{
return at ? __atdict(dict, at) : 0;
}
static inline void add_idict(__be16 *dict, int n)
{
be16_add_cpu(dict, n);
}
static inline unsigned icount(struct ileaf *leaf)
{
return be16_to_cpu(leaf->count);
}
static inline tuxkey_t ibase(struct ileaf *leaf)
{
return be64_to_cpu(leaf->ibase);
}
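/*
 * entries_per_leaf is the nominal number of inodes per ileaf (blocksize / 64,
 * i.e. an average of 64 bytes of encoded attributes per inode); it is also
 * used below to align split positions.
 */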
static void ileaf_btree_init(struct btree *btree)
{
btree->entries_per_leaf = 1 << (btree->sb->blockbits - 6);
}
static int ileaf_init(struct btree *btree, void *leaf)
{
struct ileaf_attr_ops *attr_ops = btree->ops->private_ops;
trace("initialize inode leaf %p", leaf);
*(struct ileaf *)leaf = (struct ileaf){
.magic = attr_ops->magic,
};
return 0;
}
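/* Bytes in use: encoded attributes plus the offset dict (header excluded) */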
static int ileaf_need(struct btree *btree, struct ileaf *ileaf)
{
__be16 *dict = ileaf_dict(btree, ileaf);
unsigned count = icount(ileaf);
return atdict(dict, count) + count * sizeof(*dict);
}
static int ileaf_free(struct btree *btree, struct ileaf *ileaf)
{
return btree->sb->blocksize
- ileaf_need(btree, ileaf) - sizeof(struct ileaf);
}
static int ileaf_sniff(struct btree *btree, void *leaf)
{
struct ileaf_attr_ops *attr_ops = btree->ops->private_ops;
if (((struct ileaf *)leaf)->magic != attr_ops->magic)
return -1;
return 0;
}
static int ileaf_can_free(struct btree *btree, void *leaf)
{
struct ileaf *ileaf = leaf;
return icount(ileaf) == 0;
}
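/*
 * Look up the encoded attributes for inum in this leaf.  Returns a pointer
 * into the leaf and stores the attribute size in *result, or returns NULL
 * with *result set to zero if the inum has no attributes here.
 */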
void *ileaf_lookup(struct btree *btree, inum_t inum, struct ileaf *leaf, unsigned *result)
{
assert(inum >= ibase(leaf));
tuxkey_t at = inum - ibase(leaf), size = 0;
void *attrs = NULL;
trace("lookup inode 0x%Lx, %Lx + %Lx", inum, ibase(leaf), at);
if (at < icount(leaf)) {
__be16 *dict = ileaf_dict(btree, leaf);
unsigned offset = atdict(dict, at);
if ((size = __atdict(dict, at + 1) - offset))
attrs = leaf->table + offset;
}
*result = size;
return attrs;
}
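/* Verify that the offset dict never decreases from one entry to the next */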
static int isinorder(struct btree *btree, struct ileaf *leaf)
{
__be16 *dict = ileaf_dict(btree, leaf);
for (int i = 0, offset = 0, limit; i < icount(leaf); i++, offset = limit)
if ((limit = __atdict(dict, i + 1)) < offset)
return 0;
return 1;
}
/* userland only */
int ileaf_check(struct btree *btree, struct ileaf *leaf)
{
struct ileaf_attr_ops *attr_ops = btree->ops->private_ops;
char *why;
why = "not an ileaf";
if (leaf->magic != attr_ops->magic)
goto eek;
why = "dict out of order";
if (!isinorder(btree, leaf))
goto eek;
return 0;
eek:
tux3_err(btree->sb, "%s!", why);
return -1;
}
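/* Drop trailing dict entries that cover no attribute bytes */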
static void ileaf_trim(struct btree *btree, struct ileaf *leaf)
{
__be16 *dict = ileaf_dict(btree, leaf);
unsigned count = icount(leaf);
while (count > 1 && *(dict - count) == *(dict - count + 1))
count--;
if (count == 1 && !*(dict - 1))
count = 0;
leaf->count = cpu_to_be16(count);
}
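/*
 * Split "from" around the hint inum: entries at and above the split point
 * move to "into" together with their attributes, their offsets are rebased,
 * and "into" gets a new ibase.  Returns the ibase of "into".
 */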
#define SPLIT_AT_INUM
static tuxkey_t ileaf_split(struct btree *btree, tuxkey_t hint,
void *from, void *into)
{
assert(!ileaf_sniff(btree, from));
struct ileaf *leaf = from, *dest = into;
__be16 *dict = ileaf_dict(btree, from);
__be16 *destdict = ileaf_dict(btree, into);
#ifdef SPLIT_AT_INUM
	/*
	 * This is to prevent "from" and "into" from ending up with the same
	 * ibase.
	 * FIXME: we would want to split at a better position.
	 */
if (hint == ibase(leaf))
hint++;
trace("split at inum 0x%Lx", hint);
unsigned at = min_t(tuxkey_t, hint - ibase(leaf), icount(leaf));
#else
/* binsearch inum starting nearest middle of block */
unsigned at = 1, hi = icount(leaf);
while (at < hi) {
int mid = (at + hi) / 2;
		if (be16_to_cpu(*(dict - mid)) < (btree->sb->blocksize / 2))
at = mid + 1;
else
hi = mid;
}
#endif
/* should trim leading empty inodes on copy */
unsigned split = atdict(dict, at), free = atdict(dict, icount(leaf));
trace("split at %x of %x", at, icount(leaf));
trace("copy out %x bytes at %x", free - split, split);
assert(free >= split);
memcpy(dest->table, leaf->table + split, free - split);
dest->count = cpu_to_be16(icount(leaf) - at);
veccopy(destdict - icount(dest), dict - icount(leaf), icount(dest));
for (int i = 1; i <= icount(dest); i++)
add_idict(destdict - i, -split);
#ifdef SPLIT_AT_INUM
	/* Round down to a multiple of entries_per_leaf above ibase */
inum_t round = hint & ~(inum_t)(btree->entries_per_leaf - 1);
dest->ibase = cpu_to_be64(round > ibase(leaf) + icount(leaf) ? round : hint);
#else
dest->ibase = cpu_to_be64(ibase(leaf) + at);
#endif
leaf->count = cpu_to_be16(at);
memset(leaf->table + split, 0, (char *)(dict - icount(leaf)) - (leaf->table + split));
ileaf_trim(btree, leaf);
return ibase(dest);
}
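/*
 * Merge "from" (the leaf with the higher ibase) into "into": pad "into"'s
 * dict with empty entries up to from's ibase, append from's attribute data
 * and copy from's dict with offsets rebased.  Returns 1 on success (or if
 * "from" is empty), 0 if "into" does not have enough free space.
 */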
static int ileaf_merge(struct btree *btree, void *vinto, void *vfrom)
{
struct ileaf *into = vinto, *from = vfrom;
unsigned fromcount = icount(from);
/* If "from" is empty, does nothing */
if (!fromcount)
return 1;
assert(ibase(from) > ibase(into));
tuxkey_t fromibase = ibase(from);
unsigned count = icount(into);
	/* Number of empty dict entries needed between "into"'s last inum and fromibase */
	int hole = fromibase - ibase(into) - count;
__be16 *dict = ileaf_dict(btree, into);
__be16 *fromdict = ileaf_dict(btree, from);
int need_size = hole * sizeof(*dict) + ileaf_need(btree, from);
if (ileaf_free(btree, into) < need_size)
return 0;
	/* Fill the dict hole with empty entries up to fromibase */
unsigned limit = atdict(dict, count);
__be16 __limit = cpu_to_be16(limit);
while (hole--) {
count++;
*(dict - count) = __limit;
}
/* Copy data from "from" */
unsigned fromlimit = atdict(fromdict, fromcount);
memcpy(into->table + limit, from->table, fromlimit);
	/* Rebase from's dict offsets by the copy destination offset in "into" */
	if (limit) {
		int i;
		for (i = 1; i <= fromcount; i++)
			add_idict(fromdict - i, limit);
	}
veccopy(dict - count - fromcount, fromdict - fromcount, fromcount);
into->count = cpu_to_be16(count + fromcount);
return 1;
}
/*
* Chop inums
* return value:
* < 0 - error
* 1 - modified
* 0 - not modified
*/
static int ileaf_chop(struct btree *btree, tuxkey_t start, u64 len, void *leaf)
{
struct ileaf *ileaf = leaf;
__be16 *dict = ileaf_dict(btree, leaf);
tuxkey_t base = ibase(ileaf);
unsigned count = icount(ileaf);
tuxkey_t at = start - base;
void *startp, *endp, *tailp;
unsigned size;
if (at + 1 > count)
return 0;
len = min_t(u64, len, count - at);
startp = ileaf->table + atdict(dict, at);
endp = ileaf->table + atdict(dict, at + len);
if (startp == endp)
return 0;
/* Remove data */
tailp = ileaf->table + atdict(dict, count);
memmove(startp, endp, tailp - endp);
/* Adjust dict */
size = endp - startp;
while (at < count) {
at++;
add_idict(dict - at, -size);
}
ileaf_trim(btree, leaf);
return 1;
}
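/*
 * Make room for newsize bytes of attributes for inum: extend the dict with
 * empty entries if inum lies beyond the current count, then grow or shrink
 * the existing attribute space by moving later attributes and adjusting
 * their offsets.  Returns a pointer to the attribute space, or NULL if the
 * leaf cannot hold it.
 */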
static void *ileaf_resize(struct btree *btree, tuxkey_t inum, void *vleaf,
int newsize)
{
struct ileaf *ileaf = vleaf;
__be16 *dict = ileaf_dict(btree, ileaf);
unsigned count = icount(ileaf);
tuxkey_t at = inum - ibase(ileaf);
int extend_dict, offset, size;
assert(inum >= ibase(ileaf));
	/* Get existing attributes, and calculate expand/shrink size */
if (at + 1 > count) {
		/* Check size roughly to avoid int overflow */
if ((at + 1) * sizeof(*dict) >= btree->sb->blocksize)
goto overflow;
/* Need to extend dict */
extend_dict = (at + 1 - count) * sizeof(*dict);
offset = atdict(dict, count);
size = 0;
} else {
/* "at" is in dict, so get attr size */
extend_dict = 0;
offset = atdict(dict, at);
size = __atdict(dict, at + 1) - offset;
}
if (ileaf_free(btree, ileaf) < newsize - size + extend_dict) {
overflow:
return NULL;
}
/* Extend dict */
if (extend_dict) {
__be16 limit = cpu_to_be16(atdict(dict, count));
while (count < at + 1) {
count++;
*(dict - count) = limit;
}
ileaf->count = cpu_to_be16(count);
}
void *attrs = ileaf->table + offset;
if (newsize != size) {
/* Expand/Shrink attr space */
unsigned limit = __atdict(dict, count);
assert(limit >= offset + size);
memmove(attrs + newsize, attrs + size, limit - offset - size);
/* Adjust dict */
int diff = newsize - size;
at++;
while (at <= count) {
add_idict(dict - at, diff);
at++;
}
}
return attrs;
}
static tuxkey_t ileaf_split_hint(struct btree *btree, struct ileaf *ileaf,
tuxkey_t key, int size)
{
/*
* FIXME: make sure there is space for size.
* FIXME: better split position?
*/
tuxkey_t base = ibase(ileaf);
unsigned count = icount(ileaf);
if (key >= base + count)
return key & ~(btree->entries_per_leaf - 1);
return base + count / 2;
}
/*
* Write inode attributes.
*/
static int ileaf_write(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit,
void *leaf, struct btree_key_range *key,
tuxkey_t *split_hint)
{
struct ileaf_req *rq = container_of(key, struct ileaf_req, key);
struct ileaf_attr_ops *attr_ops = btree->ops->private_ops;
struct ileaf *ileaf = leaf;
void *attrs;
int size;
assert(key->len == 1);
size = attr_ops->encoded_size(btree, rq->data);
assert(size);
attrs = ileaf_resize(btree, key->start, ileaf, size);
if (attrs == NULL) {
		/* There is no space left to store the attributes */
*split_hint = ileaf_split_hint(btree, ileaf, key->start, size);
return BTREE_DO_SPLIT;
}
attr_ops->encode(btree, rq->data, attrs, size);
key->start++;
key->len--;
return BTREE_DO_RETRY;
}
static int ileaf_read(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit,
void *leaf, struct btree_key_range *key)
{
struct ileaf_req *rq = container_of(key, struct ileaf_req, key);
struct ileaf_attr_ops *attr_ops = btree->ops->private_ops;
struct ileaf *ileaf = leaf;
void *attrs;
unsigned size;
attrs = ileaf_lookup(btree, key->start, ileaf, &size);
if (attrs == NULL)
return -ENOENT;
return attr_ops->decode(btree, rq->data, attrs, size);
}
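/*
 * itree_ops and otree_ops share the same leaf layout and operations; they
 * differ only in the attribute codec (iattr_ops vs oattr_ops) supplied via
 * private_ops.
 */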
struct btree_ops itree_ops = {
.btree_init = ileaf_btree_init,
.leaf_init = ileaf_init,
.leaf_split = ileaf_split,
.leaf_merge = ileaf_merge,
.leaf_chop = ileaf_chop,
.leaf_pre_write = noop_pre_write,
.leaf_write = ileaf_write,
.leaf_read = ileaf_read,
.private_ops = &iattr_ops,
.leaf_sniff = ileaf_sniff,
.leaf_can_free = ileaf_can_free,
};
struct btree_ops otree_ops = {
.btree_init = ileaf_btree_init,
.leaf_init = ileaf_init,
.leaf_split = ileaf_split,
.leaf_merge = ileaf_merge,
.leaf_chop = ileaf_chop,
.leaf_pre_write = noop_pre_write,
.leaf_write = ileaf_write,
.leaf_read = ileaf_read,
.private_ops = &oattr_ops,
.leaf_sniff = ileaf_sniff,
.leaf_can_free = ileaf_can_free,
};
/*
* Find free inum
* (callback for btree_traverse())
*
* return value:
* 1 - found
* 0 - not found
*/
int ileaf_find_free(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit, void *leaf,
tuxkey_t key, u64 len, void *data)
{
tuxkey_t at = key - ibase(leaf);
unsigned count = icount(leaf);
key_limit = min(key_limit, key + len);
if (at < count) {
__be16 *dict = ileaf_dict(btree, leaf);
unsigned limit, offset = atdict(dict, at);
while (at < count) {
at++;
limit = __atdict(dict, at);
if (offset == limit) {
at--;
break;
}
offset = limit;
}
}
if (ibase(leaf) + at < key_limit) {
*(inum_t *)data = ibase(leaf) + at;
return 1;
}
return 0;
}
/*
* Enumerate inum
* (callback for btree_traverse())
*/
int ileaf_enumerate(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit, void *leaf,
tuxkey_t key, u64 len, void *data)
{
struct ileaf *ileaf = leaf;
__be16 *dict = ileaf_dict(btree, ileaf);
struct ileaf_enumrate_cb *cb = data;
tuxkey_t at, base = ibase(ileaf);
unsigned count;
at = key - base;
count = min_t(u64, key + len - base, icount(ileaf));
if (at < count) {
unsigned offset = atdict(dict, at);
for (; at < count; at++) {
unsigned size, limit;
inum_t inum;
void *attrs;
int err;
limit = __atdict(dict, at + 1);
if (limit <= offset)
continue;
attrs = ileaf->table + offset;
size = limit - offset;
inum = base + at;
err = cb->callback(btree, inum, attrs, size, cb->data);
if (err)
return err;
offset = limit;
}
}
return 0;
}