blob: cbd32bf2ee60aee72becd46229305e0f86ab121d [file] [log] [blame]
/*
* File index btree leaf operations
*/
/*
* The dleaf extent table is sorted by logical address. Note that
* binsearch may be on the whole 64 bit logical:verhi pair even if
* verhi: has random data, because the logical part of the combined
* quantity is more significant. Until extent versioning arrives,
* verhi:verlo will always be zero.
*
* Explicit extent counts are not present. Extent count is given by
* the difference between two successive logical addresses or the
* difference between the logical addresses of the first entry of one
* block and the final entry of the previous block. (In all cases,
* provided the logical addresses are not equal, see below.)
*
* A hole between allocated data regions is indicated explicitly by an
* extent with physical address zero, which is not valid for any
* actual allocated extent because the filesystem header (including
* superblock) occupies that address.
*
* Two adjacent entries may have the same logical address, which means
* they are different versions of the same extent. All the version
* numbers in a sequence of equal logical extents must be different,
* or all zero. To find the length of a group of equal logical
* extents, scan forward to the first nonequal logical extent, which
* normally will be nearby.
*/
#include "tux3.h"
#include "dleaf.h"
/*
* The uptag is for filesystem integrity checking and corruption
* repair. It provides the low order bits of the delta at which the
* leaf was committed, the inode number to which it belongs, and the
* logical position within that inode.
*/
struct uptag {
__be32 inum;
__be32 offset;
__be16 future;
__be16 delta;
};
/* FIXME: ignoring version at all */
#define VER_BITS 16
#define VER_MASK ((1 << VER_BITS))
#define ADDR_BITS 48
#define ADDR_MASK ((1ULL << ADDR_BITS) - 1)
struct dleaf2 {
__be16 magic; /* dleaf2 magic */
__be16 count; /* count of diskextent2 */
// struct uptag tag;
__be32 __unused;
struct diskextent2 {
__be64 verhi_logical; /* verhi:16, logical:48 */
__be64 verlo_physical; /* verlo:16, physical:48 */
} table[];
};
struct extent {
u32 version; /* version */
block_t logical; /* logical address */
block_t physical; /* physical address */
};
static inline block_t get_logical(struct diskextent2 *dex)
{
return be64_to_cpu(dex->verhi_logical) & ADDR_MASK;
}
static inline void get_extent(struct diskextent2 *dex, struct extent *ex)
{
u64 val;
val = be64_to_cpu(dex->verhi_logical);
ex->version = val >> ADDR_BITS;
ex->logical = val & ADDR_MASK;
val = be64_to_cpu(dex->verlo_physical);
ex->version = (ex->version << VER_BITS) | (val >> ADDR_BITS);
ex->physical = val & ADDR_MASK;
}
static inline void put_extent(struct diskextent2 *dex, u32 version,
block_t logical, block_t physical)
{
u64 verhi = version >> VER_BITS, verlo = version & VER_MASK;
dex->verhi_logical = cpu_to_be64(verhi << ADDR_BITS | logical);
dex->verlo_physical = cpu_to_be64(verlo << ADDR_BITS | physical);
}
static void dleaf2_btree_init(struct btree *btree)
{
struct sb *sb = btree->sb;
unsigned datasize = sb->blocksize - sizeof(struct dleaf2);
btree->entries_per_leaf = datasize / sizeof(struct diskextent2);
}
static int dleaf2_init(struct btree *btree, void *leaf)
{
struct dleaf2 *dleaf = leaf;
*dleaf = (struct dleaf2){
.magic = cpu_to_be16(TUX3_MAGIC_DLEAF2),
.count = 0,
};
return 0;
}
static int dleaf2_sniff(struct btree *btree, void *leaf)
{
struct dleaf2 *dleaf = leaf;
if (dleaf->magic != cpu_to_be16(TUX3_MAGIC_DLEAF2))
return -1;
if (dleaf->count) {
/* Last should be sentinel */
struct extent ex;
get_extent(dleaf->table + be16_to_cpu(dleaf->count) - 1, &ex);
if (ex.physical != 0)
return -1;
}
return 0;
}
static int dleaf2_can_free(struct btree *btree, void *leaf)
{
struct dleaf2 *dleaf = leaf;
unsigned count = be16_to_cpu(dleaf->count);
assert(!dleaf2_sniff(btree, dleaf));
if (count > 1)
return 0;
return 1;
}
static void __dleaf2_dump(struct btree *btree, struct dleaf2 *dleaf,
const char *prefix)
{
if (!tux3_trace)
return;
unsigned i;
__tux3_dbg("%sdleaf %p, magic %x, count %u\n", prefix,
dleaf, be16_to_cpu(dleaf->magic), be16_to_cpu(dleaf->count));
for (i = 0; i < be16_to_cpu(dleaf->count); i++) {
struct extent ex;
get_extent(dleaf->table + i, &ex);
__tux3_dbg(" logical %Lu, physical %Lu, version %u\n",
ex.logical, ex.physical, ex.version);
}
}
static void dleaf2_dump(struct btree *btree, void *leaf)
{
struct dleaf2 *dleaf = leaf;
__dleaf2_dump(btree, dleaf, "");
}
/* Lookup logical address in diskextent2 <= index */
static struct diskextent2 *
__dleaf2_lookup_index(struct btree *btree, struct dleaf2 *dleaf,
struct diskextent2 *start, struct diskextent2 *limit,
tuxkey_t index)
{
#if 1
/* Paranoia check: last should be sentinel (hole) */
if (dleaf->count) {
struct extent ex;
get_extent(dleaf->table + be16_to_cpu(dleaf->count) - 1, &ex);
assert(ex.physical == 0);
}
#endif
/* FIXME: binsearch here */
while (start < limit) {
if (index == get_logical(start))
return start;
else if (index < get_logical(start)) {
/* should have diskextent2 of bottom logical on leaf */
assert(dleaf->table < start);
return start - 1;
}
start++;
}
return start;
}
static struct diskextent2 *
dleaf2_lookup_index(struct btree *btree, struct dleaf2 *dleaf, tuxkey_t index)
{
struct diskextent2 *start = dleaf->table;
struct diskextent2 *limit = start + be16_to_cpu(dleaf->count);
return __dleaf2_lookup_index(btree, dleaf, start, limit, index);
}
/*
* Split diskextent2, and return split key.
*/
static tuxkey_t dleaf2_split(struct btree *btree, tuxkey_t hint,
void *vfrom, void *vinto)
{
struct dleaf2 *from = vfrom, *into = vinto;
struct diskextent2 *dex;
struct extent ex;
unsigned split_at, count = be16_to_cpu(from->count);
/* need 2 extents except sentinel, at least */
assert(count >= 3);
/*
* Honor hint key, then copy and set new sentinel.
*/
dex = dleaf2_lookup_index(btree, from, hint);
if (dex == from->table + count) {
#if 1
get_extent(dex - 1, &ex);
assert(ex.physical == 0);
return ex.logical; /* use sentinel of previous leaf */
#else
/* FIXME: not necessary to store between sentinel and hint */
return hint;
#endif
}
split_at = dex - from->table;
from->count = cpu_to_be16(split_at + 1); /* +1 for sentinel */
into->count = cpu_to_be16(count - split_at);
dex = from->table + split_at;
/* Copy diskextent2 */
memcpy(into->table, dex, sizeof(*dex) * (count - split_at));
/* Put sentinel */
get_extent(dex, &ex);
put_extent(dex, ex.version, ex.logical, 0);
return ex.logical;
}
/*
* Try to merge from vfrom into vinto
* return value:
* 0 - couldn't merge
* 1 - merged
*/
static int dleaf2_merge(struct btree *btree, void *vinto, void *vfrom)
{
struct dleaf2 *into = vinto, *from = vfrom;
struct extent into_ex, from_ex;
unsigned into_count, from_count;
int can_merge, from_size;
/* If "from" is empty or sentinel only, does nothing */
from_count = be16_to_cpu(from->count);
if (from_count <= 1)
return 1;
from_size = sizeof(from->table[0]) * from_count;
/* If "into" is empty, just copy. FIXME: why there is no sentinel? */
into_count = be16_to_cpu(into->count);
if (!into_count) {
into->count = from->count;
from->count = 0;
memcpy(into->table, from->table, from_size);
return 1;
}
/* Try merge end of "from" and start of "into" */
get_extent(into->table + into_count - 1, &into_ex);
get_extent(from->table, &from_ex);
assert(into_ex.logical <= from_ex.logical);
assert(into_ex.physical == 0);
can_merge = 0;
/* If start of "from" is hole, it can be merged */
if (!from_ex.physical)
can_merge = 1;
/* If logical is same, it can be merged */
if (into_ex.logical == from_ex.logical)
can_merge = 1;
if (into_count + from_count - can_merge > btree->entries_per_leaf)
return 0;
if (!from_ex.physical) {
/* If start of "from" is hole, use logical of sentinel */
from_size -= sizeof(from->table[0]) * can_merge;
memcpy(into->table + into_count, from->table + 1, from_size);
} else if (into_ex.logical == from_ex.logical) {
/* If logical is same, use logical of "from" */
memcpy(into->table + into_count - 1, from->table, from_size);
} else {
/* Other cases are just copy */
memcpy(into->table + into_count, from->table, from_size);
}
into->count = cpu_to_be16(into_count + from_count - can_merge);
from->count = 0;
return 1;
}
/*
* Chop diskextent2
* return value:
* < 0 - error
* 1 - modified
* 0 - not modified
*/
static int dleaf2_chop(struct btree *btree, tuxkey_t start, u64 len, void *leaf)
{
struct sb *sb = btree->sb;
struct dleaf2 *dleaf = leaf;
struct diskextent2 *dex, *dex_limit;
struct extent ex;
block_t block;
int need_sentinel;
/* FIXME: range chop is unsupported for now */
assert(len == TUXKEY_LIMIT);
if (!dleaf->count)
return 0;
dex_limit = dleaf->table + be16_to_cpu(dleaf->count);
/* Lookup the extent is including index */
dex = dleaf2_lookup_index(btree, dleaf, start);
if (dex >= dex_limit - 1)
return 0;
need_sentinel = 1;
get_extent(dex, &ex);
if (start == ex.logical) {
if (dex > dleaf->table) {
/* If previous is hole, use it as sentinel */
struct extent prev;
get_extent(dex - 1, &prev);
if (prev.physical == 0) {
dex--;
need_sentinel = 0;
}
}
if (need_sentinel) {
/* Put new sentinel here. */
put_extent(dex, sb->version, start, 0);
}
need_sentinel = 0;
} else if (ex.physical == 0) {
/* If chop point is hole, use it as sentinel */
start = ex.logical;
need_sentinel = 0;
}
/* Shrink space */
dleaf->count = cpu_to_be16((dex - dleaf->table) + 1 + need_sentinel);
block = ex.physical + (start - ex.logical);
dex++;
while (dex < dex_limit) {
unsigned count;
/* Get next diskextent2 */
get_extent(dex, &ex);
count = ex.logical - start;
if (block && count) {
defer_bfree(sb, &sb->defree, block, count);
log_bfree(sb, block, count);
}
if (need_sentinel) {
/* Put new sentinel */
put_extent(dex, sb->version, start, 0);
need_sentinel = 0;
}
start = ex.logical;
block = ex.physical;
dex++;
}
return 1;
}
/* Read extents */
static unsigned __dleaf2_read(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit,
struct dleaf2 *dleaf, struct btree_key_range *key,
int stop_at_hole)
{
struct dleaf_req *rq = container_of(key, struct dleaf_req, key);
tuxkey_t key_start = key->start;
unsigned key_len = key->len;
struct diskextent2 *dex, *dex_limit;
struct extent next;
block_t physical;
if (rq->seg_cnt >= rq->seg_max)
return 0;
dex_limit = dleaf->table + be16_to_cpu(dleaf->count);
/* Lookup the extent is including index */
dex = dleaf2_lookup_index(btree, dleaf, key_start);
if (dex >= dex_limit - 1) {
/* If sentinel, fill by bottom key */
goto fill_seg;
}
/* Get start position of logical and physical */
get_extent(dex, &next);
physical = next.physical;
if (physical)
physical += key_start - next.logical; /* add offset */
do {
struct block_segment *seg = rq->seg + rq->seg_cnt;
dex++;
get_extent(dex, &next);
/* Check of logical addr range of current and next. */
seg->count = min_t(u64, key_len, next.logical - key_start);
if (physical) {
seg->block = physical;
seg->state = 0;
} else {
seg->block = 0;
seg->state = BLOCK_SEG_HOLE;
}
physical = next.physical;
key_start += seg->count;
key_len -= seg->count;
rq->seg_cnt++;
/* Stop if current is hole and next is segment */
if (stop_at_hole) {
if (!seg->block && physical)
break;
}
} while (key_len && rq->seg_cnt < rq->seg_max && dex + 1 < dex_limit);
fill_seg:
/* Between sentinel and key_limit is hole */
if (key_start < key_limit && key_len && rq->seg_cnt < rq->seg_max) {
struct block_segment *seg = rq->seg + rq->seg_cnt;
seg->count = min_t(tuxkey_t, key_len, key_limit - key_start);
seg->block = 0;
seg->state = BLOCK_SEG_HOLE;
key_start += seg->count;
key_len -= seg->count;
rq->seg_cnt++;
}
return key->len - key_len;
}
/* Read extents */
static int dleaf2_read(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit,
void *leaf, struct btree_key_range *key)
{
struct dleaf2 *dleaf = leaf;
unsigned len;
len = __dleaf2_read(btree, key_bottom, key_limit, dleaf, key, 0);
key->start += len;
key->len -= len;
return 0;
}
static int dleaf2_pre_write(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit, void *leaf,
struct btree_key_range *key)
{
struct dleaf_req *rq = container_of(key, struct dleaf_req, key);
struct dleaf2 *dleaf = leaf;
/*
* If overwrite mode, read exists segments. Then, if there are
* hole, allocate segment.
*/
if (rq->overwrite) {
unsigned len;
int last, hole_len;
len = __dleaf2_read(btree, key_bottom, key_limit, dleaf, key,1);
last = rq->seg_cnt;
/* Remove hole from seg[] */
hole_len = 0;
while (last > rq->seg_idx && !rq->seg[last - 1].block) {
len -= rq->seg[last - 1].count;
hole_len += rq->seg[last - 1].count;
last--;
}
key->start += len;
key->len = hole_len;
rq->seg_idx = last;
rq->seg_cnt = last;
/* If there is no hole, return exists segments */
if (!hole_len)
return BTREE_DO_RETRY;
}
return BTREE_DO_DIRTY;
}
/* Resize dleaf2 from head */
static void dleaf2_resize(struct dleaf2 *dleaf, struct diskextent2 *head,
int diff)
{
void *limit = dleaf->table + be16_to_cpu(dleaf->count);
if (diff == 0)
return;
memmove(head + diff, head, limit - (void *)head);
be16_add_cpu(&dleaf->count, diff);
}
/* Initialize sentinel by bottom key */
static inline void dleaf2_init_sentinel(struct sb *sb, struct dleaf2 *dleaf,
tuxkey_t key_bottom)
{
if (!dleaf->count) {
dleaf->count = cpu_to_be16(1);
put_extent(dleaf->table, sb->version, key_bottom, 0);
}
}
/* Return split key of center for split hint */
static tuxkey_t dleaf2_split_at_center(struct dleaf2 *dleaf)
{
struct extent ex;
get_extent(dleaf->table + be16_to_cpu(dleaf->count) / 2, &ex);
return ex.logical;
}
/* dex information to modify */
struct dex_info {
/* start extent */
struct diskextent2 *start_dex;
/* physical range of partial start */
block_t start_block;
unsigned start_count;
/* end extent */
struct diskextent2 *end_dex;
/* remaining physical range of partial end */
block_t end_block;
int dleaf_count; /* count of dex except overwritten */
int overwrite_cnt; /* count of dex overwritten */
int need_sentinel; /* segments needs new sentinel? */
};
static void find_start_dex(struct btree *btree, struct dleaf2 *dleaf,
block_t key_start, struct dex_info *info)
{
struct diskextent2 *dex_limit;
dex_limit = dleaf->table + be16_to_cpu(dleaf->count);
info->start_block = 0;
info->start_count = 0;
/* Lookup the dex for start of seg[]. */
info->start_dex = dleaf2_lookup_index(btree, dleaf, key_start);
if (info->start_dex < dex_limit - 1) {
struct extent ex;
get_extent(info->start_dex, &ex);
/* Start is at middle of dex: can't overwrite this dex */
if (key_start > ex.logical) {
block_t prev = ex.logical, physical = ex.physical;
info->start_dex++;
get_extent(info->start_dex, &ex);
if (physical)
info->start_block = physical + key_start - prev;
info->start_count = ex.logical - key_start;
}
}
}
static void find_end_dex(struct btree *btree, struct dleaf2 *dleaf,
block_t key_end, struct dex_info *info)
{
struct diskextent2 *limit, *dex_limit;
u16 dleaf_count = be16_to_cpu(dleaf->count);
dex_limit = dleaf->table + dleaf_count;
info->need_sentinel = 0;
info->end_block = 0;
info->dleaf_count = dleaf_count;
if (!info->end_dex) {
/* Initial lookup */
limit = dex_limit;
} else {
/* Retry, we can limit lookup region */
limit = min(info->end_dex + 1, dex_limit);
}
/* Lookup the dex for end of seg[]. */
info->end_dex = __dleaf2_lookup_index(btree, dleaf, info->start_dex,
limit, key_end);
if (info->end_dex < dex_limit - 1) {
struct extent ex;
get_extent(info->end_dex, &ex);
if (key_end > ex.logical) {
block_t offset = key_end - ex.logical;
/* End is at middle of dex: can overwrite this dex */
info->end_dex++;
/* Need new end of segment */
info->need_sentinel = 1;
/* Save new physical for end of seg[] */
if (ex.physical)
info->end_block = ex.physical + offset;
}
}
assert(info->start_dex <= info->end_dex);
/*
* Calculate dleaf2 space informations
*/
/* Number of dex can be overwritten */
info->overwrite_cnt = info->end_dex - info->start_dex;
/* Need new dex sentinel? */
info->need_sentinel |= info->end_dex == dex_limit;
/* Calculate new dleaf->count except dex is overwritten by segments. */
info->dleaf_count -= info->overwrite_cnt;
info->dleaf_count += info->need_sentinel;
}
/*
* Write extents.
*/
static int dleaf2_write(struct btree *btree, tuxkey_t key_bottom,
tuxkey_t key_limit,
void *leaf, struct btree_key_range *key,
tuxkey_t *split_hint)
{
struct dleaf_req *rq = container_of(key, struct dleaf_req, key);
struct sb *sb = btree->sb;
struct dleaf2 *dleaf = leaf;
struct diskextent2 *dex;
struct extent ex;
struct dex_info info;
unsigned free_len, seg_len, alloc_len;
int err, diff, seg_cnt, space;
/*
* Strategy: check free space in dleaf2, then allocate
* segments, and write segments to dleaf2. If there is no
* space in dleaf2, shrink segments to fit space of dleaf2,
* and split.
*
* FIXME: should try to merge at start and new last extents.
*/
dleaf2_init_sentinel(sb, dleaf, key_bottom);
/* Get the info of dex for start of seg[]. */
find_start_dex(btree, dleaf, key->start, &info);
seg_len = min_t(tuxkey_t, key_limit - key->start, key->len);
/* Get the info of dex for end of seg[]. */
info.end_dex = NULL;
find_end_dex(btree, dleaf, key->start + seg_len, &info);
/*
* Allocate blocks to seg after dleaf redirect. With this, our
* allocation order is, bnode => dleaf => data, and we can use
* physical address of dleaf as allocation hint for data
* blocks.
*/
space = btree->entries_per_leaf - info.dleaf_count;
#if 0
/* If there is no space to store 1 dex (start and end), split */
if (space <= 2)
goto need_split;
#else
/* If there is no space, split */
if (space <= 0)
goto need_split;
#endif
err = rq->seg_find(btree, rq, space, seg_len, &alloc_len);
if (err < 0) {
assert(err != -ENOSPC); /* block reservation bug */
tux3_err(sb, "extent allocation failed: %d", err);
return err;
}
seg_cnt = rq->seg_cnt - rq->seg_idx;
assert(seg_cnt > 0);
/* Ugh, there was not space enough to store, adjust number of seg[]. */
while (seg_len != alloc_len) {
seg_len = alloc_len;
/* Re-calculate end of seg[] can be allocated */
find_end_dex(btree, dleaf, key->start + alloc_len, &info);
/* Shrink segments to fit to space of dleaf2 */
while (info.dleaf_count + seg_cnt > btree->entries_per_leaf) {
seg_cnt--;
alloc_len -= rq->seg[rq->seg_idx + seg_cnt].count;
if (!seg_cnt) {
/* Didn't fit at all, cancel allocation */
rq->seg_alloc(btree, rq, 0);
goto need_split;
}
}
}
/* Commit allocation of writable segments */
err = rq->seg_alloc(btree, rq, seg_cnt);
assert(key->start + seg_len <= key_limit);
#if 0
tux3_dbg("start %lu, end %lu",
info.start_dex - dleaf->table, info.end_dex - dleaf->table);
tux3_dbg("dleaf_count %u (%u) (seg_cnt %u, overwrite %lu, sentinel %u)",
info.dleaf_count, info.dleaf_count + seg_cnt,
seg_cnt, info.end_dex - info.start_dex, info.need_sentinel);
#endif
/*
* Free segments which is overwritten.
*/
free_len = seg_len;
if (info.start_count) {
unsigned count = min_t(block_t, free_len, info.start_count);
if (info.start_block)
rq->seg_free(btree, info.start_block, count);
free_len -= count;
}
if (info.start_dex < info.end_dex) {
struct diskextent2 *limit = info.end_dex;
if (limit != dleaf->table + be16_to_cpu(dleaf->count))
limit++;
get_extent(info.start_dex, &ex);
for (dex = info.start_dex + 1; free_len && dex < limit; dex++) {
block_t prev = ex.logical, physical = ex.physical;
unsigned count;
get_extent(dex, &ex);
count = min_t(block_t, free_len, ex.logical - prev);
if (physical)
rq->seg_free(btree, physical, count);
free_len -= count;
}
}
/* Calculate difference of dleaf->count on old and new. */
diff = seg_cnt - info.overwrite_cnt + info.need_sentinel;
/*
* Expand/shrink space for segs
*/
dleaf2_resize(dleaf, info.end_dex, diff);
assert(info.dleaf_count + seg_cnt == be16_to_cpu(dleaf->count));
assert(info.dleaf_count + seg_cnt <= btree->entries_per_leaf);
/*
* Fill extents
*/
while (seg_len) {
struct block_segment *seg = rq->seg + rq->seg_idx;
put_extent(info.start_dex, sb->version, key->start, seg->block);
key->start += seg->count;
key->len -= seg->count;
seg_len -= seg->count;
rq->seg_idx++;
info.start_dex++;
}
if (info.need_sentinel) {
/* Fill sentinel */
put_extent(info.start_dex, sb->version, key->start,
info.end_block);
}
if (rq->seg_cnt == rq->seg_max) {
/* Stop if there is no space in seg[] */
key->len = 0;
} else if (key->start < key_limit && key->len) {
/* If there are remaining range, split */
goto need_split;
}
return BTREE_DO_RETRY;
need_split:
/* FIXME: do we should split at sentinel when filling hole? */
if (key_limit == TUXKEY_LIMIT) {
struct diskextent2 *sentinel =
dleaf->table + be16_to_cpu(dleaf->count) - 1;
/* If append write, split at sentinel */
*split_hint = get_logical(sentinel);
if (key->start >= *split_hint) {
tux3_dbg("key %Lu bottom %Lu, limit %Lu, hint %Lu",
key->start, key_bottom, key_limit,
*split_hint);
return BTREE_DO_SPLIT;
}
}
/* FIXME: use better split position */
*split_hint = dleaf2_split_at_center(dleaf);
return BTREE_DO_SPLIT;
}
struct btree_ops dtree2_ops = {
.btree_init = dleaf2_btree_init,
.leaf_init = dleaf2_init,
.leaf_split = dleaf2_split,
.leaf_merge = dleaf2_merge,
.leaf_chop = dleaf2_chop,
.leaf_pre_write = dleaf2_pre_write,
.leaf_write = dleaf2_write,
.leaf_read = dleaf2_read,
.leaf_sniff = dleaf2_sniff,
.leaf_can_free = dleaf2_can_free,
.leaf_dump = dleaf2_dump,
};