/*
* Block allocation
*
* Original copyright (c) 2008 Daniel Phillips <phillips@phunq.net>
* Portions copyright (c) 2006-2008 Google Inc.
* Licensed under the GPL version 2
*
* By contributing changes to this file you grant the original copyright holder
* the right to distribute those changes under any license.
*/
#include "tux3.h"
#ifndef trace
#define trace trace_on
#endif
/*
* Group counts
*/
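/*
 * Each allocation group covers (1 << sb->groupbits) volume blocks and
 * has a 16-bit big-endian counter of blocks in use. A countmap block
 * therefore holds blocksize / 2 counters, which is why the group index
 * is split with (blockbits - 1) and (blockmask >> 1) below.
 */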
void countmap_put(struct countmap_pin *pin)
{
if (pin->buffer) {
blockput(pin->buffer);
pin->buffer = NULL;
}
}
/*
 * Load and pin one block of the countmap. The caller takes
 * countmap_lock before touching the data or updating the pin.
 * Access from the frontend is read only and write access from the
 * backend is single threaded, so an rw spinlock might reduce frontend
 * contention, if there is any.
 * This could be extended to pin multiple blocks if contention causes
 * too many block reads.
 */
static struct buffer_head *countmap_load(struct sb *sb, block_t group)
{
struct countmap_pin *pin = &sb->countmap_pin;
block_t block = group >> (sb->blockbits - 1);
struct buffer_head *buffer;
spin_lock(&sb->countmap_lock);
buffer = pin->buffer;
if (buffer && bufindex(buffer) == block) {
get_bh(buffer);
spin_unlock(&sb->countmap_lock);
} else {
spin_unlock(&sb->countmap_lock);
buffer = blockread(mapping(sb->countmap), block);
if (!buffer) {
tux3_err(sb, "block read failed");
return ERR_PTR(-EIO);
}
}
return buffer;
}
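/*
 * Called with countmap_lock held: make @buffer the pinned countmap
 * block, or just drop the extra reference if it is pinned already.
 */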
static void countmap_pin_update(struct sb *sb, struct buffer_head *buffer)
{
if (sb->countmap_pin.buffer != buffer) {
countmap_put(&sb->countmap_pin);
sb->countmap_pin.buffer = buffer;
} else
blockput(buffer);
}
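/* Add @count (which may be negative) to the used count of @group. */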
static int countmap_add(struct sb *sb, block_t group, int count)
{
unsigned offset = group & (sb->blockmask >> 1);
struct buffer_head *buffer, *clone;
__be16 *p;
buffer = countmap_load(sb, group);
if (IS_ERR(buffer))
return PTR_ERR(buffer);
trace("add %d to group %Lu", count, group);
/*
* The countmap is modified only by backend. blockdirty()
* should never return -EAGAIN.
*/
clone = blockdirty(buffer, sb->unify);
if (IS_ERR(clone)) {
assert(PTR_ERR(clone) != -EAGAIN);
blockput(buffer);
return PTR_ERR(clone);
}
spin_lock(&sb->countmap_lock);
p = bufdata(clone);
be16_add_cpu(p + offset, count);
countmap_pin_update(sb, clone);
spin_unlock(&sb->countmap_lock);
return 0;
}
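/*
 * Apply a segment to the group counters, adding its length if @set or
 * subtracting it otherwise, and splitting the segment where it crosses
 * a group boundary. For example, with groupbits = 3 a segment of 5
 * blocks starting at block 6 adds 2 to the first group and 3 to the
 * next, since (~6 & 7) + 1 == 2.
 */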
static int countmap_add_segment(struct sb *sb, block_t start, unsigned blocks,
int set)
{
block_t group = start >> sb->groupbits;
/* Compile option: support cross-group segments */
if (1 && group != (start + blocks) >> sb->groupbits) {
unsigned groupsize = 1 << sb->groupbits;
unsigned groupmask = groupsize - 1;
while (blocks) {
unsigned grouplen = (~start & groupmask) + 1;
int len = min(grouplen, blocks);
int err = countmap_add(sb, group++, set ? len : -len);
if (err)
return err;
start += len;
blocks -= len;
}
return 0;
}
return countmap_add(sb, group, set ? blocks : -blocks);
}
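/* Return the used-block count of @group, or a negative error code. */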
static int countmap_used(struct sb *sb, block_t group)
{
unsigned offset = group & (sb->blockmask >> 1);
struct buffer_head *buffer;
__be16 *p;
u16 count;
buffer = countmap_load(sb, group);
if (IS_ERR(buffer))
return PTR_ERR(buffer);
spin_lock(&sb->countmap_lock);
p = bufdata(buffer);
count = be16_to_cpup(p + offset);
countmap_pin_update(sb, buffer);
spin_unlock(&sb->countmap_lock);
return count;
}
#ifndef __KERNEL__
void countmap_dump(struct sb *sb, block_t start, block_t count)
{
unsigned groupbits = sb->groupbits, groupsize = 1 << groupbits;
	for (block_t group = start; group < start + count; group++) {
block_t block = group << groupbits;
block_t blocks = min_t(block_t, sb->volblocks - block, groupsize);
__tux3_dbg("%Lu: %i used, ", group, countmap_used(sb, group));
bitmap_dump(sb->bitmap, block, blocks);
}
}
#endif /* !__KERNEL__ */
/*
* Bitmap
*/
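/*
 * The bitmap inode holds one bit per volume block, so one bitmap block
 * covers (blocksize << 3) volume blocks; that is the "mapshift" of
 * blockbits + 3 used throughout the functions below.
 */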
#ifndef __KERNEL__
block_t count_range(struct inode *inode, block_t start, block_t count)
{
unsigned char ones[256];
assert(!(start & 7));
for (int i = 0; i < sizeof(ones); i++)
ones[i] = bytebits(i);
struct sb *sb = tux_sb(inode->i_sb);
unsigned mapshift = sb->blockbits + 3;
unsigned mapmask = (1 << mapshift) - 1;
block_t limit = start + count;
block_t blocks = (limit + mapmask) >> mapshift;
block_t tail = (count + 7) >> 3, total = 0;
unsigned offset = (start & mapmask) >> 3;
for (block_t block = start >> mapshift; block < blocks; block++) {
trace_off("count block %x/%x", block, blocks);
struct buffer_head *buffer = blockread(mapping(inode), block);
if (!buffer)
return -1;
unsigned bytes = sb->blocksize - offset;
if (bytes > tail)
bytes = tail;
unsigned char *p = bufdata(buffer) + offset, *top = p + bytes;
while (p < top)
total += ones[*p++];
blockput(buffer);
tail -= bytes;
offset = 0;
}
return total;
}
void bitmap_dump(struct inode *inode, block_t start, block_t count)
{
enum { show_used = 0 };
struct sb *sb = tux_sb(inode->i_sb);
unsigned mapshift = sb->blockbits + 3;
unsigned mapsize = 1 << mapshift;
unsigned mapmask = mapsize - 1;
unsigned offset = (start & mapmask) >> 3, bit = start & 7, total = 0;
block_t limit = start + count, blocks = (limit + mapmask) >> mapshift;
block_t tail = (count + bit + 7) >> 3, begin = -1;
__tux3_dbg("%s regions in %Lu/%Lu: ", show_used ? "used" : "free", start, count);
for (block_t block = start >> mapshift; block < blocks; block++) {
struct buffer_head *buffer = blockread(mapping(inode), block);
assert(buffer);
unsigned bytes = sb->blocksize - offset;
if (bytes > tail)
bytes = tail;
unsigned char *data = bufdata(buffer), *p = data + offset, *top = p + bytes;
for (; p < top; p++, bit = 0) {
unsigned c = *p;
if ((!c && ((begin >= 0) ^ show_used)))
continue;
if (((c == 0xff) && ((begin < 0) ^ show_used)))
continue;
for (int i = bit, mask = 1 << bit; i < 8; i++, mask <<= 1) {
if (!(c & mask) ^ (begin < 0) ^ show_used)
continue;
block_t found = i + ((p - data) << 3) + (block << mapshift);
if (found == limit)
break;
if (begin < 0) {
__tux3_dbg("%Lu", found);
begin = found;
total++;
} else {
__tux3_dbg("/%Lu ", found - begin);
begin = -1;
}
}
}
blockput(buffer);
tail -= bytes;
offset = 0;
}
if ((begin >= 0))
__tux3_dbg("/%Lu ", start + count - begin);
__tux3_dbg("(%u)\n", total);
}
#endif /* !__KERNEL__ */
/*
* Modify bits on one block, then adjust ->freeblocks.
*/
static int bitmap_modify_bits(struct sb *sb, struct buffer_head *buffer,
unsigned offset, unsigned blocks, int set)
{
struct buffer_head *clone;
void (*modify)(u8 *, unsigned, unsigned) = set ? set_bits : clear_bits;
assert(blocks > 0);
assert(offset + blocks <= sb->blocksize << 3);
/*
* The bitmap is modified only by backend.
* blockdirty() should never return -EAGAIN.
*/
clone = blockdirty(buffer, sb->unify);
if (IS_ERR(clone)) {
int err = PTR_ERR(clone);
assert(err != -EAGAIN);
return err;
}
modify(bufdata(clone), offset, blocks);
mark_buffer_dirty_non(clone);
blockput(clone);
if (set)
sb->freeblocks -= blocks;
else
sb->freeblocks += blocks;
return 0;
}
/*
 * If the bits across the affected blocks are in the expected state,
 * modify them.
 *
 * FIXME: If an error happens partway through, the bits already
 * modified and ->freeblocks are not restored to their original state.
 * What to do?
 */
static int __bitmap_modify(struct sb *sb, block_t start, unsigned blocks,
int set, int (*test)(u8 *, unsigned, unsigned))
{
struct inode *bitmap = sb->bitmap;
unsigned mapshift = sb->blockbits + 3;
unsigned mapsize = 1 << mapshift;
unsigned mapmask = mapsize - 1;
unsigned mapoffset = start & mapmask;
block_t mapblock, mapblocks = (start + blocks + mapmask) >> mapshift;
unsigned orig_blocks = blocks;
assert(blocks > 0);
assert(start + blocks <= sb->volblocks);
for (mapblock = start >> mapshift; mapblock < mapblocks; mapblock++) {
struct buffer_head *buffer;
unsigned len;
int err;
buffer = blockread(mapping(bitmap), mapblock);
if (!buffer) {
tux3_err(sb, "block read failed");
return -EIO; /* FIXME: error handling */
}
len = min(mapsize - mapoffset, blocks);
if (test && !test(bufdata(buffer), mapoffset, len)) {
blockput(buffer);
tux3_fs_error(sb, "%s: start 0x%Lx, count %x",
set ? "already allocated" : "double free",
start, blocks);
return -EIO; /* FIXME: error handling */
}
err = bitmap_modify_bits(sb, buffer, mapoffset, len, set);
if (err) {
blockput(buffer);
return err; /* FIXME: error handling */
}
mapoffset = 0;
blocks -= len;
}
return countmap_add_segment(sb, start, orig_blocks, set);
}
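/*
 * Unconditionally set or clear the bits of a segment; used when the
 * caller already knows the current state, as in balloc_use() where
 * balloc_find() has just seen the blocks free.
 */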
static int bitmap_modify(struct sb *sb, block_t start, unsigned blocks, int set)
{
return __bitmap_modify(sb, start, blocks, set, NULL);
}
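/*
 * As bitmap_modify(), but first check that every bit is in the
 * expected prior state (all clear when allocating, all set when
 * freeing), so double allocations and double frees are caught.
 */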
static int bitmap_test_and_modify(struct sb *sb, block_t start, unsigned blocks,
int set)
{
int (*test)(u8 *, unsigned, unsigned) = set ? all_clear : all_set;
return __bitmap_modify(sb, start, blocks, set, test);
}
static inline int mergable(struct block_segment *seg, block_t block)
{
return seg->block + seg->count == block;
}
/*
 * Find free blocks in the specified range.
 *
 * NOTE: The caller must check "*blocks" to know how many blocks are
 * still needed. This returns 0 even if no blocks were found.
 *
 * return value:
 * < 0 - error
 * 0 - the range was searched successfully
 */
int balloc_find_range(struct sb *sb,
struct block_segment *seg, int maxsegs, int *segs,
block_t start, block_t range, unsigned *blocks)
{
struct inode *bitmap = sb->bitmap;
unsigned mapshift = sb->blockbits + 3;
unsigned mapsize = 1 << mapshift;
unsigned mapmask = mapsize - 1;
struct buffer_head *buffer;
trace("find %u blocks in [%Lu/%Lu], segs = %d",
*blocks, start, range, *segs);
assert(*blocks > 0);
assert(start < sb->volblocks);
assert(start + range <= sb->volblocks);
assert(*segs < maxsegs);
assert(tux3_under_backend(sb));
/* Search across blocks */
while (range > 0) {
block_t mapblock = start >> mapshift;
block_t mapbase = mapblock << mapshift;
unsigned offset = start & mapmask;
unsigned maplimit = min_t(block_t, mapsize, offset + range);
unsigned chunk = maplimit - offset;
char *data;
buffer = blockread(mapping(bitmap), mapblock);
if (!buffer) {
tux3_err(sb, "block read failed");
return -EIO;
/* FIXME: error handling */
}
data = bufdata(buffer);
/* Search within block */
do {
block_t end = (block_t)offset + *blocks;
unsigned limit, next;
limit = min_t(block_t, end, maplimit);
next = find_next_bit_le(data, limit, offset);
if (next != offset) {
unsigned count = next - offset;
block_t found = mapbase + offset;
if (*segs && mergable(&seg[*segs - 1], found)) {
trace("append seg [%Lu/%u]", found, count);
seg[*segs - 1].count += count;
} else {
trace("balloc seg [%Lu/%u]", found, count);
seg[(*segs)++] = (struct block_segment){
.block = found,
.count = count,
};
}
*blocks -= count;
if (!*blocks || *segs == maxsegs) {
blockput(buffer);
return 0;
}
}
offset = find_next_zero_bit_le(data, maplimit, next + 1);
			/* Remove after this is tested on ARM (next + 1
			 * can be greater than maplimit). */
assert(offset <= maplimit);
} while (offset != maplimit);
assert(start + chunk == mapbase + maplimit);
start += chunk;
range -= chunk;
blockput(buffer);
}
return 0;
}
/*
 * Find free block segments anywhere in the volume, wrapping around the
 * volume end if needed.
 *
 * Scan the entire volume exactly once. Start at the current goal,
 * continue to the end of that group, then continue scanning a group at
 * a time, wrapping around to the volume base if necessary. Skip any
 * group with less than some threshold of free blocks, depending on the
 * original request size. The first and last partial groups are scanned
 * regardless of the threshold in the first pass and never in the second
 * pass. The second pass scans the groups skipped in the first pass that
 * are not completely full.
 *
 * return value:
 * < 0 - error
 * 0 - the scan completed; the caller must check *segs and *blocks to
 *     see how much was actually found
 */
int balloc_find(struct sb *sb,
struct block_segment *seg, int maxsegs, int *segs,
unsigned *blocks)
{
block_t goal = sb->nextblock, volsize = sb->volblocks, start = goal;
block_t topgroup = volsize >> sb->groupbits;
unsigned groupsize = 1 << sb->groupbits, groupmask = groupsize - 1;
unsigned need = *blocks;
unsigned threshold = min(need, groupsize >> 2);
int err, newsegs = 0, pass = 0;
trace("scan volume for %u blocks, goal = %Lu", need, goal);
trace("groupsize = %u, topgroup = %Lu, threshold = %u",
groupsize, topgroup, threshold);
//bitmap_dump(sb->bitmap, 0, volsize);
//groups_dump(sb, 0, (volsize + groupmask) >> sb->groupbits);
do {
block_t tail = volsize;
trace("--- pass%i ---", pass + 1);
trace("group %Lu: start", goal >> sb->groupbits);
do {
block_t group = goal >> sb->groupbits;
block_t next = goal + groupsize;
int skip = pass > 0, top = group == topgroup;
if (tail != volsize) {
unsigned size, used;
if (tail < groupsize && goal + tail == start) {
trace("group %Lu: back to start",
goal >> sb->groupbits);
next = goal + tail;
goto last;
}
size = top ? (volsize & groupmask) : groupsize;
used = countmap_used(sb, group);
trace("group %Lu: check used", group);
assert(used <= size);
skip = used == size || (size - used < threshold) ^ pass;
next = goal + size;
}
next &= ~groupmask;
if (top)
next = volsize;
last:
trace("goal = %Lu, next = %Lu, skip = %i",
goal, next, skip);
if (!skip) {
err = balloc_find_range(sb, seg, maxsegs,
&newsegs, goal,
next - goal, &need);
if (err)
return err;
if (!need || newsegs == maxsegs)
goto done;
trace("=> result: segs = %d, need = %u, tail = %Lu",
newsegs, need, tail);
}
tail -= next - goal;
/* skip reserved at bottom? */
goal = next == volsize ? 0 : next;
} while (tail);
} while (++pass < 2);
done:
*segs = newsegs;
*blocks = need;
return 0;
}
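/*
 * Commit found segments: mark them allocated in the bitmap and advance
 * sb->nextblock past the last segment, wrapping to zero at the volume
 * end.
 */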
int balloc_use(struct sb *sb, struct block_segment *seg, int segs)
{
block_t goal;
int i;
assert(segs > 0);
for (i = 0; i < segs; i++) {
/* FIXME: error handling */
int err = bitmap_modify(sb, seg[i].block, seg[i].count, 1);
if (err)
return err;
}
goal = seg[segs - 1].block + seg[segs - 1].count;
sb->nextblock = goal == sb->volblocks ? 0 : goal;
return 0;
}
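/*
 * Find and allocate up to *blocks blocks as at most maxsegs segments.
 * On return, *segs is the number of segments used and *blocks is the
 * number of blocks still unallocated.
 */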
int balloc_segs(struct sb *sb,
struct block_segment *seg, int maxsegs, int *segs,
unsigned *blocks)
{
int err = balloc_find(sb, seg, maxsegs, segs, blocks);
if (!err)
err = balloc_use(sb, seg, *segs);
return err;
}
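/*
 * Allocate a single block. Returns the block number, or a negative
 * error code.
 */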
block_t balloc_one(struct sb *sb)
{
struct block_segment seg;
unsigned blocks = 1;
int err, segs;
err = balloc_segs(sb, &seg, 1, &segs, &blocks);
if (err)
return err;
assert(segs == 1 && blocks == 0 && seg.count == 1);
return seg.block;
}
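/*
 * Free the extent [start, start + blocks). The bits are verified to be
 * currently set, so a double free is reported as a filesystem error.
 */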
int bfree(struct sb *sb, block_t start, unsigned blocks)
{
assert(tux3_under_backend(sb));
trace("bfree extent [%Lu/%u], ", start, blocks);
return bitmap_test_and_modify(sb, start, blocks, 0);
}
int bfree_segs(struct sb *sb, struct block_segment *seg, int segs)
{
int i;
for (i = 0; i < segs; i++) {
/* FIXME: error handling */
int err = bfree(sb, seg[i].block, seg[i].count);
if (err)
return err;
}
return 0;
}
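/*
 * Replay: reapply a logged allocate (set) or free (clear) to the
 * bitmap, verifying the bits are in the expected prior state first.
 */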
int replay_update_bitmap(struct replay *rp, block_t start, unsigned blocks,
int set)
{
return bitmap_test_and_modify(rp->sb, start, blocks, set);
}