blob: 23056a60542a460756f5748d94822d9e55c6206c [file] [log] [blame]
/*
* Replay log blocks to recreate dirty cache on restart.
*
* Copyright (c) 2009-2014 OGAWA Hirofumi
*/
#include "tux3.h"
#ifndef trace
#define trace trace_on
#endif
static const char *log_name[] = {
#define X(x) [x] = #x
X(LOG_BALLOC),
X(LOG_BFREE),
X(LOG_BFREE_ON_UNIFY),
X(LOG_BFREE_RELOG),
X(LOG_LEAF_REDIRECT),
X(LOG_LEAF_FREE),
X(LOG_BNODE_REDIRECT),
X(LOG_BNODE_ROOT),
X(LOG_BNODE_SPLIT),
X(LOG_BNODE_ADD),
X(LOG_BNODE_UPDATE),
X(LOG_BNODE_MERGE),
X(LOG_BNODE_DEL),
X(LOG_BNODE_ADJUST),
X(LOG_BNODE_FREE),
X(LOG_ORPHAN_ADD),
X(LOG_ORPHAN_DEL),
X(LOG_FREEBLOCKS),
X(LOG_UNIFY),
X(LOG_DELTA),
#undef X
};
static struct replay *alloc_replay(struct sb *sb, unsigned logcount)
{
struct replay *rp;
rp = kmalloc(sizeof(*rp) + logcount * sizeof(block_t), GFP_NOFS);
if (!rp)
return ERR_PTR(-ENOMEM);
rp->sb = sb;
rp->unify_pos = NULL;
rp->unify_index = -1;
memset(rp->blocknrs, 0, logcount * sizeof(block_t));
INIT_LIST_HEAD(&rp->log_orphan_add);
INIT_LIST_HEAD(&rp->orphan_in_otree);
return rp;
}
static void free_replay(struct replay *rp)
{
assert(list_empty(&rp->log_orphan_add));
assert(list_empty(&rp->orphan_in_otree));
kfree(rp);
}
static int replay_check_log(struct replay *rp, struct buffer_head *logbuf)
{
struct sb *sb = rp->sb;
struct logblock *log = bufdata(logbuf);
unsigned char *data = log->data;
if (log->magic != cpu_to_be16(TUX3_MAGIC_LOG)) {
tux3_err(sb, "bad log magic %x", be16_to_cpu(log->magic));
return -EINVAL;
}
if (be16_to_cpu(log->bytes) + sizeof(*log) > sb->blocksize) {
tux3_err(sb, "log bytes is too big");
return -EINVAL;
}
while (data < log->data + be16_to_cpu(log->bytes)) {
u8 code = *data;
/* Find latest unify. */
if (code == LOG_UNIFY && rp->unify_index == -1) {
rp->unify_pos = data;
/* FIXME: index is unnecessary to use. We just
* want to know whether before or after unify
* mark. */
rp->unify_index = bufindex(logbuf);
}
if (log_size[code] == 0) {
tux3_err(sb, "invalid log code: 0x%02x", code);
return -EINVAL;
}
data += log_size[code];
}
return 0;
}
/* Unpin logblocks. */
static void replay_unpin_logblocks(struct sb *sb, unsigned i, unsigned logcount)
{
struct buffer_head *logbuf;
while (i < logcount) {
logbuf = blockget(mapping(sb->logmap), i);
assert(logbuf != NULL);
blockput(logbuf);
blockput(logbuf); /* Unpin */
i++;
}
}
/* Prepare log info for replay and pin logblocks. */
static struct replay *replay_prepare(struct sb *sb)
{
block_t logchain = be64_to_cpu(sb->super.logchain);
unsigned i, logcount = be32_to_cpu(sb->super.logcount);
struct replay *rp;
struct buffer_head *buffer;
int err;
/* FIXME: this address array is quick hack. Rethink about log
* block management and log block address. */
rp = alloc_replay(sb, logcount);
if (IS_ERR(rp))
return rp;
/* FIXME: maybe, we should use bufvec to read log blocks */
trace("load %u logblocks", logcount);
i = logcount;
while (i-- > 0) {
struct logblock *log;
buffer = blockget(mapping(sb->logmap), i);
if (!buffer) {
i++;
err = -ENOMEM;
goto error;
}
assert(bufindex(buffer) == i);
err = blockio(READ, sb, buffer, logchain);
if (err)
goto error;
err = replay_check_log(rp, buffer);
if (err)
goto error;
/* Store index => blocknr map */
rp->blocknrs[bufindex(buffer)] = logchain;
log = bufdata(buffer);
logchain = be64_to_cpu(log->logchain);
}
return rp;
error:
free_replay(rp);
replay_unpin_logblocks(sb, i, logcount);
return ERR_PTR(err);
}
/* Unpin log blocks, and prepare for future logging. */
static void replay_done(struct replay *rp)
{
struct sb *sb = rp->sb;
clean_orphan_list(&rp->log_orphan_add); /* for error path */
free_replay(rp);
sb->lognext = be32_to_cpu(sb->super.logcount);
replay_unpin_logblocks(sb, 0, sb->lognext);
log_finish_cycle(sb, 0);
}
typedef int (*replay_log_t)(struct replay *, struct buffer_head *);
static int replay_log_stage1(struct replay *rp, struct buffer_head *logbuf)
{
struct sb *sb = rp->sb;
struct logblock *log = bufdata(logbuf);
unsigned char *data = log->data;
int err;
/* Check whether array is uptodate */
BUILD_BUG_ON(ARRAY_SIZE(log_name) != LOG_TYPES);
/* If log is before latest unify, those were already applied to FS. */
if (bufindex(logbuf) < rp->unify_index) {
// assert(0); /* older logs should already be freed */
return 0;
}
if (bufindex(logbuf) == rp->unify_index)
data = rp->unify_pos;
while (data < log->data + be16_to_cpu(log->bytes)) {
u8 code = *data++;
switch (code) {
case LOG_BNODE_REDIRECT:
{
u64 oldblock, newblock;
data = decode48(data, &oldblock);
data = decode48(data, &newblock);
trace("%s: oldblock %Lx, newblock %Lx",
log_name[code], oldblock, newblock);
err = replay_bnode_redirect(rp, oldblock, newblock);
if (err)
return err;
break;
}
case LOG_BNODE_ROOT:
{
u64 root, left, right, rkey;
u8 count;
count = *data++;
data = decode48(data, &root);
data = decode48(data, &left);
data = decode48(data, &right);
data = decode48(data, &rkey);
trace("%s: count %u, root block %Lx, left %Lx, right %Lx, rkey %Lx",
log_name[code], count, root, left, right, rkey);
err = replay_bnode_root(rp, root, count, left, right, rkey);
if (err)
return err;
break;
}
case LOG_BNODE_SPLIT:
{
unsigned pos;
u64 src, dst;
data = decode16(data, &pos);
data = decode48(data, &src);
data = decode48(data, &dst);
trace("%s: pos %x, src %Lx, dst %Lx",
log_name[code], pos, src, dst);
err = replay_bnode_split(rp, src, pos, dst);
if (err)
return err;
break;
}
case LOG_BNODE_ADD:
case LOG_BNODE_UPDATE:
{
u64 child, parent, key;
data = decode48(data, &parent);
data = decode48(data, &child);
data = decode48(data, &key);
trace("%s: parent 0x%Lx, child 0x%Lx, key 0x%Lx",
log_name[code], parent, child, key);
if (code == LOG_BNODE_UPDATE)
err = replay_bnode_update(rp, parent, child, key);
else
err = replay_bnode_add(rp, parent, child, key);
if (err)
return err;
break;
}
case LOG_BNODE_MERGE:
{
u64 src, dst;
data = decode48(data, &src);
data = decode48(data, &dst);
trace("%s: src 0x%Lx, dst 0x%Lx",
log_name[code], src, dst);
err = replay_bnode_merge(rp, src, dst);
if (err)
return err;
break;
}
case LOG_BNODE_DEL:
{
unsigned count;
u64 bnode, key;
data = decode16(data, &count);
data = decode48(data, &bnode);
data = decode48(data, &key);
trace("%s: bnode 0x%Lx, count 0x%x, key 0x%Lx",
log_name[code], bnode, count, key);
err = replay_bnode_del(rp, bnode, key, count);
if (err)
return err;
break;
}
case LOG_BNODE_ADJUST:
{
u64 bnode, from, to;
data = decode48(data, &bnode);
data = decode48(data, &from);
data = decode48(data, &to);
trace("%s: bnode 0x%Lx, from 0x%Lx, to 0x%Lx",
log_name[code], bnode, from, to);
err = replay_bnode_adjust(rp, bnode, from, to);
if (err)
return err;
break;
}
case LOG_BNODE_FREE:
{
u64 block;
data = decode48(data, &block);
trace("%s: block %Lx", log_name[code], block);
blockput_free_unify(sb, vol_find_get_block(sb, block));
break;
}
case LOG_FREEBLOCKS:
{
u64 freeblocks;
data = decode48(data, &freeblocks);
trace("%s: freeblocks %llu", log_name[code],
freeblocks);
sb->freeblocks = freeblocks;
break;
}
case LOG_BALLOC:
case LOG_BFREE:
case LOG_BFREE_ON_UNIFY:
case LOG_BFREE_RELOG:
case LOG_LEAF_REDIRECT:
case LOG_LEAF_FREE:
case LOG_ORPHAN_ADD:
case LOG_ORPHAN_DEL:
case LOG_UNIFY:
case LOG_DELTA:
data += log_size[code] - sizeof(code);
break;
default:
tux3_err(rp->sb, "unrecognized log code 0x%x", code);
return -EINVAL;
}
}
return 0;
}
static int replay_log_stage2(struct replay *rp, struct buffer_head *logbuf)
{
struct sb *sb = rp->sb;
struct logblock *log = bufdata(logbuf);
unsigned char *data = log->data;
int err;
/* If log is before latest unify, those were already applied to FS. */
if (bufindex(logbuf) < rp->unify_index) {
// assert(0); /* older logs should already be freed */
return 0;
}
if (bufindex(logbuf) == rp->unify_index)
data = rp->unify_pos;
while (data < log->data + be16_to_cpu(log->bytes)) {
u8 code = *data++;
switch (code) {
case LOG_BALLOC:
case LOG_BFREE:
case LOG_BFREE_ON_UNIFY:
case LOG_BFREE_RELOG:
{
u64 block;
u32 count;
data = decode32(data, &count);
data = decode48(data, &block);
trace("%s: count %u, block %Lx",
log_name[code], count, block);
err = 0;
if (code == LOG_BALLOC)
err = replay_update_bitmap(rp, block, count, 1);
else if (code == LOG_BFREE_ON_UNIFY)
defer_bfree(sb, &sb->deunify, block, count);
else
err = replay_update_bitmap(rp, block, count, 0);
if (err)
return err;
break;
}
case LOG_LEAF_REDIRECT:
case LOG_BNODE_REDIRECT:
{
u64 oldblock, newblock;
data = decode48(data, &oldblock);
data = decode48(data, &newblock);
trace("%s: oldblock %Lx, newblock %Lx",
log_name[code], oldblock, newblock);
err = replay_update_bitmap(rp, newblock, 1, 1);
if (err)
return err;
if (code == LOG_LEAF_REDIRECT) {
err = replay_update_bitmap(rp, oldblock, 1, 0);
if (err)
return err;
} else {
/* newblock is not flushing yet */
defer_bfree(sb, &sb->deunify, oldblock, 1);
}
break;
}
case LOG_LEAF_FREE:
case LOG_BNODE_FREE:
{
u64 block;
data = decode48(data, &block);
trace("%s: block %Lx", log_name[code], block);
err = replay_update_bitmap(rp, block, 1, 0);
if (err)
return err;
break;
}
case LOG_BNODE_ROOT:
{
u64 root, left, right, rkey;
u8 count;
count = *data++;
data = decode48(data, &root);
data = decode48(data, &left);
data = decode48(data, &right);
data = decode48(data, &rkey);
trace("%s: count %u, root block %Lx, left %Lx, right %Lx, rkey %Lx",
log_name[code], count, root, left, right, rkey);
err = replay_update_bitmap(rp, root, 1, 1);
if (err)
return err;
break;
}
case LOG_BNODE_SPLIT:
{
unsigned pos;
u64 src, dst;
data = decode16(data, &pos);
data = decode48(data, &src);
data = decode48(data, &dst);
trace("%s: pos %x, src %Lx, dst %Lx",
log_name[code], pos, src, dst);
err = replay_update_bitmap(rp, dst, 1, 1);
if (err)
return err;
break;
}
case LOG_BNODE_MERGE:
{
u64 src, dst;
data = decode48(data, &src);
data = decode48(data, &dst);
trace("%s: src 0x%Lx, dst 0x%Lx",
log_name[code], src, dst);
err = replay_update_bitmap(rp, src, 1, 0);
if (err)
return err;
break;
}
case LOG_ORPHAN_ADD:
case LOG_ORPHAN_DEL:
{
unsigned version;
u64 inum;
data = decode16(data, &version);
data = decode48(data, &inum);
trace("%s: version 0x%x, inum 0x%Lx",
log_name[code], version, inum);
if (code == LOG_ORPHAN_ADD)
err = replay_orphan_add(rp, version, inum);
else
err = replay_orphan_del(rp, version, inum);
if (err)
return err;
break;
}
case LOG_FREEBLOCKS:
case LOG_BNODE_ADD:
case LOG_BNODE_UPDATE:
case LOG_BNODE_DEL:
case LOG_BNODE_ADJUST:
case LOG_UNIFY:
case LOG_DELTA:
data += log_size[code] - sizeof(code);
break;
default:
tux3_err(sb, "unrecognized log code 0x%x", code);
return -EINVAL;
}
}
return 0;
}
static int replay_logblock_mark(struct replay *rp, struct buffer_head *logbuf)
{
struct sb *sb = rp->sb;
block_t blocknr = rp->blocknrs[bufindex(logbuf)];
int err;
/*
* Log block address itself works as balloc log, and adjust
* bitmap and deunify even if logblocks is before latest
* unify, to prevent to be overwritten. (This must be after
* LOG_FREEBLOCKS replay if there is it.)
*
* Update bitmap after logical update replay, because this
* logblock address can be freed while replaying.
*/
trace("LOG BLOCK: logblock %Lx", blocknr);
err = replay_update_bitmap(rp, blocknr, 1, 1);
if (err)
return err;
/* Mark log block as deunify block */
defer_bfree(sb, &sb->deunify, blocknr, 1);
return 0;
}
static int replay_logblocks(struct replay *rp, replay_log_t replay_log_func)
{
struct sb *sb = rp->sb;
unsigned logcount = be32_to_cpu(sb->super.logcount);
int err;
sb->lognext = 0;
while (sb->lognext < logcount) {
trace("log block %i, blocknr %Lx, unify %Lx", sb->lognext, rp->blocknrs[sb->lognext], rp->unify_index);
log_next(sb);
err = replay_log_func(rp, sb->logbuf);
log_drop(sb);
if (err)
return err;
}
return 0;
}
/* Replay physical update like bnode, buffer state, etc. */
struct replay *replay_stage1(struct sb *sb)
{
struct replay *rp = replay_prepare(sb);
if (!IS_ERR(rp)) {
int err = replay_logblocks(rp, replay_log_stage1);
if (err) {
replay_done(rp);
return ERR_PTR(err);
}
}
return rp;
}
/* Replay logical update like bitmap data pages, etc. */
int replay_stage2(struct replay *rp)
{
int err = replay_logblocks(rp, replay_log_stage2);
if (err)
goto error;
err = replay_logblocks(rp, replay_logblock_mark);
if (err)
goto error;
/*
* Load orphan inodes into sb->orphan_add to decide what to do
* by caller.
*/
err = replay_load_orphan_inodes(rp);
if (err)
goto error;
return 0;
error:
replay_done(rp);
return err;
}
/*
* Replay pending frontend request like orphan, etc. I.e. this starts
* to modify FS.
*/
int replay_stage3(struct replay *rp, int apply)
{
struct sb *sb = rp->sb;
LIST_HEAD(orphan_in_otree);
list_splice_init(&rp->orphan_in_otree, &orphan_in_otree);
replay_done(rp);
/* Start logging after replay_done() */
replay_iput_orphan_inodes(sb, &orphan_in_otree, apply);
return 0;
}