/*
* Copyright(c) 2015-2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <setjmp.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <ndctl.h>
#include <limits.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <util/log.h>
#include <uuid/uuid.h>
#include <sys/types.h>
#include <util/json.h>
#include <util/size.h>
#include <util/util.h>
#include <util/bitmap.h>
#include <util/fletcher.h>
#include <ndctl/libndctl.h>
#include <ndctl/namespace.h>
#include <ccan/endian/endian.h>
#include <ccan/minmax/minmax.h>
#include <ccan/array_size/array_size.h>
#include <ccan/short_types/short_types.h>
struct check_opts {
bool verbose;
bool force;
bool repair;
bool logfix;
};
struct btt_chk {
char *path;
int fd;
uuid_t parent_uuid;
unsigned long long rawsize;
unsigned long long nlba;
int start_off;
int num_arenas;
long sys_page_size;
struct arena_info *arena;
struct check_opts *opts;
struct log_ctx ctx;
};
struct arena_info {
struct arena_map map;
u64 size; /* Total bytes for this arena */
u64 external_lba_start;
u32 internal_nlba;
u32 internal_lbasize;
u32 external_nlba;
u32 external_lbasize;
u32 nfree;
u16 version_major;
u16 version_minor;
u64 nextoff;
u64 infooff;
u64 dataoff;
u64 mapoff;
u64 logoff;
u64 info2off;
u32 flags;
int num;
struct btt_chk *bttc;
int log_index[2];
};
static sigjmp_buf sj_env;
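/*
 * If we take a SIGBUS while loading from the mmapped metadata (e.g. due
 * to media errors), jump back to the sigsetjmp in namespace_check, where
 * the check is aborted cleanly.
 */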
static void sigbus_hdl(int sig, siginfo_t *siginfo, void *ptr)
{
siglongjmp(sj_env, 1);
}
static int repair_msg(struct btt_chk *bttc)
{
info(bttc, " Run with --repair to make the changes\n");
return 0;
}
/**
 * btt_read_info - read an info block from a given offset
 * @bttc: the main btt_chk structure for this btt
 * @btt_sb: struct btt_sb where the info block will be copied into
 * @off: offset in the raw namespace to read the info block from
 *
 * This uses 'pread' to read the info block rather than mmap+loads,
 * as it is called before the mappings are set up.
 */
static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, u64 off)
{
ssize_t size;
size = pread(bttc->fd, btt_sb, sizeof(*btt_sb), off);
if (size < 0) {
err(bttc, "unable to read first info block: %s\n",
strerror(errno));
return -errno;
}
if (size != sizeof(*btt_sb)) {
err(bttc, "short read of first info block: %ld\n", size);
return -ENXIO;
}
return 0;
}
/**
 * btt_write_info - write an info block to the given offset
 * @bttc: the main btt_chk structure for this btt
 * @btt_sb: struct btt_sb where the info block will be copied from
 * @off: offset in the raw namespace to write the info block to
 *
 * This uses 'pwrite' to write the info block rather than mmap+stores,
 * as it is called before the mappings are set up.
 */
static int btt_write_info(struct btt_chk *bttc, struct btt_sb *btt_sb, u64 off)
{
ssize_t size;
int rc;
if (!bttc->opts->repair) {
err(bttc, "BTT info block at offset %#lx needs to be restored\n",
off);
repair_msg(bttc);
return -EIO;
}
info(bttc, "Restoring BTT info block at offset %#lx\n", off);
size = pwrite(bttc->fd, btt_sb, sizeof(*btt_sb), off);
if (size < 0) {
err(bttc, "unable to write the info block: %s\n",
strerror(errno));
return -errno;
}
if (size != sizeof(*btt_sb)) {
err(bttc, "short write of the info block: %ld\n", size);
return -ENXIO;
}
rc = fsync(bttc->fd);
if (rc < 0)
return -errno;
return 0;
}
/**
* btt_copy_to_info2 - restore the backup info block using the main one
* @a: the arena_info handle for this arena
*
* Called when a corrupted backup info block is detected. Copies the
* main info block over to the backup location. This is done using
* mmap + stores, and thus needs a msync.
*/
static int btt_copy_to_info2(struct arena_info *a)
{
void *ms_align;
size_t ms_size;
if (!a->bttc->opts->repair) {
err(a->bttc, "Arena %d: BTT info2 needs to be restored\n",
a->num);
return repair_msg(a->bttc);
}
printf("Arena %d: Restoring BTT info2\n", a->num);
memcpy(a->map.info2, a->map.info, BTT_INFO_SIZE);
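	/*
	 * msync requires a page-aligned address, so sync the whole page(s)
	 * containing the info2 block
	 */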
ms_align = (void *)rounddown((u64)a->map.info2, a->bttc->sys_page_size);
ms_size = max(BTT_INFO_SIZE, a->bttc->sys_page_size);
if (msync(ms_align, ms_size, MS_SYNC) < 0)
return -errno;
return 0;
}
/*
* btt_map_lookup - given a pre-map Arena Block Address, return the post-map ABA
* @a: the arena_info handle for this arena
* @lba: the logical block address for which we are performing the lookup
*
* This will correctly account for map entries in the 'initial state'
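 *
 * For illustration: the top two bits of a raw map entry are the 'zero'
 * and 'error' flags. Both clear marks an entry in the initial state
 * (identity mapping); both set (MAP_ENT_NORMAL) marks a normal entry:
 *
 *   map[5] = 0x00000000 -> initial state, lookup returns 5
 *   map[5] = 0xc0000007 -> normal entry, lookup returns 7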
*/
static u32 btt_map_lookup(struct arena_info *a, u32 lba)
{
u32 raw_mapping;
raw_mapping = le32_to_cpu(a->map.map[lba]);
if (raw_mapping & MAP_ENT_NORMAL)
return raw_mapping & MAP_LBA_MASK;
else
return lba;
}
static int btt_map_write(struct arena_info *a, u32 lba, u32 mapping)
{
void *ms_align;
if (!a->bttc->opts->repair) {
err(a->bttc,
"Arena %d: map[%#x] needs to be updated to %#x\n",
a->num, lba, mapping);
return repair_msg(a->bttc);
}
info(a->bttc, "Arena %d: Updating map[%#x] to %#x\n", a->num,
lba, mapping);
/*
 * We want to set neither the Z nor the E flag; in the actual layout,
 * this means setting both bit positions to '1' to
* indicate a 'normal' map entry
*/
mapping |= MAP_ENT_NORMAL;
a->map.map[lba] = cpu_to_le32(mapping);
ms_align = (void *)rounddown((u64)&a->map.map[lba],
a->bttc->sys_page_size);
if (msync(ms_align, a->bttc->sys_page_size, MS_SYNC) < 0)
return -errno;
return 0;
}
static void btt_log_group_read(struct arena_info *a, u32 lane,
struct log_group *log)
{
memcpy(log, &a->map.log[lane], LOG_GRP_SIZE);
}
static void btt_log_group_write(struct arena_info *a, u32 lane,
struct log_group *log)
{
memcpy(&a->map.log[lane], log, LOG_GRP_SIZE);
}
static u32 log_seq(struct log_group *log, int log_idx)
{
return le32_to_cpu(log->ent[log_idx].seq);
}
/*
* This function accepts two log entries, and uses the sequence number to
* find the 'older' entry. The return value indicates which of the two was
* the 'old' entry
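 *
 * Sequence numbers cycle through 1->2->3->1 ('0' marks an uninitialized
 * slot), so the newer entry is the one a single increment ahead of the
 * other. For example:
 *   (seq0, seq1) = (1, 2) -> old = 0, since 2 follows 1
 *   (seq0, seq1) = (2, 1) -> old = 1, since 2 follows 1
 *   (seq0, seq1) = (1, 3) -> old = 1, since the cycle wrapped 3 -> 1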
*/
static int btt_log_get_old(struct arena_info *a, struct log_group *log)
{
int idx0 = a->log_index[0];
int idx1 = a->log_index[1];
int old;
if (log_seq(log, idx0) == 0) {
log->ent[idx0].seq = cpu_to_le32(1);
return 0;
}
if (log_seq(log, idx0) < log_seq(log, idx1)) {
if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
old = 0;
else
old = 1;
} else {
if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
old = 1;
else
old = 0;
}
return old;
}
static int btt_log_read(struct arena_info *a, u32 lane, struct log_entry *ent)
{
int new_ent;
struct log_group log;
if (ent == NULL)
return -EINVAL;
btt_log_group_read(a, lane, &log);
new_ent = 1 - btt_log_get_old(a, &log);
memcpy(ent, &log.ent[a->log_index[new_ent]], LOG_ENT_SIZE);
return 0;
}
static int btt_checksum_verify(struct btt_sb *btt_sb)
{
uint64_t sum;
le64 sum_save;
BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
sum_save = btt_sb->checksum;
btt_sb->checksum = 0;
	sum = fletcher64(btt_sb, sizeof(*btt_sb), 1);
	/* restore the checksum in the buffer regardless of the result */
	btt_sb->checksum = sum_save;
	if (sum != sum_save)
		return 1;
	return 0;
}
/*
* Never pass a mmapped buffer to this as it will attempt to write to
 * the buffer, and we want writes to happen only in a controlled fashion.
* In the non --repair case, even if such a buffer is passed, the write will
* result in a fault due to the readonly mmap flags.
*/
static int btt_info_verify(struct btt_chk *bttc, struct btt_sb *btt_sb)
{
if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
return -ENXIO;
if (!uuid_is_null(btt_sb->parent_uuid))
if (uuid_compare(bttc->parent_uuid, btt_sb->parent_uuid) != 0)
return -ENXIO;
if (btt_checksum_verify(btt_sb))
return -ENXIO;
return 0;
}
static int btt_info_read_verify(struct btt_chk *bttc, struct btt_sb *btt_sb,
u64 off)
{
int rc;
rc = btt_read_info(bttc, btt_sb, off);
if (rc)
return rc;
rc = btt_info_verify(bttc, btt_sb);
if (rc)
return rc;
return 0;
}
enum btt_errcodes {
BTT_OK = 0,
BTT_LOG_EQL_SEQ = 0x100,
BTT_LOG_OOB_SEQ,
BTT_LOG_OOB_LBA,
BTT_LOG_OOB_OLD,
BTT_LOG_OOB_NEW,
BTT_LOG_MAP_ERR,
BTT_MAP_OOB,
BTT_BITMAP_ERROR,
BTT_LOGFIX_ERR,
};
static void btt_xlat_status(struct arena_info *a, int errcode)
{
	switch (errcode) {
case BTT_OK:
break;
case BTT_LOG_EQL_SEQ:
err(a->bttc,
"arena %d: found a pair of log entries with the same sequence number\n",
a->num);
break;
case BTT_LOG_OOB_SEQ:
err(a->bttc,
"arena %d: found a log entry with an out of bounds sequence number\n",
a->num);
break;
case BTT_LOG_OOB_LBA:
err(a->bttc,
"arena %d: found a log entry with an out of bounds LBA\n",
a->num);
break;
case BTT_LOG_OOB_OLD:
err(a->bttc,
"arena %d: found a log entry with an out of bounds 'old' mapping\n",
a->num);
break;
case BTT_LOG_OOB_NEW:
err(a->bttc,
"arena %d: found a log entry with an out of bounds 'new' mapping\n",
a->num);
break;
case BTT_LOG_MAP_ERR:
info(a->bttc,
"arena %d: found a log entry that does not match with a map entry\n",
a->num);
break;
case BTT_MAP_OOB:
err(a->bttc,
"arena %d: found a map entry that is out of bounds\n",
a->num);
break;
case BTT_BITMAP_ERROR:
err(a->bttc,
"arena %d: bitmap error: internal blocks are incorrectly referenced\n",
a->num);
break;
case BTT_LOGFIX_ERR:
err(a->bttc,
"arena %d: rewrite-log error: log may be in an unknown/unrecoverable state\n",
a->num);
break;
default:
err(a->bttc, "arena %d: unknown error: %d\n",
a->num, errcode);
}
}
/* Check that log entries are self consistent */
static int btt_check_log_entries(struct arena_info *a)
{
int idx0 = a->log_index[0];
int idx1 = a->log_index[1];
unsigned int i;
int rc = 0;
/*
* First, check both 'slots' for sequence numbers being distinct
* and in bounds
*/
for (i = 0; i < a->nfree; i++) {
struct log_group *log = &a->map.log[i];
if (log_seq(log, idx0) == log_seq(log, idx1))
return BTT_LOG_EQL_SEQ;
if (log_seq(log, idx0) > 3 || log_seq(log, idx1) > 3)
return BTT_LOG_OOB_SEQ;
}
/*
* Next, check only the 'new' slot in each lane for the remaining
* fields being in bounds
*/
for (i = 0; i < a->nfree; i++) {
struct log_entry ent;
rc = btt_log_read(a, i, &ent);
if (rc)
return rc;
		if (le32_to_cpu(ent.lba) >= a->external_nlba)
			return BTT_LOG_OOB_LBA;
		if (le32_to_cpu(ent.old_map) >= a->internal_nlba)
			return BTT_LOG_OOB_OLD;
		if (le32_to_cpu(ent.new_map) >= a->internal_nlba)
			return BTT_LOG_OOB_NEW;
}
return rc;
}
/* Check that map entries are self consistent */
static int btt_check_map_entries(struct arena_info *a)
{
unsigned int i;
u32 mapping;
for (i = 0; i < a->external_nlba; i++) {
mapping = btt_map_lookup(a, i);
if (mapping >= a->internal_nlba)
return BTT_MAP_OOB;
}
return 0;
}
/* Check that each flog entry has the correct corresponding map entry */
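/*
 * Worked example: if a flog entry records (lba = 0x10, old_map = 0x42,
 * new_map = 0x99), but map[0x10] still reads 0x42, the flog write
 * completed while the subsequent map update did not; rewriting
 * map[0x10] to 0x99 replays the interrupted transaction.
 */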
static int btt_check_log_map(struct arena_info *a)
{
unsigned int i;
u32 mapping;
int rc = 0, rc_saved = 0;
for (i = 0; i < a->nfree; i++) {
struct log_entry ent;
rc = btt_log_read(a, i, &ent);
if (rc)
return rc;
		mapping = btt_map_lookup(a, le32_to_cpu(ent.lba));
		/*
		 * Case where the flog was written, but the map couldn't be
		 * updated. The kernel should also be able to detect and
		 * fix this condition.
		 */
		if (le32_to_cpu(ent.new_map) != mapping &&
				le32_to_cpu(ent.old_map) == mapping) {
			info(a->bttc,
				"arena %d: log[%d].new_map (%#x) doesn't match map[%#x] (%#x)\n",
				a->num, i, le32_to_cpu(ent.new_map),
				le32_to_cpu(ent.lba), mapping);
			rc = btt_map_write(a, le32_to_cpu(ent.lba),
					le32_to_cpu(ent.new_map));
			if (rc)
				rc_saved = rc;
		}
}
return rc_saved ? BTT_LOG_MAP_ERR : 0;
}
static int btt_check_info2(struct arena_info *a)
{
/*
* Repair info2 if needed. The main info-block can be trusted
* as it has been verified during arena discovery
*/
	if (memcmp(a->map.info2, a->map.info, BTT_INFO_SIZE))
return btt_copy_to_info2(a);
return 0;
}
/*
* This will create a bitmap where each bit corresponds to an internal
* 'block'. Between the BTT map and flog (representing 'free' blocks),
* every single internal block must be represented exactly once. This
* check will detect cases where either one or more blocks are never
* referenced, or if a block is referenced more than once.
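 *
 * For illustration: with external_nlba = 4 and nfree = 2 (and thus
 * internal_nlba = 6), a consistent arena might have map entries
 * {4, 1, 2, 5} and flog 'old' (free) blocks {0, 3} -- each of the six
 * internal blocks is referenced exactly once, and the bitmap ends up
 * full.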
*/
static int btt_check_bitmap(struct arena_info *a)
{
unsigned long *bm;
u32 i, btt_mapping;
int rc = BTT_BITMAP_ERROR;
bm = bitmap_alloc(a->internal_nlba);
if (bm == NULL)
return -ENOMEM;
	/* mark the bitmap for all 'external_nlba' map entries */
for (i = 0; i < a->external_nlba; i++) {
btt_mapping = btt_map_lookup(a, i);
if (test_bit(btt_mapping, bm)) {
info(a->bttc,
"arena %d: internal block %#x is referenced by two map entries\n",
a->num, btt_mapping);
goto out;
}
bitmap_set(bm, btt_mapping, 1);
}
	/* mark the bitmap for all 'nfree' flog entries */
for (i = 0; i < a->nfree; i++) {
struct log_entry ent;
rc = btt_log_read(a, i, &ent);
if (rc)
goto out;
		if (test_bit(le32_to_cpu(ent.old_map), bm)) {
			info(a->bttc,
				"arena %d: internal block %#x is referenced by two map/log entries\n",
				a->num, le32_to_cpu(ent.old_map));
			rc = BTT_BITMAP_ERROR;
			goto out;
		}
		bitmap_set(bm, le32_to_cpu(ent.old_map), 1);
}
/* check that the bitmap is full */
if (!bitmap_full(bm, a->internal_nlba))
rc = BTT_BITMAP_ERROR;
out:
free(bm);
return rc;
}
static int btt_rewrite_log(struct arena_info *a)
{
struct log_group log;
int rc;
u32 i;
info(a->bttc, "arena %d: rewriting log\n", a->num);
/*
 * To rewrite the log, we implicitly use the 'new' padding scheme of
 * (0, 1), resetting the log to a completely initial state (i.e.
 * slot-0 contains a made-up entry containing the 'free' block from
 * the existing current log entry, and a sequence number of '1'). All
 * other slots are zeroed.
*
* This way of rewriting the log is the most flexible as it can be
* (ab)used to convert a new padding format back to the old one.
* Since it only recreates slot-0, which is common between both
* existing formats, an older kernel will simply initialize the free
* list using those slot-0 entries, and run with it as though slot-2
* is the other valid slot.
*/
memset(&log, 0, LOG_GRP_SIZE);
for (i = 0; i < a->nfree; i++) {
struct log_entry ent;
rc = btt_log_read(a, i, &ent);
if (rc)
return BTT_LOGFIX_ERR;
log.ent[0].lba = ent.lba;
log.ent[0].old_map = ent.old_map;
log.ent[0].new_map = ent.new_map;
		log.ent[0].seq = cpu_to_le32(1);
btt_log_group_write(a, i, &log);
}
return 0;
}
static int btt_check_arenas(struct btt_chk *bttc)
{
struct arena_info *a = NULL;
int i, rc;
	for (i = 0; i < bttc->num_arenas; i++) {
info(bttc, "checking arena %d\n", i);
a = &bttc->arena[i];
rc = btt_check_log_entries(a);
if (rc)
break;
rc = btt_check_map_entries(a);
if (rc)
break;
rc = btt_check_log_map(a);
if (rc)
break;
rc = btt_check_info2(a);
if (rc)
break;
/*
* bitmap test has to be after check_log_map so that any
* pending log updates have been performed. Otherwise the
* bitmap test may result in a false positive
*/
rc = btt_check_bitmap(a);
if (rc)
break;
if (bttc->opts->logfix) {
rc = btt_rewrite_log(a);
if (rc)
break;
}
}
if (a && rc != BTT_OK) {
btt_xlat_status(a, rc);
return -ENXIO;
}
return 0;
}
/*
* This copies over information from the info block to the arena_info struct.
* The main difference is that all the offsets (infooff, mapoff etc) were
* relative to the arena in the info block, but in arena_info, we use
* arena_off to make these offsets absolute, i.e. relative to the start of
* the raw namespace.
*/
static int btt_parse_meta(struct arena_info *arena, struct btt_sb *btt_sb,
u64 arena_off)
{
arena->internal_nlba = le32_to_cpu(btt_sb->internal_nlba);
arena->internal_lbasize = le32_to_cpu(btt_sb->internal_lbasize);
arena->external_nlba = le32_to_cpu(btt_sb->external_nlba);
arena->external_lbasize = le32_to_cpu(btt_sb->external_lbasize);
arena->nfree = le32_to_cpu(btt_sb->nfree);
if (arena->internal_nlba - arena->external_nlba != arena->nfree)
return -ENXIO;
if (arena->internal_lbasize != arena->external_lbasize)
return -ENXIO;
arena->version_major = le16_to_cpu(btt_sb->version_major);
arena->version_minor = le16_to_cpu(btt_sb->version_minor);
arena->nextoff = (btt_sb->nextoff == 0) ? 0 : (arena_off +
le64_to_cpu(btt_sb->nextoff));
arena->infooff = arena_off;
arena->dataoff = arena_off + le64_to_cpu(btt_sb->dataoff);
arena->mapoff = arena_off + le64_to_cpu(btt_sb->mapoff);
arena->logoff = arena_off + le64_to_cpu(btt_sb->logoff);
arena->info2off = arena_off + le64_to_cpu(btt_sb->info2off);
arena->size = (le64_to_cpu(btt_sb->nextoff) > 0)
? (le64_to_cpu(btt_sb->nextoff))
: (arena->info2off - arena->infooff + BTT_INFO_SIZE);
arena->flags = le32_to_cpu(btt_sb->flags);
	if (arena->flags & IB_FLAG_ERROR_MASK) {
err(arena->bttc, "Info block error flag is set, aborting\n");
return -ENXIO;
}
return 0;
}
static bool ent_is_padding(struct log_entry *ent)
{
return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
&& (ent->seq == 0);
}
/*
* Detecting valid log indices: We read a log group, and iterate over its
* four slots. We expect that a padding slot will be all-zeroes, and use this
* to detect a padding slot vs. an actual entry.
*
* If a log_group is in the initial state, i.e. hasn't been used since the
* creation of this BTT layout, it will have three of the four slots with
* zeroes. We skip over these log_groups for the detection of log_index. If
* all log_groups are in the initial state (i.e. the BTT has never been
* written to), it is safe to assume the 'new format' of log entries in slots
* (0, 1).
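 *
 * For illustration, each 64B log group consists of four 16B slots.
 * The original padded format carried valid entries in slots 0 and 2,
 * with slots 1 and 3 as padding, i.e. log_index (0, 2); the packed
 * format carries them in slots 0 and 1, i.e. log_index (0, 1).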
*/
static int log_set_indices(struct arena_info *arena)
{
bool idx_set = false, initial_state = true;
int log_index[2] = {-1, -1};
struct log_group log;
int j, next_idx = 0;
u32 pad_count = 0;
u32 i;
for (i = 0; i < arena->nfree; i++) {
btt_log_group_read(arena, i, &log);
for (j = 0; j < 4; j++) {
if (!idx_set) {
if (ent_is_padding(&log.ent[j])) {
pad_count++;
continue;
} else {
/* Skip if index has been recorded */
if ((next_idx == 1) &&
(j == log_index[0]))
continue;
/* valid entry, record index */
log_index[next_idx] = j;
next_idx++;
}
if (next_idx == 2) {
/* two valid entries found */
idx_set = true;
} else if (next_idx > 2) {
/* too many valid indices */
return -ENXIO;
}
} else {
/*
* once the indices have been set, just verify
* that all subsequent log groups are either in
* their initial state or follow the same
* indices.
*/
if (j == log_index[0]) {
/* entry must be 'valid' */
if (ent_is_padding(&log.ent[j]))
return -ENXIO;
} else if (j == log_index[1]) {
;
/*
* log_index[1] can be padding if the
* lane never got used and it is still
* in the initial state (three 'padding'
* entries)
*/
} else {
/* entry must be invalid (padding) */
if (!ent_is_padding(&log.ent[j]))
return -ENXIO;
}
}
}
/*
 * If any of the log_groups has more than one valid,
 * non-padding entry, then we are no longer in the
 * initial state
*/
if (pad_count < 3)
initial_state = false;
pad_count = 0;
}
if (!initial_state && !idx_set)
return -ENXIO;
/*
* If all the entries in the log were in the initial state,
* assume new padding scheme
*/
if (initial_state)
log_index[1] = 1;
/*
* Only allow the known permutations of log/padding indices,
* i.e. (0, 1), and (0, 2)
*/
if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
; /* known index possibilities */
else {
err(arena->bttc, "Found an unknown padding scheme\n");
return -ENXIO;
}
arena->log_index[0] = log_index[0];
arena->log_index[1] = log_index[1];
info(arena->bttc, "arena[%d]: log_index_0 = %d\n",
arena->num, log_index[0]);
info(arena->bttc, "arena[%d]: log_index_1 = %d\n",
arena->num, log_index[1]);
return 0;
}
static int btt_discover_arenas(struct btt_chk *bttc)
{
int ret = 0;
struct arena_info *arena;
struct btt_sb *btt_sb;
size_t remaining = bttc->rawsize;
size_t cur_off = bttc->start_off;
u64 cur_nlba = 0;
int i = 0;
btt_sb = calloc(1, sizeof(*btt_sb));
if (!btt_sb)
return -ENOMEM;
while (remaining) {
/* Alloc memory for arena */
arena = realloc(bttc->arena, (i + 1) * sizeof(*arena));
if (!arena) {
ret = -ENOMEM;
goto out;
} else {
bttc->arena = arena;
arena = &bttc->arena[i];
/* zero the new memory */
memset(arena, 0, sizeof(*arena));
}
arena->infooff = cur_off;
ret = btt_read_info(bttc, btt_sb, cur_off);
if (ret)
goto out;
if (btt_info_verify(bttc, btt_sb) != 0) {
u64 offset;
/* Try to find the backup info block */
if (remaining <= ARENA_MAX_SIZE)
offset = rounddown(bttc->rawsize, SZ_4K) -
BTT_INFO_SIZE;
else
offset = cur_off + ARENA_MAX_SIZE -
BTT_INFO_SIZE;
info(bttc,
"Arena %d: Attempting recover info-block using info2\n", i);
ret = btt_read_info(bttc, btt_sb, offset);
if (ret) {
err(bttc, "Unable to read backup info block (offset %#lx)\n",
offset);
goto out;
}
ret = btt_info_verify(bttc, btt_sb);
if (ret) {
err(bttc, "Backup info block (offset %#lx) verification failed\n",
offset);
goto out;
}
ret = btt_write_info(bttc, btt_sb, cur_off);
if (ret) {
err(bttc, "Restoration of the info block failed: %s (%d)\n",
strerror(abs(ret)), ret);
goto out;
}
}
arena->num = i;
arena->bttc = bttc;
arena->external_lba_start = cur_nlba;
ret = btt_parse_meta(arena, btt_sb, cur_off);
if (ret) {
err(bttc, "Problem parsing arena[%d] metadata\n", i);
goto out;
}
remaining -= arena->size;
cur_off += arena->size;
cur_nlba += arena->external_nlba;
i++;
if (arena->nextoff == 0)
break;
}
bttc->num_arenas = i;
bttc->nlba = cur_nlba;
info(bttc, "found %d BTT arena%s\n", bttc->num_arenas,
(bttc->num_arenas > 1) ? "s" : "");
free(btt_sb);
return ret;
out:
free(bttc->arena);
free(btt_sb);
return ret;
}
static int btt_create_mappings(struct btt_chk *bttc)
{
struct arena_info *a;
int mmap_flags;
int i;
if (!bttc->opts->repair)
mmap_flags = PROT_READ;
else
mmap_flags = PROT_READ|PROT_WRITE;
for (i = 0; i < bttc->num_arenas; i++) {
a = &bttc->arena[i];
a->map.info_len = BTT_INFO_SIZE;
a->map.info = mmap(NULL, a->map.info_len, mmap_flags,
MAP_SHARED, bttc->fd, a->infooff);
if (a->map.info == MAP_FAILED) {
err(bttc, "mmap arena[%d].info [sz = %#lx, off = %#lx] failed: %s\n",
i, a->map.info_len, a->infooff, strerror(errno));
return -errno;
}
a->map.data_len = a->mapoff - a->dataoff;
a->map.data = mmap(NULL, a->map.data_len, mmap_flags,
MAP_SHARED, bttc->fd, a->dataoff);
if (a->map.data == MAP_FAILED) {
err(bttc, "mmap arena[%d].data [sz = %#lx, off = %#lx] failed: %s\n",
i, a->map.data_len, a->dataoff, strerror(errno));
return -errno;
}
a->map.map_len = a->logoff - a->mapoff;
a->map.map = mmap(NULL, a->map.map_len, mmap_flags,
MAP_SHARED, bttc->fd, a->mapoff);
if (a->map.map == MAP_FAILED) {
err(bttc, "mmap arena[%d].map [sz = %#lx, off = %#lx] failed: %s\n",
i, a->map.map_len, a->mapoff, strerror(errno));
return -errno;
}
a->map.log_len = a->info2off - a->logoff;
a->map.log = mmap(NULL, a->map.log_len, mmap_flags,
MAP_SHARED, bttc->fd, a->logoff);
if (a->map.log == MAP_FAILED) {
err(bttc, "mmap arena[%d].log [sz = %#lx, off = %#lx] failed: %s\n",
i, a->map.log_len, a->logoff, strerror(errno));
return -errno;
}
a->map.info2_len = BTT_INFO_SIZE;
a->map.info2 = mmap(NULL, a->map.info2_len, mmap_flags,
MAP_SHARED, bttc->fd, a->info2off);
if (a->map.info2 == MAP_FAILED) {
err(bttc, "mmap arena[%d].info2 [sz = %#lx, off = %#lx] failed: %s\n",
i, a->map.info2_len, a->info2off, strerror(errno));
return -errno;
}
}
return 0;
}
static void btt_remove_mappings(struct btt_chk *bttc)
{
struct arena_info *a;
int i;
for (i = 0; i < bttc->num_arenas; i++) {
a = &bttc->arena[i];
if (a->map.info)
munmap(a->map.info, a->map.info_len);
if (a->map.data)
munmap(a->map.data, a->map.data_len);
if (a->map.map)
munmap(a->map.map, a->map.map_len);
if (a->map.log)
munmap(a->map.log, a->map.log_len);
if (a->map.info2)
munmap(a->map.info2, a->map.info2_len);
}
}
static int btt_sb_get_expected_offset(struct btt_sb *btt_sb)
{
u16 version_major, version_minor;
version_major = le16_to_cpu(btt_sb->version_major);
version_minor = le16_to_cpu(btt_sb->version_minor);
if (version_major == 1 && version_minor == 1)
return BTT1_START_OFFSET;
else if (version_major == 2 && version_minor == 0)
return BTT2_START_OFFSET;
else
return -ENXIO;
}
static int __btt_recover_first_sb(struct btt_chk *bttc, int off)
{
int rc, est_arenas = 0;
u64 offset, remaining;
struct btt_sb *btt_sb;
/* Estimate the number of arenas */
remaining = bttc->rawsize - off;
while (remaining) {
if (remaining < ARENA_MIN_SIZE && est_arenas == 0)
return -EINVAL;
if (remaining > ARENA_MAX_SIZE) {
/* full-size arena */
remaining -= ARENA_MAX_SIZE;
est_arenas++;
continue;
}
if (remaining < ARENA_MIN_SIZE) {
/* 'remaining' was too small for another arena */
break;
} else {
/* last, short arena */
remaining = 0;
est_arenas++;
break;
}
}
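	/*
	 * For illustration, with the BTT limits of ARENA_MIN_SIZE (16MiB)
	 * and ARENA_MAX_SIZE (512GiB), a namespace of 1TiB + 16MiB would be
	 * estimated as two full-size arenas plus one short arena
	 * (est_arenas = 3, remaining = 0).
	 */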
info(bttc, "estimated arenas: %d, remaining bytes: %#lx\n",
est_arenas, remaining);
btt_sb = malloc(2 * sizeof(*btt_sb));
if (btt_sb == NULL)
return -ENOMEM;
/* Read the original first info block into btt_sb[0] */
rc = btt_read_info(bttc, &btt_sb[0], off);
if (rc)
goto out;
	/* Attempt 1: try recovery from the expected end of the first arena */
if (est_arenas == 1)
offset = rounddown(bttc->rawsize - remaining, SZ_4K) -
BTT_INFO_SIZE;
else
offset = ARENA_MAX_SIZE - BTT_INFO_SIZE + off;
info(bttc, "Attempting recover info-block from end-of-arena offset %#lx\n",
offset);
rc = btt_info_read_verify(bttc, &btt_sb[1], offset);
if (rc == 0) {
int expected_offset = btt_sb_get_expected_offset(&btt_sb[1]);
/*
* The fact that the btt_sb is self-consistent doesn't tell us
* what BTT version it was, if restoring from the end of the
* arena. (i.e. a consistent sb may be found for any valid
* start offset). Use the version information in the sb to
* determine what the expected start offset is.
*/
if ((expected_offset < 0) || (expected_offset != off)) {
rc = -ENXIO;
goto out;
}
rc = btt_write_info(bttc, &btt_sb[1], off);
goto out;
}
/*
* Attempt 2: From the very end of 'rawsize', try to copy the fields
* that are constant in every arena (only valid when multiple arenas
* are present)
*/
if (est_arenas > 1) {
offset = rounddown(bttc->rawsize - remaining, SZ_4K) -
BTT_INFO_SIZE;
info(bttc, "Attempting to recover info-block from end offset %#lx\n",
offset);
rc = btt_info_read_verify(bttc, &btt_sb[1], offset);
if (rc)
goto out;
/* copy over the arena0 specific fields from btt_sb[0] */
btt_sb[1].flags = btt_sb[0].flags;
btt_sb[1].external_nlba = btt_sb[0].external_nlba;
btt_sb[1].internal_nlba = btt_sb[0].internal_nlba;
btt_sb[1].nextoff = btt_sb[0].nextoff;
btt_sb[1].dataoff = btt_sb[0].dataoff;
btt_sb[1].mapoff = btt_sb[0].mapoff;
btt_sb[1].logoff = btt_sb[0].logoff;
btt_sb[1].info2off = btt_sb[0].info2off;
btt_sb[1].checksum = btt_sb[0].checksum;
rc = btt_info_verify(bttc, &btt_sb[1]);
if (rc == 0) {
rc = btt_write_info(bttc, &btt_sb[1], off);
goto out;
}
}
/*
* Attempt 3: use info2off as-is, and check if we find a valid info
* block at that location.
*/
	offset = le64_to_cpu(btt_sb[0].info2off);
if (offset > min(bttc->rawsize - BTT_INFO_SIZE,
ARENA_MAX_SIZE - BTT_INFO_SIZE + off)) {
rc = -ENXIO;
goto out;
}
if (offset) {
info(bttc, "Attempting to recover info-block from info2 offset %#lx\n",
offset);
rc = btt_info_read_verify(bttc, &btt_sb[1],
offset + off);
if (rc == 0) {
rc = btt_write_info(bttc, &btt_sb[1], off);
goto out;
}
} else
rc = -ENXIO;
out:
free(btt_sb);
return rc;
}
static int btt_recover_first_sb(struct btt_chk *bttc)
{
int offsets[BTT_NUM_OFFSETS] = {
BTT1_START_OFFSET,
BTT2_START_OFFSET,
};
int i, rc;
for (i = 0; i < BTT_NUM_OFFSETS; i++) {
rc = __btt_recover_first_sb(bttc, offsets[i]);
if (rc == 0) {
bttc->start_off = offsets[i];
return rc;
}
}
return rc;
}
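/*
 * namespace_check - check (and optionally repair) the BTT metadata of a
 * namespace
 *
 * The flow is: disable an active namespace (only with --force), enable
 * raw mode so the BTT metadata is accessible, open the raw block device,
 * locate or recover the BTT info blocks, discover and mmap the arenas,
 * deduce the log entry indices, and finally run the per-arena
 * consistency checks. Returns zero for a clean (or successfully
 * repaired) BTT, negative errno otherwise.
 */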
int namespace_check(struct ndctl_namespace *ndns, bool verbose, bool force,
bool repair, bool logfix)
{
const char *devname = ndctl_namespace_get_devname(ndns);
struct check_opts __opts = {
.verbose = verbose,
.force = force,
.repair = repair,
.logfix = logfix,
}, *opts = &__opts;
int raw_mode, rc, disabled_flag = 0, open_flags;
struct btt_sb *btt_sb;
struct btt_chk *bttc;
struct sigaction act;
char path[50];
int i;
bttc = calloc(1, sizeof(*bttc));
if (bttc == NULL)
return -ENOMEM;
log_init(&bttc->ctx, devname, "NDCTL_CHECK_NAMESPACE");
if (opts->verbose)
bttc->ctx.log_priority = LOG_DEBUG;
memset(&act, 0, sizeof(act));
act.sa_sigaction = sigbus_hdl;
act.sa_flags = SA_SIGINFO;
if (sigaction(SIGBUS, &act, 0)) {
err(bttc, "Unable to set sigaction\n");
rc = -errno;
goto out_bttc;
}
if (opts->logfix) {
if (!opts->repair) {
err(bttc, "--rewrite-log also requires --repair\n");
rc = -EINVAL;
goto out_bttc;
}
info(bttc,
"WARNING: interruption may cause unrecoverable metadata corruption\n");
}
bttc->opts = opts;
bttc->sys_page_size = sysconf(_SC_PAGESIZE);
bttc->rawsize = ndctl_namespace_get_size(ndns);
ndctl_namespace_get_uuid(ndns, bttc->parent_uuid);
info(bttc, "checking %s\n", devname);
if (ndctl_namespace_is_active(ndns)) {
if (opts->force) {
rc = ndctl_namespace_disable_safe(ndns);
if (rc)
goto out_bttc;
disabled_flag = 1;
} else {
err(bttc, "%s: check aborted, namespace online\n",
devname);
rc = -EBUSY;
goto out_bttc;
}
}
/* In typical usage, the current raw_mode should be false. */
raw_mode = ndctl_namespace_get_raw_mode(ndns);
/*
* Putting the namespace into raw mode will allow us to access
* the btt metadata.
*/
rc = ndctl_namespace_set_raw_mode(ndns, 1);
if (rc < 0) {
err(bttc, "%s: failed to set the raw mode flag: %s (%d)\n",
devname, strerror(abs(rc)), rc);
goto out_ns;
}
/*
* Now enable the namespace. This will result in a pmem device
* node showing up in /dev that is in raw mode.
*/
rc = ndctl_namespace_enable(ndns);
if (rc != 0) {
err(bttc, "%s: failed to enable in raw mode: %s (%d)\n",
devname, strerror(abs(rc)), rc);
goto out_ns;
}
	snprintf(path, sizeof(path), "/dev/%s",
		ndctl_namespace_get_block_device(ndns));
bttc->path = path;
btt_sb = malloc(sizeof(*btt_sb));
if (btt_sb == NULL) {
rc = -ENOMEM;
goto out_ns;
}
if (!bttc->opts->repair)
open_flags = O_RDONLY|O_EXCL;
else
open_flags = O_RDWR|O_EXCL;
bttc->fd = open(bttc->path, open_flags);
if (bttc->fd < 0) {
err(bttc, "unable to open %s: %s\n",
bttc->path, strerror(errno));
rc = -errno;
goto out_sb;
}
	/*
	 * This is where we land if a SIGBUS is received; the jump buffer is
	 * set up before any mmapped reads are done, so we can safely abort
	 */
if (sigsetjmp(sj_env, 1)) {
err(bttc, "Received a SIGBUS\n");
err(bttc,
"Metadata corruption found, recovery is not possible\n");
rc = -EFAULT;
goto out_close;
}
/* Try reading a BTT1 info block first */
rc = btt_info_read_verify(bttc, btt_sb, BTT1_START_OFFSET);
if (rc == 0)
bttc->start_off = BTT1_START_OFFSET;
if (rc) {
/* Try reading a BTT2 info block */
rc = btt_info_read_verify(bttc, btt_sb, BTT2_START_OFFSET);
if (rc == 0)
bttc->start_off = BTT2_START_OFFSET;
if (rc) {
rc = btt_recover_first_sb(bttc);
if (rc) {
err(bttc, "Unable to recover any BTT info blocks\n");
goto out_close;
}
/*
* btt_recover_first_sb will have set bttc->start_off
* based on the version it found
*/
rc = btt_info_read_verify(bttc, btt_sb, bttc->start_off);
if (rc)
goto out_close;
}
}
rc = btt_discover_arenas(bttc);
if (rc)
goto out_close;
rc = btt_create_mappings(bttc);
if (rc)
goto out_close;
for (i = 0; i < bttc->num_arenas; i++) {
rc = log_set_indices(&bttc->arena[i]);
if (rc) {
err(bttc,
"Unable to deduce log/padding indices\n");
goto out_close;
}
}
rc = btt_check_arenas(bttc);
btt_remove_mappings(bttc);
out_close:
close(bttc->fd);
out_sb:
free(btt_sb);
out_ns:
ndctl_namespace_set_raw_mode(ndns, raw_mode);
ndctl_namespace_disable_invalidate(ndns);
if (disabled_flag)
		if (ndctl_namespace_enable(ndns) < 0)
err(bttc, "%s: failed to re-enable namespace\n",
devname);
out_bttc:
free(bttc);
return rc;
}