blob: ec57687c9a4d8a079466d6e9f95724188ec03bc7 [file] [log] [blame]
/*
* Copyright (C) 2011, 2012 STRATO. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
/*
* This is only the first step towards a full-featured scrub. It reads all
* extents and super blocks and verifies the checksums. In case a bad checksum
* is found or the extent cannot be read, good data will be written back if
* any can be found.
*
* Future enhancements:
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
*/
/*
 * Forward declarations: both types are referenced through pointers by the
 * structures below before their full definitions appear.
 */
struct scrub_block;
struct scrub_ctx;
/*
 * The following three values only influence performance.
 * SCRUB_BIOS_PER_SCTX configures the number of parallel and outstanding I/O
 * operations. SCRUB_PAGES_PER_RD_BIO and SCRUB_PAGES_PER_WR_BIO configure an
 * upper limit for the number of (dynamically allocated) pages that are added
 * to a read bio and a write bio, respectively.
 */
#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 * It also sizes the pagev[] array of struct scrub_block.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
/*
 * Reference-counted recovery context that a scrub_page can point to through
 * its "recover" member.
 */
struct scrub_recover {
/* reference count, see scrub_get_recover() and scrub_put_recover() */
atomic_t refs;
/* released with btrfs_put_bbio() when the last reference is dropped */
struct btrfs_bio *bbio;
/* NOTE(review): presumably the length covered by the bbio mapping - confirm */
u64 map_length;
};
/*
 * State kept for a single page that is part of a scrub_block: the page
 * itself, the device it was read from and its logical/physical addresses.
 */
struct scrub_page {
/* the block this page belongs to */
struct scrub_block *sblock;
struct page *page;
struct btrfs_device *dev;
struct list_head list;
u64 flags; /* extent flags */
u64 generation;
u64 logical;
u64 physical;
u64 physical_for_dev_replace;
/* NOTE(review): presumably managed by scrub_page_get()/scrub_page_put() */
atomic_t refs;
struct {
/* 1-based: the error handler BUGs on values below 1 */
unsigned int mirror_num:8;
/* NOTE(review): presumably set when csum[] below is valid - confirm */
unsigned int have_csum:1;
/* consulted by the repair code to pick pages without I/O error */
unsigned int io_error:1;
};
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
};
/*
 * One scrub I/O unit: a bio plus the scrub pages attached to it.
 * scrub_setup_ctx() allocates SCRUB_BIOS_PER_SCTX of these per scrub context
 * and chains them into a free list through next_free.
 */
struct scrub_bio {
/* position of this entry in scrub_ctx->bios[] */
int index;
/* owning scrub context */
struct scrub_ctx *sctx;
struct btrfs_device *dev;
struct bio *bio;
int err;
u64 logical;
u64 physical;
/* the array is sized for the larger of the read and write page limits */
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
#else
struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
/* number of entries of pagev[] in use */
int page_count;
/* index of the next free scrub_bio, -1 for the last one in the list */
int next_free;
/* initialized by scrub_setup_ctx() to run scrub_bio_end_io_worker() */
struct btrfs_work work;
};
/*
 * A block that is scrubbed as one unit. It consists of up to
 * SCRUB_MAX_PAGES_PER_BLOCK scrub pages and records the result of verifying
 * them in the error bits below.
 */
struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
/* number of entries of pagev[] in use */
int page_count;
atomic_t outstanding_pages;
atomic_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
unsigned int header_error:1;
unsigned int checksum_error:1;
unsigned int no_io_error_seen:1;
unsigned int generation_error:1; /* also sets header_error */
/* The following is for the data used to check parity */
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
};
/* Used for chunks with a parity stripe, such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx *sctx;
	struct btrfs_device *scrub_dev;
	u64 logic_start;
	u64 logic_end;
	int nsectors;
	int stripe_len;
	atomic_t refs;
	struct list_head spages;
	/* Work of parity check and repair */
	struct btrfs_work work;
	/* Mark the parity blocks which have data */
	unsigned long *dbitmap;
	/*
	 * Mark the parity blocks which have data, but where errors happened
	 * when reading or checking that data
	 */
	unsigned long *ebitmap;
	/*
	 * Variable-length storage allocated together with the structure.
	 * Declared as a C99 flexible array member instead of the GNU
	 * zero-length array extension; sizeof(struct scrub_parity) is
	 * unchanged.
	 * NOTE(review): presumably dbitmap and ebitmap point into this
	 * storage - confirm against the allocation site.
	 */
	unsigned long bitmap[];
};
/*
 * Write-side state, embedded in struct scrub_ctx as "wr_ctx".
 * scrub_setup_ctx() initializes it through scrub_setup_wr_ctx() and
 * scrub_free_ctx() tears it down through scrub_free_wr_ctx().
 */
struct scrub_wr_ctx {
/* NOTE(review): presumably the write bio currently being filled - confirm */
struct scrub_bio *wr_curr_bio;
/* NOTE(review): presumably the dev-replace target device - confirm */
struct btrfs_device *tgtdev;
int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
atomic_t flush_all_writes;
struct mutex wr_lock;
};
/*
 * Per-scrub context. Allocated and initialized by scrub_setup_ctx(),
 * reference counted through "refs" and freed by scrub_free_ctx() once the
 * last reference is dropped in scrub_put_ctx().
 */
struct scrub_ctx {
/* preallocated bios, chained into a free list via scrub_bio->next_free */
struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
struct btrfs_root *dev_root;
/* index of the first free entry in bios[] */
int first_free;
/* index of the bio currently in use, -1 if there is none */
int curr;
/* counted by scrub_pending_bio_inc() / scrub_pending_bio_dec() */
atomic_t bios_in_flight;
/* counted by scrub_pending_trans_workers_inc() / _dec() */
atomic_t workers_pending;
spinlock_t list_lock;
/* woken up whenever bios_in_flight or workers_pending is decremented */
wait_queue_head_t list_wait;
/* taken from the super block copy via btrfs_super_csum_size() */
u16 csum_size;
/* list of struct btrfs_ordered_sum, emptied by scrub_free_csums() */
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
/* min(SCRUB_PAGES_PER_RD_BIO, bio_get_nr_vecs()) if the device has a bdev */
int pages_per_rd_bio;
u32 sectorsize;
u32 nodesize;
int is_dev_replace;
struct scrub_wr_ctx wr_ctx;
/*
 * statistics, updated under stat_lock
 */
struct btrfs_scrub_progress stat;
spinlock_t stat_lock;
/*
 * Use a ref counter to avoid use-after-free issues. Scrub workers
 * decrement bios_in_flight and workers_pending and then do a wakeup
 * on the list_wait wait queue. We must ensure the main scrub task
 * doesn't free the scrub context before or while the workers are
 * doing the wakeup() call.
 */
atomic_t refs;
};
/*
 * Work item for repairing data that has no checksum. It is allocated and
 * queued by scrub_handle_errored_block() and freed by the worker function
 * scrub_fixup_nodatasum() once it is done.
 */
struct scrub_fixup_nodatasum {
struct scrub_ctx *sctx;
/* device the failed block was read from, used for the error message */
struct btrfs_device *dev;
/* logical address of the failed block */
u64 logical;
/* set to fs_info->extent_root by the error handler */
struct btrfs_root *root;
struct btrfs_work work;
/* 1-based number of the mirror that failed */
int mirror_num;
};
/*
 * One (inode number, file offset, root) triple, the same triple that
 * copy_nocow_pages_for_inode() takes as arguments.
 * NOTE(review): presumably linked into scrub_copy_nocow_ctx->inodes through
 * "list" - confirm against the code that fills the list.
 */
struct scrub_nocow_inode {
u64 inum;
u64 offset;
u64 root;
struct list_head list;
};
/*
 * Work item carrying the arguments of copy_nocow_pages() (logical, len,
 * mirror_num, physical_for_dev_replace) together with the scrub context.
 * NOTE(review): presumably processed by copy_nocow_pages_worker() - confirm.
 */
struct scrub_copy_nocow_ctx {
struct scrub_ctx *sctx;
u64 logical;
u64 len;
int mirror_num;
u64 physical_for_dev_replace;
/* NOTE(review): presumably a list of struct scrub_nocow_inode - confirm */
struct list_head inodes;
struct btrfs_work work;
};
/*
 * Context handed from scrub_print_warning() to the per-inode callback
 * scrub_print_warning_inode() when reporting an error on a data extent.
 */
struct scrub_warning {
/* path reused by the callback for its own tree searches */
struct btrfs_path *path;
/* set from the offset of the extent item key that was found */
u64 extent_item_size;
/* short description of the error, printed first in each message */
const char *errstr;
/* physical byte offset of the first page shifted right by 9 */
sector_t sector;
/* logical address of the first page of the block */
u64 logical;
/* device of the first page; only set for the data extent case */
struct btrfs_device *dev;
};
/*
 * Prototypes of the file-local helpers, so that they can be used before
 * their definitions appear further down.
 */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
const u8 *csum, u64 generation,
u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
u64 extent_logical, u64 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num);
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
struct scrub_wr_ctx *wr_ctx,
struct btrfs_fs_info *fs_info,
struct btrfs_device *dev,
int is_dev_replace);
static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio, int err);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
/*
 * Account for one more bio in flight on @sctx. A reference on the context is
 * taken as well; scrub_pending_bio_dec() undoes both.
 */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
/* take the context reference before the bio is counted as in flight */
atomic_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
/*
 * Counterpart of scrub_pending_bio_inc(): one bio less is in flight. Waiters
 * on sctx->list_wait are woken up, and only afterwards is the context
 * reference dropped, so @sctx is still valid during the wake_up() call.
 */
static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
atomic_dec(&sctx->bios_in_flight);
wake_up(&sctx->list_wait);
/* may free @sctx if this was the last reference */
scrub_put_ctx(sctx);
}
/*
 * Sleep for as long as a scrub pause is requested.
 *
 * fs_info->scrub_lock is released around every sleep and re-acquired
 * afterwards; the pause request is then checked again because it may have
 * been raised once more in the meantime. The caller is expected to hold
 * scrub_lock on entry (see scrub_blocked_if_needed()).
 */
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	for (;;) {
		if (!atomic_read(&fs_info->scrub_pause_req))
			break;

		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   !atomic_read(&fs_info->scrub_pause_req));
		mutex_lock(&fs_info->scrub_lock);
	}
}
/*
 * Mark this scrub as paused, wait until no pause is requested any more and
 * then mark it as running again. Waiters on fs_info->scrub_pause_wait are
 * woken up after each change of the scrubs_paused counter.
 */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
mutex_lock(&fs_info->scrub_lock);
/* drops and re-takes scrub_lock while sleeping */
__scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
}
/*
 * Used for workers that require transaction commits (i.e., for the
 * NOCOW case). Takes a reference on @sctx and counts the worker as pending;
 * scrub_pending_trans_workers_dec() undoes all of it.
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
atomic_inc(&sctx->refs);
/*
 * Increment scrubs_running to prevent cancel requests from
 * completing as long as a worker is running. We must also
 * increment scrubs_paused to prevent deadlocking on pause
 * requests used for transaction commits (as the worker uses a
 * transaction context). It is safe to regard the worker
 * as paused for all practical purposes. Effectively, we only
 * keep cancellation requests from completing.
 */
mutex_lock(&fs_info->scrub_lock);
atomic_inc(&fs_info->scrubs_running);
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
/*
 * The "@scrubs_running == @scrubs_paused" check inside wait_event()
 * is not an atomic operation, and both counters may be incremented or
 * decremented at any time. Wake up @scrub_pause_wait as often as we
 * can so that a transaction commit is blocked for as short as
 * possible.
 */
wake_up(&fs_info->scrub_pause_wait);
atomic_inc(&sctx->workers_pending);
}
/*
 * Used for workers that require transaction commits. Counterpart of
 * scrub_pending_trans_workers_inc(): drops the running/paused accounting,
 * the pending worker count and finally the context reference.
 */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
/*
 * see scrub_pending_trans_workers_inc() why we're pretending
 * to be paused in the scrub counters
 */
mutex_lock(&fs_info->scrub_lock);
atomic_dec(&fs_info->scrubs_running);
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
atomic_dec(&sctx->workers_pending);
wake_up(&fs_info->scrub_pause_wait);
wake_up(&sctx->list_wait);
/* drop the reference last: @sctx may be freed by this call */
scrub_put_ctx(sctx);
}
/*
 * Release every checksum entry that is still queued on sctx->csum_list.
 * Each entry is unlinked from the list before it is freed.
 */
static void scrub_free_csums(struct scrub_ctx *sctx)
{
	struct list_head *head = &sctx->csum_list;
	struct btrfs_ordered_sum *entry;

	for (;;) {
		if (list_empty(head))
			break;

		entry = list_first_entry(head, struct btrfs_ordered_sum, list);
		list_del(&entry->list);
		kfree(entry);
	}
}
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
int i;
if (!sctx)
return;
scrub_free_wr_ctx(&sctx->wr_ctx);
/* this can happen when scrub is cancelled */
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
for (i = 0; i < sbio->page_count; i++) {
WARN_ON(!sbio->pagev[i]->page);
scrub_block_put(sbio->pagev[i]->sblock);
}
bio_put(sbio->bio);
}
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio = sctx->bios[i];
if (!sbio)
break;
kfree(sbio);
}
scrub_free_csums(sctx);
kfree(sctx);
}
/*
 * Drop one reference on @sctx. The context is freed when the reference
 * count reaches zero.
 */
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (!atomic_dec_and_test(&sctx->refs))
		return;

	scrub_free_ctx(sctx);
}
/*
 * Allocate and initialize a scrub context for @dev.
 *
 * All SCRUB_BIOS_PER_SCTX scrub bios are preallocated and chained into a
 * free list, and the write context is set up for the dev-replace target
 * device.
 *
 * Returns the new context holding one reference, ERR_PTR(-ENOMEM) if an
 * allocation fails, or the error of scrub_setup_wr_ctx() wrapped in
 * ERR_PTR().
 */
static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	struct scrub_ctx *sctx;
	int pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	int idx;
	int err;

	/*
	 * the setting of pages_per_rd_bio is correct for scrub but might
	 * be wrong for the dev_replace code where we might read from
	 * different devices in the initial huge bios. However, that
	 * code is able to correctly handle the case when adding a page
	 * to a bio fails.
	 */
	if (dev->bdev)
		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
					 bio_get_nr_vecs(dev->bdev));

	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
	if (!sctx)
		goto nomem;

	atomic_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = pages_per_rd_bio;
	/* no current bio yet; scrub_free_ctx() relies on this marker */
	sctx->curr = -1;
	sctx->dev_root = dev->dev_root;

	idx = 0;
	while (idx < SCRUB_BIOS_PER_SCTX) {
		struct scrub_bio *sbio = kzalloc(sizeof(*sbio), GFP_NOFS);

		if (!sbio)
			goto nomem;

		sctx->bios[idx] = sbio;
		sbio->index = idx;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
				scrub_bio_end_io_worker, NULL, NULL);

		/* link to the successor, the last entry ends the free list */
		sbio->next_free = (idx == SCRUB_BIOS_PER_SCTX - 1) ?
				  -1 : idx + 1;
		idx++;
	}

	sctx->first_free = 0;
	sctx->nodesize = dev->dev_root->nodesize;
	sctx->sectorsize = dev->dev_root->sectorsize;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	err = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
				 fs_info->dev_replace.tgtdev, is_dev_replace);
	if (!err)
		return sctx;

	scrub_free_ctx(sctx);
	return ERR_PTR(err);

nomem:
	/* sctx may be NULL or partially set up here, both are handled */
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}
/*
 * Callback for iterate_extent_inodes(): print one warning line for every
 * path that leads to inode @inum in tree @root.
 *
 * @warn_ctx is the struct scrub_warning prepared by scrub_print_warning().
 * If the root, the inode item or the paths cannot be resolved, a single
 * line reporting the failure is printed instead.
 *
 * Always returns 0.
 */
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *local_root;
	struct extent_buffer *leaf;
	struct btrfs_key root_key;
	struct btrfs_key inode_key;
	u64 isize;
	u64 length;
	u32 nlink;
	int idx;
	int ret;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	inode_key.objectid = inum;
	inode_key.type = BTRFS_INODE_ITEM_KEY;
	inode_key.offset = 0;
	ret = btrfs_search_slot(NULL, local_root, &inode_key, swarn->path,
				0, 0);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	leaf = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(leaf, inode_item);
	nlink = btrfs_inode_nlink(leaf, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		/* keep the error pointer away from free_ipath() */
		ipath = NULL;
		goto err;
	}

	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/* the reported length is the same for every path of the inode */
	length = min(isize - offset, (u64)PAGE_SIZE);

	/*
	 * we deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here
	 */
	idx = 0;
	while (idx < ipath->fspath->elem_cnt) {
		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			length, nlink,
			(char *)(unsigned long)ipath->fspath->val[idx]);
		idx++;
	}
	goto out;

err:
	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);
out:
	free_ipath(ipath);
	return 0;
}
/*
 * Print a warning describing what the errored block @sblock belongs to.
 *
 * The extent item covering the logical address of the first page is looked
 * up. For a tree block, one line is printed per tree backref that is found;
 * for a data extent, scrub_print_warning_inode() is called for every inode
 * that references it. Nothing is printed if the path cannot be allocated or
 * the extent cannot be found.
 *
 * @errstr: short description of the error, printed first in each line
 * @sblock: the block that failed; must contain at least one page
 */
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root = 0;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->dev_root->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	/* the sector is the physical byte offset in units of 512 bytes */
	swarn.sector = (sblock->pagev[0]->physical) >> 9;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		/*
		 * Walk the tree backrefs one by one. This assumes the return
		 * convention the original loop was written against: 0 means
		 * a backref was stored in ref_root/ref_level, a positive
		 * value means there are no more backrefs and a negative
		 * value is an error.
		 *
		 * Only a found backref is reported as such. The walk stops
		 * on an error instead of retrying forever, and neither the
		 * error nor the end-of-list case prints ref_root/ref_level,
		 * which hold no new value then.
		 */
		for (;;) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret > 0)
				break;
			if (ret < 0) {
				printk_in_rcu(KERN_WARNING
					"BTRFS: %s at logical %llu on dev %s, "
					"sector %llu: metadata backref "
					"resolving failed with ret=%d\n",
					errstr, swarn.logical,
					rcu_str_deref(dev->name),
					(unsigned long long)swarn.sector,
					ret);
				break;
			}
			printk_in_rcu(KERN_WARNING
				"BTRFS: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		/* the callback reuses the path, so release it first */
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
}
/*
 * Callback for iterate_inodes_from_logical(): try to repair the page at
 * @offset of inode @inum in tree @root. @fixup_ctx is the
 * struct scrub_fixup_nodatasum describing the failed sector.
 *
 * If the page is already up to date in the page cache and clean, it is
 * written to the failed mirror with repair_io_failure(). If it is not up to
 * date, it is read from the failed mirror so that the regular read path
 * performs the repair.
 *
 * Returns 1 when the sector was corrected, which makes the iteration stop,
 * or a negative errno (-EIO when nothing was corrected) otherwise.
 */
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	/* the page is locked from here on until it is explicitly unlocked */
	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 * runner to write out that page (might involve
			 * COW) and see whether the sector is still
			 * referenced afterwards.
			 *
			 * For the meantime, we'll treat this error
			 * incorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory, then.
			 */
			ret = -EIO;
			/* do not leave the page locked on this error path */
			unlock_page(page);
			goto out;
		}
		ret = repair_io_failure(inode, offset, PAGE_SIZE,
					fixup->logical, page,
					offset - page_offset(page),
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
				      EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			/* no read was submitted, so unlock the page here */
			unlock_page(page);
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
					    btrfs_get_extent,
					    fixup->mirror_num);
		wait_on_page_locked(page);

		/* the sector counts as corrected once the bit is gone */
		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					    end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					  EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);

	iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}
/*
 * Worker that tries to repair a failed sector of data without checksum.
 *
 * @work is embedded in a struct scrub_fixup_nodatasum. The worker joins a
 * transaction, runs scrub_fixup_readpage() on the inodes referencing the
 * logical address and updates the scrub statistics with the outcome. It
 * frees the work item and drops the pending-worker accounting at the end.
 */
static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	struct scrub_fixup_nodatasum *fixup =
		container_of(work, struct scrub_fixup_nodatasum, work);
	struct scrub_ctx *sctx = fixup->sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	/* assume failure until the repair is known to have worked */
	int failed = 1;
	int ret;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans))
		goto out;

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
					  path, scrub_fixup_readpage,
					  fixup);
	if (ret < 0)
		goto out;
	WARN_ON(ret != 1);

	failed = 0;
	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	/* trans is NULL or an error pointer if no transaction was joined */
	if (trans != NULL && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);

	if (failed) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&sctx->dev_root->fs_info->dev_replace.
			num_uncorrectable_read_errors);
		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			fixup->logical, rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}
/* Take one more reference on @recover; paired with scrub_put_recover(). */
static inline void scrub_get_recover(struct scrub_recover *recover)
{
atomic_inc(&recover->refs);
}
/*
 * Drop one reference on @recover. When the last reference goes away, the
 * attached bbio is put and the structure itself is freed.
 */
static inline void scrub_put_recover(struct scrub_recover *recover)
{
	if (!atomic_dec_and_test(&recover->refs))
		return;

	btrfs_put_bbio(recover->bbio);
	kfree(recover);
}
/*
* scrub_handle_errored_block gets called when either verification of the
* pages failed or the bio failed to read, e.g. with EIO. In the latter
* case, this function handles all pages in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
*/
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
struct btrfs_device *dev;
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
u64 generation;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
u8 *csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
int page_num;
int success;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
fs_info = sctx->dev_root->fs_info;
if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
return 0;
}
length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
generation = sblock_to_check->pagev[0]->generation;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
csum = sblock_to_check->pagev[0]->csum;
dev = sblock_to_check->pagev[0]->dev;
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
sblocks_for_recheck = NULL;
goto nodatasum_case;
}
/*
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
* page by page this time in order to know which pages
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
* pages from those mirrors without I/O error on the
* particular pages. One example (with blocks >= 2 * PAGE_SIZE)
* would be that mirror #1 has an I/O error on the first page,
* the second page is good, and mirror #2 has an I/O error on
* the second page, but the first page is good.
* Then the first page of the first mirror can be repaired by
* taking the first page of the second mirror, and the
* second page of the second mirror can be repaired by
* copying the contents of the 2nd page of the 1st mirror.
* One more note: if the pages of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
* Only if this is not possible, the pages are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
* the statistics.
*/
sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
sizeof(*sblocks_for_recheck),
GFP_NOFS);
if (!sblocks_for_recheck) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
/* setup the context, map the logical blocks and alloc the pages */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
csum, generation, sctx->csum_size, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
* the error disappeared after reading page by page, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
* different bio (usually one of the two latter cases is
* the cause)
*/
spin_lock(&sctx->stat_lock);
sctx->stat.unverified_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
if (sctx->is_dev_replace)
scrub_write_block_to_dev_replace(sblock_bad);
goto out;
}
if (!sblock_bad->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
} else if (sblock_bad->header_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
else
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
if (sctx->readonly) {
ASSERT(!sctx->is_dev_replace);
goto out;
}
if (!is_metadata && !have_csum) {
struct scrub_fixup_nodatasum *fixup_nodatasum;
WARN_ON(sctx->is_dev_replace);
nodatasum_case:
/*
* !is_metadata and !have_csum, this means that the data
* might not be COW'ed, that it might be modified
* concurrently. The general strategy to work on the
* commit root does not help in the case when COW is not
* used.
*/
fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
if (!fixup_nodatasum)
goto did_not_correct_error;
fixup_nodatasum->sctx = sctx;
fixup_nodatasum->dev = dev;
fixup_nodatasum->logical = logical;
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
scrub_fixup_nodatasum, NULL, NULL);
btrfs_queue_work(fs_info->scrub_workers,
&fixup_nodatasum->work);
goto out;
}
/*
* now build and submit the bios for the other mirrors, check
* checksums.
* First try to pick the mirror which is completely without I/O
* errors and also does not have a checksum error.
* If one is found, and if a checksum is present, the full block
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
* checksum is present, only those pages are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
* determined, which copy of the other pages is better (and it
* could happen otherwise that a correct page would be
* overwritten by a bad one).
*/
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index].page_count > 0;
mirror_index++) {
struct scrub_block *sblock_other;
if (mirror_index == failed_mirror_index)
continue;
sblock_other = sblocks_for_recheck + mirror_index;
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, is_metadata,
have_csum, csum, generation,
sctx->csum_size, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
sblock_other->no_io_error_seen) {
if (sctx->is_dev_replace) {
scrub_write_block_to_dev_replace(sblock_other);
goto corrected_error;
} else {
ret = scrub_repair_block_from_good_copy(
sblock_bad, sblock_other);
if (!ret)
goto corrected_error;
}
}
}
if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
goto did_not_correct_error;
/*
* In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those pages.
* Select the good pages from mirrors to rewrite bad pages from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
* all possible combinations of pages from the different mirrors
* until the checksum verification succeeds. For example, when
* the 2nd page of mirror #1 faces I/O errors, and the 2nd page
* of mirror #2 is readable but the final checksum test fails,
* then the 2nd page of mirror #3 could be tried, whether now
* the final checksum succeedes. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
* disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
* mirror, even if other 512 byte sectors in the same PAGE_SIZE
* area are unreadable.
*/
success = 1;
for (page_num = 0; page_num < sblock_bad->page_count;
page_num++) {
struct scrub_page *page_bad = sblock_bad->pagev[page_num];
struct scrub_block *sblock_other = NULL;
/* skip no-io-error page in scrub */
if (!page_bad->io_error && !sctx->is_dev_replace)
continue;
/* try to find no-io-error page in mirrors */
if (page_bad->io_error) {
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index].page_count > 0;
mirror_index++) {
if (!sblocks_for_recheck[mirror_index].
pagev[page_num]->io_error) {
sblock_other = sblocks_for_recheck +
mirror_index;
break;
}
}
if (!sblock_other)
success = 0;
}
if (sctx->is_dev_replace) {
/*
* did not find a mirror to fetch the page
* from. scrub_write_page_to_dev_replace()
* handles this case (page->io_error), by
* filling the block with zeros before
* submitting the write request
*/
if (!sblock_other)
sblock_other = sblock_bad;
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
btrfs_dev_replace_stats_inc(
&sctx->dev_root->
fs_info->dev_replace.
num_write_errors);
success = 0;
}
} else if (sblock_other) {
ret = scrub_repair_page_from_good_copy(sblock_bad,
sblock_other,
page_num, 0);
if (0 == ret)
page_bad->io_error = 0;
else
success = 0;
}
}
if (success && !sctx->is_dev_replace) {
if (is_metadata || have_csum) {
/*
* need to verify the checksum now that all
* sectors on disk are repaired (the write
* request for data to be repaired is on its way).
* Just be lazy and use scrub_recheck_block()
* which re-reads the data before the checksum
* is verified, but most likely the data comes out
* of the page cache.
*/
scrub_recheck_block(fs_info, sblock_bad,
is_metadata, have_csum, csum,
generation, sctx->csum_size, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
goto corrected_error;
else
goto did_not_correct_error;
} else {
corrected_error:
spin_lock(&sctx->stat_lock);
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
"BTRFS: fixed up error at logical %llu on dev %s\n",
logical, rcu_str_deref(dev->name));
}
} else {
did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
logical, rcu_str_deref(dev->name));
}
out:
if (sblocks_for_recheck) {
for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
struct scrub_recover *recover;
int page_index;
for (page_index = 0; page_index < sblock->page_count;
page_index++) {
sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
scrub_put_recover(recover);
sblock->pagev[page_index]->recover =
NULL;
}
scrub_page_put(sblock->pagev[page_index]);
}
}
kfree(sblocks_for_recheck);
}
return 0;
}
/*
 * Number of mirrors to try for the chunk mapped by @bbio: 2 for RAID5,
 * 3 for RAID6, and one per mapped stripe for every other profile.
 */
static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;
	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		return 3;
	return (int)bbio->num_stripes;
}
/*
 * Work out which stripe of a mapping to read and the byte offset inside it.
 *
 * For RAID5/6 mappings the first @nstripes entries of @raid_map are searched
 * for the non-parity stripe whose range
 * [raid_map[i], raid_map[i] + @mapped_length) contains @logical; @mirror is
 * not looked at.  For all other profiles the stripe index is simply @mirror
 * and the offset is 0.
 *
 * Results are stored through @stripe_index and @stripe_offset.
 */
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
						 u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int idx;

	if (!(map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
		return;
	}

	/* RAID5/6 */
	for (idx = 0; idx < nstripes; idx++) {
		if (raid_map[idx] == RAID6_Q_STRIPE ||
		    raid_map[idx] == RAID5_P_STRIPE)
			continue;

		if (logical >= raid_map[idx] &&
		    logical < raid_map[idx] + mapped_length)
			break;
	}

	/*
	 * NOTE(review): when no data stripe matched, idx == nstripes here and
	 * raid_map[idx] is still read -- confirm callers guarantee a match.
	 */
	*stripe_index = idx;
	*stripe_offset = logical - raid_map[idx];
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck)
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
int page_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
* note: the two members refs and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
while (length > 0) {
sublen = min_t(u64, length, PAGE_SIZE);
mapped_length = sublen;
bbio = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
&mapped_length, &bbio, 0, 1);
if (ret || !bbio || mapped_length < sublen) {
btrfs_put_bbio(bbio);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
btrfs_put_bbio(bbio);
return -ENOMEM;
}
atomic_set(&recover->refs, 1);
recover->bbio = bbio;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
page = kzalloc(sizeof(*page), GFP_NOFS);
if (!page) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(recover);
return -ENOMEM;
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
page->logical = logical;
scrub_stripe_index_and_offset(logical,
bbio->map_type,
bbio->raid_map,
mapped_length,
bbio->num_stripes -
bbio->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
page->physical = bbio->stripes[stripe_index].physical +
stripe_offset;
page->dev = bbio->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
page->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
page->mirror_num = mirror_index + 1;
sblock->page_count++;
page->page = alloc_page(GFP_NOFS);
if (!page->page)
goto leave_nomem;
scrub_get_recover(recover);
page->recover = recover;
}
scrub_put_recover(recover);
length -= sublen;
logical += sublen;
page_index++;
}
return 0;
}
/*
* Context for a bio that is submitted and then waited for synchronously;
* a pointer to it is stored in bio->bi_private by the submitter.
*/
struct scrub_bio_ret {
/* completed by scrub_bio_wait_endio() when the bio has ended */
struct completion event;
/* error value handed to the end_io callback */
int error;
};
static void scrub_bio_wait_endio(struct bio *bio, int error)
{
struct scrub_bio_ret *ret = bio->bi_private;
ret->error = error;
complete(&ret->event);
}
static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
return page->recover &&
(page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_page *page)
{
struct scrub_bio_ret done;
int ret;
init_completion(&done.event);
done.error = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
page->recover->map_length,
page->mirror_num, 0);
if (ret)
return ret;
wait_for_completion(&done.event);
if (done.error)
return -EIO;
return 0;
}
/*
 * this function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad. The goal is to enable scrub
 * to take those pages that are not errored from all the mirrors so that
 * the pages that are errored in the just handled mirror can be repaired.
 *
 * Every page of @sblock is read synchronously with its own single-page bio.
 * Pages on a RAID5/6 mapping go through scrub_submit_raid56_bio_wait()
 * unless @retry_failed_mirror is set, in which case (like for all other
 * pages) the page's physical location is read directly.
 *
 * On return sblock->no_io_error_seen tells whether all reads succeeded and
 * page->io_error is set on each page that could not be read.  The checksum
 * and header are only verified (via scrub_recheck_block_checksum()) when no
 * read failed.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size, int retry_failed_mirror)
{
	int page_num;

	sblock->no_io_error_seen = 1;
	sblock->header_error = 0;
	sblock->checksum_error = 0;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct bio *bio;
		struct scrub_page *page = sblock->pagev[page_num];
		int read_failed;

		/* no block device to read from: treat as a read error */
		if (page->dev->bdev == NULL) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		WARN_ON(!page->page);
		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
		if (!bio) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}
		bio->bi_bdev = page->dev->bdev;

		bio_add_page(bio, page->page, PAGE_SIZE, 0);
		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
			read_failed = scrub_submit_raid56_bio_wait(fs_info,
								   bio, page);
		} else {
			bio->bi_iter.bi_sector = page->physical >> 9;
			read_failed = btrfsic_submit_bio_wait(READ, bio);
		}

		/*
		 * Flag the page itself, not only the block: the page-wise
		 * repair picks replacement pages by looking at ->io_error
		 * and would otherwise treat an unreadable page as good.
		 */
		if (read_failed) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
		}

		bio_put(bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
					     have_csum, csum, generation,
					     csum_size);
}
/*
 * Returns 1 if @fsid equals the fsid of the fs_devices that @spage's device
 * belongs to (compared over BTRFS_UUID_SIZE bytes), 0 otherwise.
 */
static inline int scrub_check_fsid(u8 fsid[],
				   struct scrub_page *spage)
{
	return memcmp(fsid, spage->dev->fs_devices->fsid,
		      BTRFS_UUID_SIZE) == 0;
}
/*
 * Verify the header (metadata only) and the checksum of an already read
 * recheck block and record the outcome in sblock->header_error,
 * sblock->generation_error and sblock->checksum_error.
 *
 * For metadata the expected checksum is taken from the tree block header in
 * the first page and the header's bytenr, fsid, chunk tree uuid and
 * generation are compared against the expected values.  For data the caller
 * supplied @csum is used; without a checksum (@have_csum == 0) nothing is
 * checked.
 *
 * The checksum is computed over PAGE_SIZE bytes of every page of the block,
 * skipping the first BTRFS_CSUM_SIZE bytes of a metadata block.
 */
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size)
{
	int page_num;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	void *mapped_buffer;

	WARN_ON(!sblock->pagev[0]->page);
	if (is_metadata) {
		struct btrfs_header *h;

		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
		h = (struct btrfs_header *)mapped_buffer;

		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
		    !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
			   BTRFS_UUID_SIZE)) {
			sblock->header_error = 1;
		} else if (generation != btrfs_stack_header_generation(h)) {
			sblock->header_error = 1;
			sblock->generation_error = 1;
		}

		/*
		 * Copy the on-disk checksum while the page is still mapped.
		 * The mapping is torn down in the loop below, long before
		 * the final comparison, so a pointer into it must not be
		 * kept.
		 */
		memcpy(on_disk_csum, h->csum, csum_size);
		csum = on_disk_csum;
	} else {
		if (!have_csum)
			return;

		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
	}

	/* each pass consumes the currently mapped page, then maps the next */
	for (page_num = 0;;) {
		if (page_num == 0 && is_metadata)
			crc = btrfs_csum_data(
				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
		else
			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);

		kunmap_atomic(mapped_buffer);
		page_num++;
		if (page_num >= sblock->page_count)
			break;
		WARN_ON(!sblock->pagev[page_num]->page);

		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, csum, csum_size))
		sblock->checksum_error = 1;
}
/*
 * Overwrite every page of @sblock_bad with the corresponding page of
 * @sblock_good (forced write).  All pages are attempted even if one fails.
 *
 * Returns 0 if all page writes succeeded, otherwise the error of the last
 * page that failed.
 */
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good)
{
	int result = 0;
	int idx;

	for (idx = 0; idx < sblock_bad->page_count; idx++) {
		int err = scrub_repair_page_from_good_copy(sblock_bad,
							   sblock_good,
							   idx, 1);

		if (err)
			result = err;
	}

	return result;
}
/*
 * Write page @page_num of @sblock_good over the on-disk location of the same
 * page of @sblock_bad.
 *
 * The write is only issued if @force_write is set, the bad block has a
 * header or checksum error, or the bad page itself has an I/O error;
 * otherwise nothing is done and 0 is returned.
 *
 * Returns 0 on success (or when no write was needed) and -EIO if the target
 * has no block device, the bio cannot be set up, or the write fails.  A
 * failed write is also accounted in the device and dev-replace write error
 * statistics.
 */
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write)
{
	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
	struct scrub_page *page_good = sblock_good->pagev[page_num];
	struct bio *bio;
	int err = 0;

	BUG_ON(page_bad->page == NULL);
	BUG_ON(page_good->page == NULL);

	/* nothing wrong with this page and no write requested */
	if (!force_write && !sblock_bad->header_error &&
	    !sblock_bad->checksum_error && !page_bad->io_error)
		return 0;

	if (!page_bad->dev->bdev) {
		printk_ratelimited(KERN_WARNING "BTRFS: "
			"scrub_repair_page_from_good_copy(bdev == NULL) "
			"is unexpected!\n");
		return -EIO;
	}

	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;

	/* good data, bad page's location */
	bio->bi_bdev = page_bad->dev->bdev;
	bio->bi_iter.bi_sector = page_bad->physical >> 9;

	if (bio_add_page(bio, page_good->page, PAGE_SIZE, 0) != PAGE_SIZE) {
		bio_put(bio);
		return -EIO;
	}

	if (btrfsic_submit_bio_wait(WRITE, bio)) {
		btrfs_dev_stat_inc_and_print(page_bad->dev,
			BTRFS_DEV_STAT_WRITE_ERRS);
		btrfs_dev_replace_stats_inc(
			&sblock_bad->sctx->dev_root->fs_info->
			dev_replace.num_write_errors);
		err = -EIO;
	}
	bio_put(bio);

	return err;
}
/*
 * Queue all pages of @sblock for writing to the dev-replace target.  Each
 * page that cannot be queued bumps the dev-replace write error counter.
 */
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
	int idx;

	/*
	 * This block is used for the check of the parity on the source device,
	 * so the data needn't be written into the destination device.
	 */
	if (sblock->sparity)
		return;

	for (idx = 0; idx < sblock->page_count; idx++) {
		if (!scrub_write_page_to_dev_replace(sblock, idx))
			continue;

		btrfs_dev_replace_stats_inc(
			&sblock->sctx->dev_root->fs_info->dev_replace.
			num_write_errors);
	}
}
/*
 * Queue page @page_num of @sblock for writing to the dev-replace target.
 * A page flagged with an I/O error is filled with zeros first.
 *
 * Returns the result of scrub_add_page_to_wr_bio().
 */
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num)
{
	struct scrub_page *src = sblock->pagev[page_num];

	BUG_ON(src->page == NULL);

	if (src->io_error) {
		void *kaddr = kmap_atomic(src->page);

		memset(kaddr, 0, PAGE_CACHE_SIZE);
		flush_dcache_page(src->page);
		kunmap_atomic(kaddr);
	}

	return scrub_add_page_to_wr_bio(sblock->sctx, src);
}
/*
* Append @spage to the write bio that is currently being assembled for the
* dev-replace target, starting a new one if necessary.
*
* The whole operation runs under wr_ctx->wr_lock.  Pages are only merged
* into the current bio while they are contiguous both physically (on the
* target, physical_for_dev_replace) and logically; otherwise the current bio
* is submitted and the page starts a new one.  A bio that reached
* wr_ctx->pages_per_wr_bio pages is submitted right away.
*
* Returns 0 on success, -ENOMEM if the scrub_bio or the bio cannot be
* allocated, -EIO if the page cannot even be added to an empty bio.
*/
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
struct scrub_bio *sbio;
int ret;
mutex_lock(&wr_ctx->wr_lock);
again:
/* no scrub_bio being assembled: allocate a fresh, empty one */
if (!wr_ctx->wr_curr_bio) {
wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
GFP_NOFS);
if (!wr_ctx->wr_curr_bio) {
mutex_unlock(&wr_ctx->wr_lock);
return -ENOMEM;
}
wr_ctx->wr_curr_bio->sctx = sctx;
wr_ctx->wr_curr_bio->page_count = 0;
}
sbio = wr_ctx->wr_curr_bio;
if (sbio->page_count == 0) {
/* first page: it defines where on the target the bio starts */
struct bio *bio;
sbio->physical = spage->physical_for_dev_replace;
sbio->logical = spage->logical;
sbio->dev = wr_ctx->tgtdev;
bio = sbio->bio;
if (!bio) {
bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
if (!bio) {
mutex_unlock(&wr_ctx->wr_lock);
return -ENOMEM;
}
sbio->bio = bio;
}
bio->bi_private = sbio;
bio->bi_end_io = scrub_wr_bio_end_io;
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical_for_dev_replace ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
spage->logical) {
/* page does not directly follow the bio: send it and start over */
scrub_wr_submit(sctx);
goto again;
}
ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
if (ret != PAGE_SIZE) {
/* not even a single page fits: give up on this page */
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
mutex_unlock(&wr_ctx->wr_lock);
return -EIO;
}
/* bio refused the page: submit what we have and retry */
scrub_wr_submit(sctx);
goto again;
}
sbio->pagev[sbio->page_count] = spage;
/* reference is dropped in scrub_wr_bio_end_io_worker() */
scrub_page_get(spage);
sbio->page_count++;
if (sbio->page_count == wr_ctx->pages_per_wr_bio)
scrub_wr_submit(sctx);
mutex_unlock(&wr_ctx->wr_lock);
return 0;
}
/*
 * Submit the write bio currently being assembled, if there is one, and
 * detach it from the write context so that the next page starts a new bio.
 * The callers visible in this file invoke it with wr_ctx->wr_lock held.
 */
static void scrub_wr_submit(struct scrub_ctx *sctx)
{
	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
	struct scrub_bio *pending = wr_ctx->wr_curr_bio;

	if (!pending)
		return;

	wr_ctx->wr_curr_bio = NULL;
	WARN_ON(!pending->bio->bi_bdev);
	scrub_pending_bio_inc(sctx);

	/*
	 * process all writes in a single worker thread. Then the block layer
	 * orders the requests before sending them to the driver which
	 * doubled the write performance on spinning disks when measured
	 * with Linux 3.5
	 */
	btrfsic_submit_bio(WRITE, pending->bio);
}
/*
 * end_io callback of dev-replace write bios: records the result in the
 * scrub_bio and defers the remaining work to
 * scrub_wr_bio_end_io_worker() on the scrub_wr_completion_workers queue.
 */
static void scrub_wr_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *wr_sbio = bio->bi_private;

	wr_sbio->bio = bio;
	wr_sbio->err = err;

	btrfs_init_work(&wr_sbio->work, btrfs_scrubwrc_helper,
			scrub_wr_bio_end_io_worker, NULL, NULL);
	btrfs_queue_work(
		wr_sbio->dev->dev_root->fs_info->scrub_wr_completion_workers,
		&wr_sbio->work);
}
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
if (sbio->err) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->dev_root->fs_info->dev_replace;
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
btrfs_dev_replace_stats_inc(&dev_replace->
num_write_errors);
}
}
for (i = 0; i < sbio->page_count; i++)
scrub_page_put(sbio->pagev[i]);
bio_put(sbio->bio);
kfree(sbio);
scrub_pending_bio_dec(sctx);
}
/*
 * Verify a completely read block according to the extent flags of its first
 * page (data, tree block or super block) and start error handling via
 * scrub_handle_errored_block() if the data or tree block check fails.
 *
 * Super block results are not propagated: the function returns 0 for them.
 * Returns non-zero if a data or tree block failed verification.
 */
static int scrub_checksum(struct scrub_block *sblock)
{
	u64 extent_flags;
	int corrupted = 0;

	WARN_ON(sblock->page_count < 1);
	extent_flags = sblock->pagev[0]->flags;

	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		corrupted = scrub_checksum_data(sblock);
	else if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		corrupted = scrub_checksum_tree_block(sblock);
	else if (extent_flags & BTRFS_EXTENT_FLAG_SUPER)
		(void)scrub_checksum_super(sblock);
	else
		WARN_ON(1);

	if (corrupted)
		scrub_handle_errored_block(sblock);

	return corrupted;
}
/*
 * Checksum sctx->sectorsize bytes of a data block and compare the result
 * with the checksum stored in the block's first page.
 *
 * Returns 0 if the block has no checksum or the checksum matches, 1 on a
 * mismatch.
 */
static int scrub_checksum_data(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	u8 computed[BTRFS_CSUM_SIZE];
	u8 *expected;
	void *kaddr;
	u32 crc = ~(u32)0;
	u64 remaining;
	int idx = 0;

	BUG_ON(sblock->page_count < 1);
	if (!sblock->pagev[0]->have_csum)
		return 0;

	expected = sblock->pagev[0]->csum;
	kaddr = kmap_atomic(sblock->pagev[0]->page);
	remaining = sctx->sectorsize;

	/* one mapped page per pass; the page is unmapped before moving on */
	while (1) {
		u64 chunk = min_t(u64, remaining, PAGE_SIZE);

		crc = btrfs_csum_data(kaddr, crc, chunk);
		kunmap_atomic(kaddr);
		remaining -= chunk;
		if (remaining == 0)
			break;

		idx++;
		BUG_ON(idx >= sblock->page_count);
		BUG_ON(!sblock->pagev[idx]->page);
		kaddr = kmap_atomic(sblock->pagev[idx]->page);
	}

	btrfs_csum_final(crc, computed);

	return memcmp(computed, expected, sctx->csum_size) ? 1 : 0;
}
/*
* Verify a tree block: check the header fields against the expected values
* and compare the stored checksum with the one computed over
* sctx->nodesize bytes minus the leading BTRFS_CSUM_SIZE bytes.
*
* Returns non-zero if any header check or the checksum comparison failed,
* 0 otherwise.
*/
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_header *h;
struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
void *mapped_buffer;
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
int fail = 0;
int crc_fail = 0;
u64 len;
int index;
BUG_ON(sblock->page_count < 1);
page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
h = (struct btrfs_header *)mapped_buffer;
/* keep a private copy: the first page is unmapped inside the loop below */
memcpy(on_disk_csum, h->csum, sctx->csum_size);
/*
* we don't use the getter functions here, as we
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
++fail;
if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
++fail;
if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
++fail;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
++fail;
/* the checksum covers everything after the checksum field itself */
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
index = 0;
/* checksum the mapped page, unmap it, then map the next one if needed */
for (;;) {
u64 l = min_t(u64, len, mapped_size);
crc = btrfs_csum_data(p, crc, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
break;
index++;
BUG_ON(index >= sblock->page_count);
BUG_ON(!sblock->pagev[index]->page);
page = sblock->pagev[index]->page;
mapped_buffer = kmap_atomic(page);
mapped_size = PAGE_SIZE;
p = mapped_buffer;
}
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
++crc_fail;
return fail || crc_fail;
}
/*
* Verify a super block copy: bytenr, generation and fsid are compared with
* the expected values and the checksum is computed over
* BTRFS_SUPER_INFO_SIZE bytes minus the leading BTRFS_CSUM_SIZE bytes.
*
* Errors are only reported, not repaired: sctx->stat.super_errors is
* incremented and the device's corruption statistic (or, if only the
* generation is off, its generation statistic) is bumped.
*
* Returns the number of failed checks (0 if the super block is fine).
*/
static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
void *mapped_buffer;
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
/* generation mismatches are counted separately from corruption */
int fail_gen = 0;
int fail_cor = 0;
u64 len;
int index;
BUG_ON(sblock->page_count < 1);
page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
s = (struct btrfs_super_block *)mapped_buffer;
/* keep a private copy: the first page is unmapped inside the loop below */
memcpy(on_disk_csum, s->csum, sctx->csum_size);
if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
++fail_cor;
if (sblock->pagev[0]->generation != btrfs_super_generation(s))
++fail_gen;
if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
++fail_cor;
/* the checksum covers everything after the checksum field itself */
len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
index = 0;
/* checksum the mapped page, unmap it, then map the next one if needed */
for (;;) {
u64 l = min_t(u64, len, mapped_size);
crc = btrfs_csum_data(p, crc, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
break;
index++;
BUG_ON(index >= sblock->page_count);
BUG_ON(!sblock->pagev[index]->page);
page = sblock->pagev[index]->page;
mapped_buffer = kmap_atomic(page);
mapped_size = PAGE_SIZE;
p = mapped_buffer;
}
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
return fail_cor + fail_gen;
}
/* Take an additional reference on @sblock; dropped with scrub_block_put(). */
static void scrub_block_get(struct scrub_block *sblock)
{
atomic_inc(&sblock->refs);
}
/*
 * Drop a reference on @sblock.  When the last reference goes away, the
 * block's reference on its parity context (if any) and on each of its pages
 * is dropped and the block itself is freed.
 */
static void scrub_block_put(struct scrub_block *sblock)
{
	int idx;

	if (!atomic_dec_and_test(&sblock->refs))
		return;

	if (sblock->sparity)
		scrub_parity_put(sblock->sparity);

	for (idx = 0; idx < sblock->page_count; idx++)
		scrub_page_put(sblock->pagev[idx]);

	kfree(sblock);
}
/* Take an additional reference on @spage; dropped with scrub_page_put(). */
static void scrub_page_get(struct scrub_page *spage)
{
atomic_inc(&spage->refs);
}
/*
 * Drop a reference on @spage.  When the last reference goes away, the
 * attached data page (if one was allocated) and the scrub_page are freed.
 */
static void scrub_page_put(struct scrub_page *spage)
{
	if (!atomic_dec_and_test(&spage->refs))
		return;

	if (spage->page)
		__free_page(spage->page);
	kfree(spage);
}
/*
 * Submit the read bio that is currently being filled (sctx->curr), if any,
 * and mark the context as having no current bio.
 */
static void scrub_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (sctx->curr == -1)
		return;

	sbio = sctx->bios[sctx->curr];
	sctx->curr = -1;
	scrub_pending_bio_inc(sctx);

	if (sbio->bio->bi_bdev) {
		btrfsic_submit_bio(READ, sbio->bio);
		return;
	}

	/*
	 * this case should not happen. If btrfs_map_block() is
	 * wrong, it could happen for dev-replace operations on
	 * missing devices when no mirrors are available, but in
	 * this case it should already fail the mount.
	 * This case is handled correctly (but _very_ slowly).
	 */
	printk_ratelimited(KERN_WARNING
		"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
	bio_endio(sbio->bio, -EIO);
}
/*
* Append @spage to the read bio that is currently being filled, grabbing a
* free scrub_bio first if there is no current one.
*
* Pages are only merged while they directly follow the bio's last page both
* physically and logically and live on the same device; otherwise the
* current bio is submitted and the page starts a new one.  A bio that
* reached sctx->pages_per_rd_bio pages is submitted right away.
*
* For every page added, a reference on the page's block is taken and the
* block's outstanding_pages count is incremented.
*
* Returns 0 on success, -ENOMEM if no bio can be allocated, -EIO if the page
* cannot even be added to an empty bio.
*/
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
int ret;
again:
/*
* grab a fresh bio or wait for one to become available
*/
while (sctx->curr == -1) {
spin_lock(&sctx->list_lock);
/* pop the head of the free list, if there is one */
sctx->curr = sctx->first_free;
if (sctx->curr != -1) {
sctx->first_free = sctx->bios[sctx->curr]->next_free;
sctx->bios[sctx->curr]->next_free = -1;
sctx->bios[sctx->curr]->page_count = 0;
spin_unlock(&sctx->list_lock);
} else {
spin_unlock(&sctx->list_lock);
wait_event(sctx->list_wait, sctx->first_free != -1);
}
}
sbio = sctx->bios[sctx->curr];
if (sbio->page_count == 0) {
/* first page: it defines device and start position of the bio */
struct bio *bio;
sbio->physical = spage->physical;
sbio->logical = spage->logical;
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
if (!bio)
return -ENOMEM;
sbio->bio = bio;
}
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
spage->logical ||
sbio->dev != spage->dev) {
/* page does not directly follow the bio: send it and start over */
scrub_submit(sctx);
goto again;
}
sbio->pagev[sbio->page_count] = spage;
ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
if (ret != PAGE_SIZE) {
/* not even a single page fits: give up on this page */
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
return -EIO;
}
/* bio refused the page: submit what we have and retry */
scrub_submit(sctx);
goto again;
}
scrub_block_get(sblock); /* one for the page added to the bio */
atomic_inc(&sblock->outstanding_pages);
sbio->page_count++;
if (sbio->page_count == sctx->pages_per_rd_bio)
scrub_submit(sctx);
return 0;
}
/*
* Build a scrub_block covering @len bytes starting at @logical / @physical
* on @dev and queue all of its pages for reading.
*
* One scrub_page (with a freshly allocated data page) is created per
* PAGE_SIZE step; each records the extent @flags, generation @gen,
* @mirror_num, its location and, if @csum is not NULL, a copy of the
* checksum.  With @force set, the read bio is submitted immediately after
* the pages have been queued.
*
* Returns 0 on success, -ENOMEM on allocation failure (also counted in
* sctx->stat.malloc_errors), or the error of scrub_add_page_to_rd_bio().
*/
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
u64 physical_for_dev_replace)
{
struct scrub_block *sblock;
int index;
sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
/* one ref inside this function, plus one for each page added to
* a bio later on */
atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
spage = kzalloc(sizeof(*spage), GFP_NOFS);
if (!spage) {
/* also reached from the alloc_page() failure below */
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
/* releases the block together with the pages attached so far */
scrub_block_put(sblock);
return -ENOMEM;
}
BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
scrub_page_get(spage);
sblock->pagev[index] = spage;
spage->sblock = sblock;
spage->dev = dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
spage->physical = physical;
spage->physical_for_dev_replace = physical_for_dev_replace;
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
memcpy(spage->csum, csum, sctx->csum_size);
} else {
spage->have_csum = 0;
}
/* count the page first so that the error path above releases it */
sblock->page_count++;
spage->page = alloc_page(GFP_NOFS);
if (!spage->page)
goto leave_nomem;
len -= l;
logical += l;
physical += l;
physical_for_dev_replace += l;
}
WARN_ON(sblock->page_count == 0);
for (index = 0; index < sblock->page_count; index++) {
struct scrub_page *spage = sblock->pagev[index];
int ret;
ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
if (force)
scrub_submit(sctx);
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
return 0;
}
/*
 * end_io callback of scrub read bios: records the result in the scrub_bio
 * and queues its work item on the scrub_workers queue.
 */
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *rd_sbio = bio->bi_private;

	rd_sbio->bio = bio;
	rd_sbio->err = err;

	btrfs_queue_work(rd_sbio->dev->dev_root->fs_info->scrub_workers,
			 &rd_sbio->work);
}
/*
* Worker run after a scrub read bio has completed.
*
* On a read error every page of the bio is flagged with an I/O error and its
* block is marked as having seen one.  Blocks whose last outstanding page
* was part of this bio are completed via scrub_block_complete().  Afterwards
* the bio is released and the scrub_bio is put back on the free list.
*/
static void scrub_bio_end_io_worker(struct btrfs_work *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
if (sbio->err) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
spage->sblock->no_io_error_seen = 0;
}
}
/* now complete the scrub_block items that have all pages completed */
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
struct scrub_block *sblock = spage->sblock;
if (atomic_dec_and_test(&sblock->outstanding_pages))
scrub_block_complete(sblock);
/* drop the reference taken when the page was added to the bio */
scrub_block_put(sblock);
}
bio_put(sbio->bio);
sbio->bio = NULL;
/* push the scrub_bio back onto the head of the free list */
spin_lock(&sctx->list_lock);
sbio->next_free = sctx->first_free;
sctx->first_free = sbio->index;
spin_unlock(&sctx->list_lock);
/* during dev replace, push out the pending write bio when a flush was requested */
if (sctx->is_dev_replace &&
atomic_read(&sctx->wr_ctx.flush_all_writes)) {
mutex_lock(&sctx->wr_ctx.wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_ctx.wr_lock);
}
scrub_pending_bio_dec(sctx);
}
/*
 * Set the bits of @bitmap that correspond to the sectors of the byte range
 * [@start, @start + @len) within the stripe described by @sparity.
 *
 * A range of at least one stripe length sets all sparity->nsectors bits.
 * Otherwise the range is made relative to sparity->logic_start and reduced
 * modulo the stripe length; a range that runs past the last sector wraps
 * around to the beginning of the bitmap.
 */
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
				       unsigned long *bitmap,
				       u64 start, u64 len)
{
	int sectorsize = sparity->sctx->dev_root->sectorsize;
	int first;
	int count;

	if (len >= sparity->stripe_len) {
		bitmap_set(bitmap, 0, sparity->nsectors);
		return;
	}

	start -= sparity->logic_start;
	/* do_div() leaves the quotient in start and returns the remainder */
	first = (int)do_div(start, sparity->stripe_len);
	first /= sectorsize;
	count = (int)len / sectorsize;

	if (first + count > sparity->nsectors) {
		/* mark up to the end of the bitmap, then the wrapped part */
		bitmap_set(bitmap, first, sparity->nsectors - first);
		bitmap_set(bitmap, 0, count - (sparity->nsectors - first));
	} else {
		bitmap_set(bitmap, first, count);
	}
}
/* Mark the sectors of [@start, @start + @len) in @sparity's ebitmap. */
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
u64 start, u64 len)
{
__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
}
/* Mark the sectors of [@start, @start + @len) in @sparity's dbitmap. */
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
u64 start, u64 len)
{
__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
}
/*
 * Called once all pages of @sblock have been read.
 *
 * A block that saw a read error goes straight to error handling; otherwise
 * its checksum is verified.  For blocks that belong to a parity check and
 * are corrupted without having been corrected, the covered sectors are
 * recorded as erroneous in the parity context.
 */
static void scrub_block_complete(struct scrub_block *sblock)
{
	int corrupted;

	if (sblock->no_io_error_seen) {
		/*
		 * if has checksum error, write via repair mechanism in
		 * dev replace case, otherwise write here in dev replace
		 * case.
		 */
		corrupted = scrub_checksum(sblock);
		if (!corrupted && sblock->sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock);
	} else {
		corrupted = 1;
		scrub_handle_errored_block(sblock);
	}

	if (sblock->sparity && corrupted && !sblock->data_corrected) {
		u64 first = sblock->pagev[0]->logical;
		u64 last = sblock->pagev[sblock->page_count - 1]->logical;

		/* length runs to the end of the block's last page */
		scrub_parity_mark_sectors_error(sblock->sparity, first,
						last + PAGE_SIZE - first);
	}
}
/*
* Look up the checksum for the sector at @logical in sctx->csum_list and
* copy sctx->csum_size bytes of it to @csum.
*
* Entries at the head of the list that end at or before @logical are
* discarded (and counted in sctx->stat.csum_discards).  An entry is also
* removed once the checksum of its last sector has been handed out.
* @len is not used by this function.
*
* Returns 1 if a checksum was copied to @csum, 0 if none covers @logical.
*/
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
unsigned long num_sectors;
while (!list_empty(&sctx->csum_list)) {
sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
/* first entry starts behind @logical: no checksum for it */
if (sum->bytenr > logical)
return 0;
/* entry covers @logical */
if (sum->bytenr + sum->len > logical)
break;
/* entry lies completely before @logical: drop it */
++sctx->stat.csum_discards;
list_del(&sum->list);
kfree(sum);
sum = NULL;
}
/* list ran empty without finding a covering entry */
if (!sum)
return 0;
index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
num_sectors = sum->len / sctx->sectorsize;
memcpy(csum, sum->sums + index, sctx->csum_size);
/* that was the entry's last sector: the entry is used up */
if (index == num_sectors - 1) {
list_del(&sum->list);
kfree(sum);
}
return 1;
}
/*
* Scrub one extent: account it in the scrub statistics, then split it into
* blocks (sctx->sectorsize for data, sctx->nodesize for tree blocks) and
* queue each block for reading through scrub_pages().
*
* For data blocks the checksum is looked up first; blocks without one are
* counted in sctx->stat.no_csum and, during dev replace, copied with
* copy_nocow_pages() instead of being scrubbed.
*
* Returns 0 on success or the first error from scrub_pages() /
* copy_nocow_pages().
*
* NOTE(review): the previous remark here said scrub_extent "tries to collect
* up to 64 kB for each bio", while the SCRUB_PAGES_PER_RD_BIO define at the
* top of the file is annotated with 128k per bio -- confirm which is right.
*/
static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
blocksize = sctx->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
blocksize = sctx->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
sctx->stat.tree_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else {
/* neither data nor tree block: unexpected, fall back to sectorsize */
blocksize = sctx->sectorsize;
WARN_ON(1);
}
while (len) {
u64 l = min_t(u64, len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
have_csum = scrub_find_csum(sctx, logical, l, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
/* dev replace: data without checksum is copied, not scrubbed */
if (sctx->is_dev_replace && !have_csum) {
ret = copy_nocow_pages(sctx, logical, l,
mirror_num,
physical_for_dev_replace);
goto behind_scrub_pages;
}
}
ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
mirror_num, have_csum ? csum : NULL, 0,
physical_for_dev_replace);
behind_scrub_pages:
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
physical_for_dev_replace += l;
}
return 0;
}
/*
* Parity-check counterpart of scrub_pages(): build a scrub_block covering
* @len bytes starting at @logical / @physical on @dev, attach it to
* @sparity and queue all of its pages for reading.
*
* The block holds a reference on @sparity.  Every page is referenced twice:
* once by the block and once by the sparity->spages list it is appended to.
*
* Returns 0 on success, -ENOMEM on allocation failure (also counted in
* sctx->stat.malloc_errors), or the error of scrub_add_page_to_rd_bio().
*/
static int scrub_pages_for_parity(struct scrub_parity *sparity,
u64 logical, u64 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_block *sblock;
int index;
sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
/* one ref inside this function, plus one for each page added to
* a bio later on */
atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
/* dropped again in scrub_block_put() when the block is freed */
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
spage = kzalloc(sizeof(*spage), GFP_NOFS);
if (!spage) {
/* also reached from the alloc_page() failure below */
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
/* For scrub block */
scrub_page_get(spage);
sblock->pagev[index] = spage;
/* For scrub parity */
scrub_page_get(spage);
list_add_tail(&spage->list, &sparity->spages);
spage->sblock = sblock;
spage->dev = dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
spage->physical = physical;
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
memcpy(spage->csum, csum, sctx->csum_size);
} else {
spage->have_csum = 0;
}
/* count the page first so that the error path above releases it */
sblock->page_count++;
spage->page = alloc_page(GFP_NOFS);
if (!spage->page)
goto leave_nomem;
len -= l;
logical += l;
physical += l;
}
WARN_ON(sblock->page_count == 0);
for (index = 0; index < sblock->page_count; index++) {
struct scrub_page *spage = sblock->pagev[index];
int ret;
ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
return 0;
}
/*
 * Walk an extent in block-sized steps and queue each step for parity scrub
 * through scrub_pages_for_parity().
 *
 * The step is sctx->sectorsize for data extents and sctx->nodesize for tree
 * blocks; any other flag combination triggers a WARN and falls back to
 * sctx->sectorsize. For data extents a step is skipped entirely when
 * scrub_find_csum() returns 0 for it.
 *
 * Returns 0 on success or the first error from scrub_pages_for_parity().
 */
static int scrub_extent_for_parity(struct scrub_parity *sparity,
				   u64 logical, u64 len,
				   u64 physical, struct btrfs_device *dev,
				   u64 flags, u64 gen, int mirror_num)
{
	struct scrub_ctx *sctx = sparity->sctx;
	const int is_data = !!(flags & BTRFS_EXTENT_FLAG_DATA);
	u32 blocksize = sctx->sectorsize;
	u8 csum[BTRFS_CSUM_SIZE];
	u64 step;
	int ret;

	if (!is_data) {
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			blocksize = sctx->nodesize;
		else
			WARN_ON(1);
	}

	for (; len; len -= step, logical += step, physical += step) {
		u8 *csum_arg = NULL;

		step = min_t(u64, len, blocksize);

		if (is_data) {
			/* push csums to sbio; nothing to do without one */
			if (!scrub_find_csum(sctx, logical, step, csum))
				continue;
			csum_arg = csum;
		}

		ret = scrub_pages_for_parity(sparity, logical, step, physical,
					     dev, flags, gen, mirror_num,
					     csum_arg);
		if (ret)
			return ret;
	}

	return 0;
}
/*
 * Given a physical address on stripe @num of @map, calculate its logical
 * offset and store it in @offset. If @stripe_start is not NULL it receives
 * the logical offset at which the containing full stripe starts.
 *
 * If the address lies on a parity stripe, @offset is instead set to the
 * full-stripe start plus one stripe_len for every data stripe whose device
 * index is lower than @num.
 *
 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct map_lookup *map, u64 *offset,
				   u64 *stripe_start)
{
	const int nr_data = nr_data_stripes(map);
	u64 full_start;
	u64 quot;
	int data_nr;
	int nr_below = 0;
	int rotation;
	int dev_idx;

	full_start = (physical - map->stripes[num].physical) * nr_data;
	if (stripe_start)
		*stripe_start = full_start;

	*offset = full_start;
	for (data_nr = 0; data_nr < nr_data; data_nr++) {
		*offset = full_start + data_nr * map->stripe_len;

		/* do_div() divides in place and hands back the remainder */
		quot = *offset;
		do_div(quot, map->stripe_len);
		do_div(quot, nr_data);

		/* Work out the disk rotation on this stripe-set */
		rotation = do_div(quot, map->num_stripes);

		/* device index holding data stripe number data_nr */
		dev_idx = (rotation + data_nr) % map->num_stripes;
		if (dev_idx == num)
			return 0;
		if (dev_idx < num)
			nr_below++;
	}

	*offset = full_start + nr_below * map->stripe_len;
	return 1;
}
static void scrub_free_parity(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_page *curr, *next;
int nbits;
nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
if (nbits) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors += nbits;
sctx->stat.uncorrectable_errors += nbits;
spin_unlock(&sctx->stat_lock);
}
list_for_each_entry_safe(curr, next, &sparity->spages, list) {
list_del_init(&curr->list);
scrub_page_put(curr);
}
kfree(sparity);
}
/*
 * Completion handler installed on the parity scrub bio.
 *
 * On error, every sector set in dbitmap is also marked in ebitmap. The
 * scrub_parity stored in bio->bi_private is then freed, the pending-bio
 * count of the scrub context is dropped, and the bio is released.
 */
static void scrub_parity_bio_endio(struct bio *bio, int error)
{
	struct scrub_parity *sp = bio->bi_private;
	/* read the context now; sp is freed below */
	struct scrub_ctx *ctx = sp->sctx;

	if (error != 0) {
		bitmap_or(sp->ebitmap, sp->ebitmap, sp->dbitmap,
			  sp->nsectors);
	}

	scrub_free_parity(sp);
	scrub_pending_bio_dec(ctx);
	bio_put(bio);
}
/*
 * Submit a raid56 parity scrub for the sectors collected in @sparity.
 *
 * Sectors flagged in ebitmap are first removed from dbitmap; if no sector
 * remains, @sparity is simply freed. Otherwise the logical range is mapped,
 * a bio and a raid56 scrub rbio are allocated, every page on the spages list
 * is added to the rbio, and the rbio is submitted with
 * scrub_parity_bio_endio() as the completion handler.
 *
 * If mapping or any allocation fails, all remaining dbitmap sectors are
 * marked in ebitmap, stat.malloc_errors is incremented, and @sparity is
 * freed here.
 */
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_raid_bio *rbio;
	struct scrub_page *pg;
	struct bio *bio;
	u64 length;

	/* nothing left to check once the bad sectors are masked out */
	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap,
			   sparity->ebitmap, sparity->nsectors)) {
		scrub_free_parity(sparity);
		return;
	}

	length = sparity->logic_end - sparity->logic_start + 1;
	if (btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
			     sparity->logic_start,
			     &length, &bbio, 0, 1) ||
	    !bbio || !bbio->raid_map)
		goto fail;

	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
	if (!bio)
		goto fail;

	bio->bi_end_io = scrub_parity_bio_endio;
	bio->bi_private = sparity;
	bio->bi_iter.bi_sector = sparity->logic_start >> 9;

	rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
					      length, sparity->scrub_dev,
					      sparity->dbitmap,
					      sparity->nsectors);
	if (!rbio) {
		bio_put(bio);
		goto fail;
	}

	list_for_each_entry(pg, &sparity->spages, list)
		raid56_parity_add_scrub_pages(rbio, pg->page, pg->logical);

	scrub_pending_bio_inc(sctx);
	raid56_parity_submit_scrub_rbio(rbio);
	return;

fail:
	btrfs_put_bbio(bbio);
	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
		  sparity->nsectors);
	spin_lock(&sctx->stat_lock);
	sctx->stat.malloc_errors++;
	spin_unlock(&sctx->stat_lock);
	scrub_free_parity(sparity);
}
/*
 * Size in bytes of a bitmap holding @nsectors bits, rounded up to a whole
 * number of longs.
 */
static inline int scrub_calc_parity_bitmap_len(int nsectors)
{
	int nr_longs = DIV_ROUND_UP(nsectors, BITS_PER_LONG);
	int bytes_per_long = BITS_PER_LONG / 8;

	return nr_longs * bytes_per_long;
}
static void scrub_parity_get(struct scrub_parity *sparity)
{
atomic_inc(&sparity->refs);
}
/*
 * Drop one reference on @sparity. Whoever drops the last reference runs
 * scrub_parity_check_and_repair() on it.
 */
static void scrub_parity_put(struct scrub_parity *sparity)
{
	if (atomic_dec_and_test(&sparity->refs))
		scrub_parity_check_and_repair(sparity);
}
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
struct btrfs_path *path,
u64 logic_start,
u64 logic_end)
{
struct btrfs_fs_info *fs_info =