// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "libxfs_priv.h"
#include "init.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "libfrog/platform.h"
#include "libxfs/xfile.h"
#include "libxfs/buf_mem.h"
#include "libxfs.h"
static void libxfs_brelse(struct cache_node *node);
/*
* Important design/architecture note:
*
* The userspace code that uses the buffer cache is much less constrained than
* the kernel code. The userspace code is pretty nasty in places, especially
* when it comes to buffer error handling. Very little of the userspace code
* outside libxfs clears bp->b_error - very little code even checks it - so the
* libxfs code is tripping on stale errors left by the userspace code.
*
 * We can't clear errors or zero buffer contents in libxfs_buf_get*() like we
 * do in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check the bp->b_error
 * field - assuming that the buffer is still in the cache when you check.
*
 * This is very different to the kernel code, which does not release buffers on
 * a write and so can wait on the IO and check for errors. The kernel buffer
 * cache also guarantees a buffer of a known initial state from xfs_buf_get()
 * even on a cache hit.
*
* IOWs, userspace is behaving quite differently to the kernel and as a result
* it leaks errors from reads, invalidations and writes through
* libxfs_buf_get/libxfs_buf_read.
*
* The result of this is that until the userspace code outside libxfs is cleaned
 * up, functions that release buffers from userspace control (i.e.
* libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
* propagation of stale errors into future buffer operations.
*/
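/*
 * A minimal sketch of how a caller can avoid depending on stale error state:
 * write the buffer explicitly and check the result, instead of relying on a
 * later cache lookup to surface bp->b_error. Only helpers used elsewhere in
 * this file appear here; "btp", "blkno", "numblks" and "ops" are placeholders
 * for the caller's buffer target, block address, length and verifier ops:
 *
 *	struct xfs_buf	*bp;
 *	int		error;
 *
 *	error = libxfs_buf_read(btp, blkno, numblks, 0, &bp, ops);
 *	if (error)
 *		return error;
 *	// ... modify the contents at bp->b_addr ...
 *	libxfs_buf_mark_dirty(bp);
 *	error = libxfs_bwrite(bp);	// returns bp->b_error
 *	libxfs_buf_relse(bp);		// note: relse clears bp->b_error
 *	return error;
 */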
#define BDSTRAT_SIZE (256 * 1024)
#define IO_BCOMPARE_CHECK
/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
int fd = btp->bt_bdev_fd;
xfs_off_t start_offset, end_offset, offset;
ssize_t zsize, bytes;
size_t len_bytes;
char *z;
int error;
if (xfs_buftarg_is_mem(btp))
return -EOPNOTSUPP;
start_offset = LIBXFS_BBTOOFF64(start);
/* try to use special zeroing methods, fall back to writes if needed */
len_bytes = LIBXFS_BBTOOFF64(len);
error = fallocate(fd, FALLOC_FL_ZERO_RANGE, start_offset, len_bytes);
if (!error) {
xfs_buftarg_trip_write(btp);
return 0;
}
zsize = min(BDSTRAT_SIZE, BBTOB(len));
if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
fprintf(stderr,
_("%s: %s can't memalign %d bytes: %s\n"),
progname, __FUNCTION__, (int)zsize, strerror(errno));
exit(1);
}
memset(z, 0, zsize);
if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
progname, __FUNCTION__,
(unsigned long long)start_offset, strerror(errno));
exit(1);
}
end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
for (offset = 0; offset < end_offset; ) {
bytes = min((ssize_t)(end_offset - offset), zsize);
if ((bytes = write(fd, z, bytes)) < 0) {
fprintf(stderr, _("%s: %s write failed: %s\n"),
progname, __FUNCTION__, strerror(errno));
exit(1);
} else if (bytes == 0) {
fprintf(stderr, _("%s: %s not progressing?\n"),
progname, __FUNCTION__);
exit(1);
}
xfs_buftarg_trip_write(btp);
offset += bytes;
}
free(z);
return 0;
}
static void unmount_record(void *p)
{
xlog_op_header_t *op = (xlog_op_header_t *)p;
/* the data section must be 32 bit size aligned */
struct {
uint16_t magic;
uint16_t pad1;
uint32_t pad2; /* may as well make it 64 bits */
} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
memset(p, 0, BBSIZE);
/* dummy tid to mark this as written from userspace */
op->oh_tid = cpu_to_be32(0xb0c0d0d0);
op->oh_len = cpu_to_be32(sizeof(magic));
op->oh_clientid = XFS_LOG;
op->oh_flags = XLOG_UNMOUNT_TRANS;
op->oh_res2 = 0;
/* and the data for this op */
memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
static char *next(
char *ptr,
int offset,
void *private)
{
struct xfs_buf *buf = (struct xfs_buf *)private;
if (buf &&
(BBTOB(buf->b_length) < (int)(ptr - (char *)buf->b_addr) + offset))
abort();
return ptr + offset;
}
struct xfs_buf *
libxfs_getsb(
struct xfs_mount *mp)
{
struct xfs_buf *bp;
libxfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, XFS_FSS_TO_BB(mp, 1),
0, &bp, &xfs_sb_buf_ops);
return bp;
}
struct kmem_cache *xfs_buf_cache;
static struct cache_mru xfs_buf_freelist =
{{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
0, PTHREAD_MUTEX_INITIALIZER };
/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE 64
unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
uint64_t hashval = ((struct xfs_bufkey *)key)->blkno;
uint64_t tmp;
tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
return tmp % hashsize;
}
int
libxfs_bcompare(
struct cache_node *node,
cache_key_t key)
{
struct xfs_buf *bp = container_of(node, struct xfs_buf,
b_node);
struct xfs_bufkey *bkey = (struct xfs_bufkey *)key;
struct cache *bcache = bkey->buftarg->bcache;
if (bp->b_cache_key == bkey->blkno) {
if (bp->b_length == bkey->bblen)
return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
if (!(bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
fprintf(stderr,
"%lx: Badness in key lookup (length)\n"
"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
pthread_self(),
(unsigned long long)xfs_buf_daddr(bp),
BBTOB(bp->b_length),
(unsigned long long)bkey->blkno,
BBTOB(bkey->bblen));
}
#endif
return CACHE_PURGE;
}
return CACHE_MISS;
}
static void
__initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
unsigned int bytes)
{
ASSERT(!xfs_buftarg_is_mem(btp));
bp->b_flags = 0;
bp->b_cache_key = bno;
bp->b_length = BTOBB(bytes);
bp->b_target = btp;
bp->b_mount = btp->bt_mount;
bp->b_error = 0;
if (!bp->b_addr)
bp->b_addr = memalign(libxfs_device_alignment(), bytes);
if (!bp->b_addr) {
fprintf(stderr,
_("%s: %s can't memalign %u bytes: %s\n"),
progname, __FUNCTION__, bytes,
strerror(errno));
exit(1);
}
memset(bp->b_addr, 0, bytes);
pthread_mutex_init(&bp->b_lock, NULL);
bp->b_holder = 0;
bp->b_recur = 0;
bp->b_ops = NULL;
INIT_LIST_HEAD(&bp->b_li_list);
if (!bp->b_maps)
bp->b_maps = &bp->__b_map;
if (bp->b_maps == &bp->__b_map) {
bp->b_nmaps = 1;
bp->b_maps[0].bm_bn = bno;
bp->b_maps[0].bm_len = bp->b_length;
}
}
static void
libxfs_initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
unsigned int bytes)
{
__initbuf(bp, btp, bno, bytes);
}
static void
libxfs_initbuf_map(struct xfs_buf *bp, struct xfs_buftarg *btp,
struct xfs_buf_map *map, int nmaps)
{
unsigned int bytes = 0;
int i;
bytes = sizeof(struct xfs_buf_map) * nmaps;
bp->b_maps = malloc(bytes);
if (!bp->b_maps) {
fprintf(stderr,
_("%s: %s can't malloc %u bytes: %s\n"),
progname, __FUNCTION__, bytes,
strerror(errno));
exit(1);
}
bp->b_nmaps = nmaps;
bytes = 0;
for ( i = 0; i < nmaps; i++) {
bp->b_maps[i].bm_bn = map[i].bm_bn;
bp->b_maps[i].bm_len = map[i].bm_len;
bytes += BBTOB(map[i].bm_len);
}
__initbuf(bp, btp, map[0].bm_bn, bytes);
bp->b_flags |= LIBXFS_B_DISCONTIG;
}
static struct xfs_buf *
__libxfs_getbufr(int blen)
{
struct xfs_buf *bp;
/*
	 * First look for a free buffer of the right size that can be reused
	 * as-is. If there isn't one, take any buffer off the free list, free
	 * its data area and maps, and clear b_addr so that libxfs_initbuf()
	 * will allocate a new one.
*/
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
if (!list_empty(&xfs_buf_freelist.cm_list)) {
list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
if (bp->b_length == BTOBB(blen)) {
list_del_init(&bp->b_node.cn_mru);
break;
}
}
if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
bp = list_entry(xfs_buf_freelist.cm_list.next,
struct xfs_buf, b_node.cn_mru);
list_del_init(&bp->b_node.cn_mru);
free(bp->b_addr);
bp->b_addr = NULL;
if (bp->b_maps != &bp->__b_map)
free(bp->b_maps);
bp->b_maps = NULL;
}
} else
bp = kmem_cache_zalloc(xfs_buf_cache, 0);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
bp->b_ops = NULL;
if (bp->b_flags & LIBXFS_B_DIRTY)
fprintf(stderr, "found dirty buffer (bulk) on free list!\n");
return bp;
}
static struct xfs_buf *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
struct xfs_buf *bp;
int blen = BBTOB(bblen);
	bp = __libxfs_getbufr(blen);
if (bp)
libxfs_initbuf(bp, btp, blkno, blen);
return bp;
}
static struct xfs_buf *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
struct xfs_buf_map *map, int nmaps)
{
struct xfs_buf *bp;
int blen = BBTOB(bblen);
if (!map || !nmaps) {
fprintf(stderr,
_("%s: %s invalid map %p or nmaps %d\n"),
progname, __FUNCTION__, map, nmaps);
exit(1);
}
if (blkno != map[0].bm_bn) {
fprintf(stderr,
_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
progname, __FUNCTION__, (long long)map[0].bm_bn,
(long long)blkno);
exit(1);
}
	bp = __libxfs_getbufr(blen);
if (bp)
libxfs_initbuf_map(bp, btp, map, nmaps);
return bp;
}
void
xfs_buf_lock(
struct xfs_buf *bp)
{
if (use_xfs_buf_lock)
pthread_mutex_lock(&bp->b_lock);
}
void
xfs_buf_unlock(
struct xfs_buf *bp)
{
if (use_xfs_buf_lock)
pthread_mutex_unlock(&bp->b_lock);
}
static int
__cache_lookup(
struct xfs_bufkey *key,
unsigned int flags,
struct xfs_buf **bpp)
{
struct cache_node *cn = NULL;
struct cache *bcache = key->buftarg->bcache;
struct xfs_buf *bp;
*bpp = NULL;
cache_node_get(bcache, key, &cn);
if (!cn)
return -ENOMEM;
bp = container_of(cn, struct xfs_buf, b_node);
if (use_xfs_buf_lock) {
int ret;
ret = pthread_mutex_trylock(&bp->b_lock);
if (ret) {
ASSERT(ret == EAGAIN);
if (flags & LIBXFS_GETBUF_TRYLOCK) {
cache_node_put(bcache, cn);
return -EAGAIN;
}
if (pthread_equal(bp->b_holder, pthread_self())) {
fprintf(stderr,
_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
key->blkno);
bp->b_recur++;
*bpp = bp;
return 0;
} else {
pthread_mutex_lock(&bp->b_lock);
}
}
bp->b_holder = pthread_self();
}
cache_node_set_priority(bcache, cn,
cache_node_get_priority(cn) - CACHE_PREFETCH_PRIORITY);
*bpp = bp;
return 0;
}
static int
libxfs_getbuf_flags(
struct xfs_buftarg *btp,
xfs_daddr_t blkno,
int len,
unsigned int flags,
struct xfs_buf **bpp)
{
struct xfs_bufkey key = {NULL};
int ret;
key.buftarg = btp;
key.blkno = blkno;
key.bblen = len;
ret = __cache_lookup(&key, flags, bpp);
if (ret)
return ret;
if (btp == btp->bt_mount->m_ddev_targp) {
(*bpp)->b_pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, blkno));
}
return 0;
}
/*
* Clean the buffer flags for libxfs_getbuf*(), which wants to return
* an unused buffer with clean state. This prevents CRC errors on a
* re-read of a corrupt block that was prefetched and freed. This
* can happen with a massively corrupt directory that is discarded,
* but whose blocks are then recycled into expanding lost+found.
*
* Note however that if the buffer's dirty (prefetch calls getbuf)
* we'll leave the state alone because we don't want to discard blocks
* that have been fixed.
*/
static void
reset_buf_state(
struct xfs_buf *bp)
{
if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
LIBXFS_B_UPTODATE);
}
static int
__libxfs_buf_get_map(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
int flags,
struct xfs_buf **bpp)
{
struct xfs_bufkey key = {NULL};
int i;
if (nmaps == 1)
return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
flags, bpp);
key.buftarg = btp;
key.blkno = map[0].bm_bn;
for (i = 0; i < nmaps; i++) {
key.bblen += map[i].bm_len;
}
key.map = map;
key.nmaps = nmaps;
return __cache_lookup(&key, flags, bpp);
}
int
libxfs_buf_get_map(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
int flags,
struct xfs_buf **bpp)
{
int error;
error = __libxfs_buf_get_map(btp, map, nmaps, flags, bpp);
if (error)
return error;
reset_buf_state(*bpp);
return 0;
}
void
libxfs_buf_relse(
struct xfs_buf *bp)
{
/*
* ensure that any errors on this use of the buffer don't carry
* over to the next user.
*/
bp->b_error = 0;
if (use_xfs_buf_lock) {
if (bp->b_recur) {
bp->b_recur--;
} else {
bp->b_holder = 0;
pthread_mutex_unlock(&bp->b_lock);
}
}
if (!list_empty(&bp->b_node.cn_hash))
cache_node_put(bp->b_target->bcache, &bp->b_node);
else if (--bp->b_node.cn_count == 0) {
if (bp->b_flags & LIBXFS_B_DIRTY)
libxfs_bwrite(bp);
libxfs_brelse(&bp->b_node);
}
}
static struct cache_node *
libxfs_balloc(
cache_key_t key)
{
struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;
struct xfs_buf *bp;
if (bufkey->map)
bp = libxfs_getbufr_map(bufkey->buftarg, bufkey->blkno,
bufkey->bblen, bufkey->map, bufkey->nmaps);
else
bp = libxfs_getbufr(bufkey->buftarg, bufkey->blkno,
bufkey->bblen);
return &bp->b_node;
}
static int
__read_buf(int fd, void *buf, int len, off_t offset, int flags)
{
int sts;
sts = pread(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
fprintf(stderr, _("%s: read failed: %s\n"),
progname, strerror(error));
return -error;
} else if (sts != len) {
fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
progname, sts, len);
return -EIO;
}
return 0;
}
int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, struct xfs_buf *bp,
int len, int flags)
{
int fd = btp->bt_bdev_fd;
int bytes = BBTOB(len);
int error;
ASSERT(len <= bp->b_length);
if (xfs_buftarg_is_mem(btp))
return 0;
error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
if (!error &&
bp->b_target == btp &&
bp->b_cache_key == blkno &&
bp->b_length == len)
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_error = error;
return error;
}
int
libxfs_readbuf_verify(
struct xfs_buf *bp,
const struct xfs_buf_ops *ops)
{
if (!ops)
return bp->b_error;
bp->b_ops = ops;
bp->b_ops->verify_read(bp);
bp->b_flags &= ~LIBXFS_B_UNCHECKED;
return bp->b_error;
}
int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
int fd = btp->bt_bdev_fd;
int error = 0;
void *buf;
int i;
if (xfs_buftarg_is_mem(btp))
return 0;
buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
off_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
int len = BBTOB(bp->b_maps[i].bm_len);
error = __read_buf(fd, buf, len, offset, flags);
if (error) {
bp->b_error = error;
break;
}
buf += len;
}
if (!error)
bp->b_flags |= LIBXFS_B_UPTODATE;
return error;
}
int
libxfs_buf_read_map(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
int flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
bool salvage = flags & LIBXFS_READBUF_SALVAGE;
int error = 0;
*bpp = NULL;
if (nmaps == 1)
error = libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
0, &bp);
else
error = __libxfs_buf_get_map(btp, map, nmaps, 0, &bp);
if (error)
return error;
/*
* If the buffer was prefetched, it is likely that it was not validated.
* Hence if we are supplied an ops function and the buffer is marked as
* unchecked, we need to validate it now.
*
* We do this verification even if the buffer is dirty - the
* verification is almost certainly going to fail the CRC check in this
* case as a dirty buffer has not had the CRC recalculated. However, we
* should not be dirtying unchecked buffers and therefore failing it
* here because it's dirty and unchecked indicates we've screwed up
* somewhere else.
*
* Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
* they want the buffer even if it fails verification.
*/
bp->b_error = 0;
if (bp->b_flags & (LIBXFS_B_UPTODATE | LIBXFS_B_DIRTY)) {
if (bp->b_flags & LIBXFS_B_UNCHECKED)
error = libxfs_readbuf_verify(bp, ops);
if (error && !salvage)
goto err;
goto ok;
}
/*
* Set the ops on a cache miss (i.e. first physical read) as the
* verifier may change the ops to match the type of buffer it contains.
* A cache hit might reset the verifier to the original type if we set
* it again, but it won't get called again and set to match the buffer
* contents. *cough* xfs_da_node_buf_ops *cough*.
*/
if (nmaps == 1)
error = libxfs_readbufr(btp, map[0].bm_bn, bp, map[0].bm_len,
flags);
else
error = libxfs_readbufr_map(btp, bp, flags);
if (error)
goto err;
error = libxfs_readbuf_verify(bp, ops);
if (error && !salvage)
goto err;
ok:
*bpp = bp;
return 0;
err:
libxfs_buf_relse(bp);
return error;
}
/* Allocate a raw uncached buffer. */
static inline struct xfs_buf *
libxfs_getbufr_uncached(
struct xfs_buftarg *targ,
xfs_daddr_t daddr,
size_t bblen)
{
struct xfs_buf *bp;
bp = libxfs_getbufr(targ, daddr, bblen);
if (!bp)
return NULL;
INIT_LIST_HEAD(&bp->b_node.cn_hash);
bp->b_node.cn_count = 1;
return bp;
}
/*
* Allocate an uncached buffer that points nowhere. The refcount will be 1,
* and the cache node hash list will be empty to indicate that it's uncached.
*/
int
libxfs_buf_get_uncached(
struct xfs_buftarg *targ,
size_t bblen,
int flags,
struct xfs_buf **bpp)
{
*bpp = libxfs_getbufr_uncached(targ, XFS_BUF_DADDR_NULL, bblen);
return *bpp != NULL ? 0 : -ENOMEM;
}
/*
* Allocate and read an uncached buffer. The refcount will be 1, and the cache
* node hash list will be empty to indicate that it's uncached.
*/
int
libxfs_buf_read_uncached(
struct xfs_buftarg *targ,
xfs_daddr_t daddr,
size_t bblen,
int flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
int error;
*bpp = NULL;
bp = libxfs_getbufr_uncached(targ, daddr, bblen);
if (!bp)
return -ENOMEM;
error = libxfs_readbufr(targ, daddr, bp, bblen, flags);
if (error)
goto err;
error = libxfs_readbuf_verify(bp, ops);
if (error)
goto err;
*bpp = bp;
return 0;
err:
libxfs_buf_relse(bp);
return error;
}
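/*
 * A minimal usage sketch for the uncached read helper above, assuming the
 * caller already knows the daddr and length (in basic blocks) of the region
 * it wants to examine; "daddr", "bblen" and "ops" are placeholders:
 *
 *	struct xfs_buf	*bp;
 *	int		error;
 *
 *	error = libxfs_buf_read_uncached(btp, daddr, bblen, 0, &bp, ops);
 *	if (error)
 *		return error;
 *	// ... inspect bp->b_addr ...
 *	libxfs_buf_relse(bp);	// last reference; buffer returns to the free list
 */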
static int
__write_buf(int fd, void *buf, int len, off_t offset, int flags)
{
int sts;
sts = pwrite(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
fprintf(stderr, _("%s: pwrite failed: %s\n"),
progname, strerror(error));
return -error;
} else if (sts != len) {
fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
progname, sts, len);
return -EIO;
}
return 0;
}
int
libxfs_bwrite(
struct xfs_buf *bp)
{
int fd = bp->b_target->bt_bdev_fd;
/*
* we never write buffers that are marked stale. This indicates they
* contain data that has been invalidated, and even if the buffer is
* dirty it must *never* be written. Verifiers are wonderful for finding
* bugs like this. Make sure the error is obvious as to the cause.
*/
if (bp->b_flags & LIBXFS_B_STALE) {
bp->b_error = -ESTALE;
return bp->b_error;
}
/* Trigger the writeback hook if there is one. */
if (bp->b_mount->m_buf_writeback_fn)
bp->b_mount->m_buf_writeback_fn(bp);
/*
* clear any pre-existing error status on the buffer. This can occur if
* the buffer is corrupt on disk and the repair process doesn't clear
* the error before fixing and writing it back.
*/
bp->b_error = 0;
if (bp->b_ops) {
bp->b_ops->verify_write(bp);
if (bp->b_error) {
fprintf(stderr,
_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
__func__, bp->b_ops->name,
(unsigned long long)xfs_buf_daddr(bp),
bp->b_length);
return bp->b_error;
}
}
if (xfs_buftarg_is_mem(bp->b_target)) {
bp->b_error = 0;
} else if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
bp->b_error = __write_buf(fd, bp->b_addr, BBTOB(bp->b_length),
LIBXFS_BBTOOFF64(xfs_buf_daddr(bp)),
bp->b_flags);
} else {
int i;
void *buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
off_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
int len = BBTOB(bp->b_maps[i].bm_len);
bp->b_error = __write_buf(fd, buf, len, offset,
bp->b_flags);
if (bp->b_error)
break;
buf += len;
}
}
if (bp->b_error) {
fprintf(stderr,
_("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
__func__, bp->b_ops ? bp->b_ops->name : "(unknown)",
(unsigned long long)xfs_buf_daddr(bp),
bp->b_length, -bp->b_error);
} else {
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED);
xfs_buftarg_trip_write(bp->b_target);
}
return bp->b_error;
}
/*
* Mark a buffer dirty. The dirty data will be written out when the cache
* is flushed (or at release time if the buffer is uncached).
*/
void
libxfs_buf_mark_dirty(
struct xfs_buf *bp)
{
/*
* Clear any error hanging over from reading the buffer. This prevents
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= LIBXFS_B_DIRTY;
}
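/*
 * A minimal sketch of the read-modify-dirty pattern for cached buffers. The
 * dirty buffer is written back by a later libxfs_bcache_flush() or when the
 * cache reclaims it; "blkno", "numblks" and "ops" are placeholders:
 *
 *	error = libxfs_buf_read(btp, blkno, numblks, 0, &bp, ops);
 *	if (error)
 *		return error;
 *	// ... update the on-disk structure at bp->b_addr ...
 *	libxfs_buf_mark_dirty(bp);
 *	libxfs_buf_relse(bp);
 */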
/* Prepare a buffer to be sent to the MRU list. */
static inline void
libxfs_buf_prepare_mru(
struct xfs_buf *bp)
{
if (bp->b_pag)
xfs_perag_put(bp->b_pag);
bp->b_pag = NULL;
	ASSERT(!xfs_buftarg_is_mem(bp->b_target));
if (!(bp->b_flags & LIBXFS_B_DIRTY))
return;
/* Complain about (and remember) dropping dirty buffers. */
fprintf(stderr, _("%s: Releasing dirty buffer to free list!\n"),
progname);
if (bp->b_error == -EFSCORRUPTED)
bp->b_target->flags |= XFS_BUFTARG_CORRUPT_WRITE;
bp->b_target->flags |= XFS_BUFTARG_LOST_WRITE;
}
static void
libxfs_brelse(
struct cache_node *node)
{
struct xfs_buf *bp = container_of(node, struct xfs_buf,
b_node);
if (!bp)
return;
libxfs_buf_prepare_mru(bp);
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
static unsigned int
libxfs_bulkrelse(
struct cache *cache,
struct list_head *list)
{
struct xfs_buf *bp;
int count = 0;
if (list_empty(list))
		return 0;
list_for_each_entry(bp, list, b_node.cn_mru) {
libxfs_buf_prepare_mru(bp);
count++;
}
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
list_splice(list, &xfs_buf_freelist.cm_list);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
return count;
}
/*
* Free everything from the xfs_buf_freelist MRU, used at final teardown
*/
void
libxfs_bcache_free(void)
{
struct list_head *cm_list;
struct xfs_buf *bp, *next;
cm_list = &xfs_buf_freelist.cm_list;
list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
free(bp->b_addr);
if (bp->b_maps != &bp->__b_map)
free(bp->b_maps);
kmem_cache_free(xfs_buf_cache, bp);
}
}
/*
* When a buffer is marked dirty, the error is cleared. Hence if we are trying
 * to flush a buffer prior to cache reclaim that has an error on it, it means
* we've already tried to flush it and it failed. Prevent repeated corruption
* errors from being reported by skipping such buffers - when the corruption is
* fixed the buffer will be marked dirty again and we can write it again.
*/
static int
libxfs_bflush(
struct cache_node *node)
{
struct xfs_buf *bp = container_of(node, struct xfs_buf,
b_node);
if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
return libxfs_bwrite(bp);
return bp->b_error;
}
void
libxfs_bcache_purge(struct xfs_mount *mp)
{
if (!mp)
return;
cache_purge(mp->m_ddev_targp->bcache);
cache_purge(mp->m_logdev_targp->bcache);
cache_purge(mp->m_rtdev_targp->bcache);
}
void
libxfs_bcache_flush(struct xfs_mount *mp)
{
if (!mp)
return;
cache_flush(mp->m_ddev_targp->bcache);
cache_flush(mp->m_logdev_targp->bcache);
cache_flush(mp->m_rtdev_targp->bcache);
}
int
libxfs_bcache_overflowed(struct xfs_mount *mp)
{
return cache_overflowed(mp->m_ddev_targp->bcache) ||
cache_overflowed(mp->m_logdev_targp->bcache) ||
cache_overflowed(mp->m_rtdev_targp->bcache);
}
struct cache_operations libxfs_bcache_operations = {
.hash = libxfs_bhash,
.alloc = libxfs_balloc,
.flush = libxfs_bflush,
.relse = libxfs_brelse,
.compare = libxfs_bcompare,
.bulkrelse = libxfs_bulkrelse
};
/*
* Verify an on-disk magic value against the magic value specified in the
* verifier structure. The verifier magic is in disk byte order so the caller is
* expected to pass the value directly from disk.
*/
bool
xfs_verify_magic(
struct xfs_buf *bp,
__be32 dmagic)
{
struct xfs_mount *mp = bp->b_mount;
int idx;
idx = xfs_has_crc(mp);
if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
return false;
return dmagic == bp->b_ops->magic[idx];
}
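/*
 * A hypothetical sketch of how a read verifier might use xfs_verify_magic().
 * The ops structure carries the non-CRC and CRC magic values in disk byte
 * order; the "xfs_foo" names below are illustrative only and not part of any
 * real verifier:
 *
 *	static void
 *	xfs_foo_read_verify(struct xfs_buf *bp)
 *	{
 *		struct xfs_foo_hdr	*hdr = bp->b_addr;
 *
 *		if (!xfs_verify_magic(bp, hdr->magic))
 *			xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
 *	}
 *
 *	const struct xfs_buf_ops xfs_foo_buf_ops = {
 *		.name		= "xfs_foo",
 *		.magic		= { cpu_to_be32(XFS_FOO_MAGIC),
 *				    cpu_to_be32(XFS_FOO_CRC_MAGIC) },
 *		.verify_read	= xfs_foo_read_verify,
 *	};
 */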
/*
* Verify an on-disk magic value against the magic value specified in the
* verifier structure. The verifier magic is in disk byte order so the caller is
* expected to pass the value directly from disk.
*/
bool
xfs_verify_magic16(
struct xfs_buf *bp,
__be16 dmagic)
{
struct xfs_mount *mp = bp->b_mount;
int idx;
idx = xfs_has_crc(mp);
if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
return false;
return dmagic == bp->b_ops->magic16[idx];
}
/*
* Inode cache stubs.
*/
struct kmem_cache *xfs_inode_cache;
extern struct kmem_cache *xfs_ili_cache;
int
libxfs_iget(
struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_ino_t ino,
uint lock_flags,
struct xfs_inode **ipp)
{
struct xfs_inode *ip;
struct xfs_buf *bp;
struct xfs_perag *pag;
int error = 0;
/* reject inode numbers outside existing AGs */
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
ip = kmem_cache_zalloc(xfs_inode_cache, 0);
if (!ip)
return -ENOMEM;
VFS_I(ip)->i_count = 1;
ip->i_ino = ino;
ip->i_mount = mp;
ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
spin_lock_init(&VFS_I(ip)->i_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, 0);
xfs_perag_put(pag);
if (error)
goto out_destroy;
error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
if (error)
goto out_destroy;
error = xfs_inode_from_disk(ip,
xfs_buf_offset(bp, ip->i_imap.im_boffset));
if (!error)
xfs_buf_set_ref(bp, XFS_INO_REF);
xfs_trans_brelse(tp, bp);
if (error)
goto out_destroy;
*ipp = ip;
return 0;
out_destroy:
kmem_cache_free(xfs_inode_cache, ip);
*ipp = NULL;
return error;
}
static void
libxfs_idestroy(xfs_inode_t *ip)
{
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
libxfs_idestroy_fork(&ip->i_df);
break;
}
libxfs_ifork_zap_attr(ip);
if (ip->i_cowfp) {
libxfs_idestroy_fork(ip->i_cowfp);
kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
}
}
void
libxfs_irele(
struct xfs_inode *ip)
{
VFS_I(ip)->i_count--;
if (VFS_I(ip)->i_count == 0) {
ASSERT(ip->i_itemp == NULL);
libxfs_idestroy(ip);
kmem_cache_free(xfs_inode_cache, ip);
}
}
/*
* Flush everything dirty in the kernel and disk write caches to stable media.
* Returns 0 for success or a negative error code.
*/
int
libxfs_blkdev_issue_flush(
struct xfs_buftarg *btp)
{
int ret;
if (btp->bt_bdev == 0)
return 0;
ret = platform_flush_device(btp->bt_bdev_fd, btp->bt_bdev);
return ret ? -errno : 0;
}
/*
* Write out a buffer list synchronously.
*
* This will take the @buffer_list, write all buffers out and wait for I/O
* completion on all of the buffers. @buffer_list is consumed by the function,
* so callers must have some other way of tracking buffers if they require such
* functionality.
*/
int
xfs_buf_delwri_submit(
struct list_head *buffer_list)
{
struct xfs_buf *bp, *n;
int error = 0, error2;
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
list_del_init(&bp->b_list);
error2 = libxfs_bwrite(bp);
if (!error)
error = error2;
libxfs_buf_relse(bp);
}
return error;
}
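/*
 * A minimal sketch of driving the delwri interface above. Buffers sit on the
 * list via their b_list member, and xfs_buf_delwri_submit() consumes the list,
 * writing and releasing each buffer, so the caller's reference is handed over
 * when the buffer is queued. Callers may have a queueing helper; the raw
 * list_add_tail() shown here relies only on the b_list member used above:
 *
 *	struct list_head	buffer_list;
 *
 *	INIT_LIST_HEAD(&buffer_list);
 *	error = libxfs_buf_read(btp, blkno, numblks, 0, &bp, ops);
 *	if (error)
 *		return error;
 *	// ... modify, then transfer our reference to the list ...
 *	libxfs_buf_mark_dirty(bp);
 *	list_add_tail(&bp->b_list, &buffer_list);
 *
 *	error = xfs_buf_delwri_submit(&buffer_list);
 */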
/*
* Cancel a delayed write list.
*
* Remove each buffer from the list, clear the delwri queue flag and drop the
* associated buffer reference.
*/
void
xfs_buf_delwri_cancel(
struct list_head *list)
{
struct xfs_buf *bp;
while (!list_empty(list)) {
bp = list_first_entry(list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
libxfs_buf_relse(bp);
}
}
/*
* Format the log. The caller provides either a buftarg which is used to access
* the log via buffers or a direct pointer to a buffer that encapsulates the
* entire log.
*/
int
libxfs_log_clear(
struct xfs_buftarg *btp,
char *dptr,
xfs_daddr_t start,
uint length, /* basic blocks */
uuid_t *fs_uuid,
int version,
int sunit, /* bytes */
int fmt,
int cycle,
bool max)
{
struct xfs_buf *bp = NULL;
int len;
xfs_lsn_t lsn;
xfs_lsn_t tail_lsn;
xfs_daddr_t blk;
xfs_daddr_t end_blk;
char *ptr;
if (((btp && dptr) || (!btp && !dptr)) ||
(btp && !btp->bt_bdev) || !fs_uuid)
return -EINVAL;
/* first zero the log */
if (btp)
libxfs_device_zero(btp, start, length);
else
memset(dptr, 0, BBTOB(length));
/*
* Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
* special reset case where we only write a single record where the lsn
* and tail_lsn match. Otherwise, the record lsn starts at block 0 of
* the specified cycle and points tail_lsn at the last record of the
* previous cycle.
*/
len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
len = max(len, 2);
lsn = xlog_assign_lsn(cycle, 0);
if (cycle == XLOG_INIT_CYCLE)
tail_lsn = lsn;
else
tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
/* write out the first log record */
ptr = dptr;
if (btp) {
bp = libxfs_getbufr_uncached(btp, start, len);
ptr = bp->b_addr;
}
libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
next, bp);
if (bp) {
libxfs_buf_mark_dirty(bp);
libxfs_buf_relse(bp);
}
/*
* There's nothing else to do if this is a log reset. The kernel detects
* the rest of the log is zeroed and starts at cycle 1.
*/
if (cycle == XLOG_INIT_CYCLE)
return 0;
/*
* Bump the record size for a full log format if the caller allows it.
* This is primarily for performance reasons and most callers don't care
* about record size since the log is clean after we're done.
*/
if (max)
len = BTOBB(BDSTRAT_SIZE);
/*
* Otherwise, fill everything beyond the initial record with records of
* the previous cycle so the kernel head/tail detection works correctly.
*
* We don't particularly care about the record size or content here.
* It's only important that the headers are in place such that the
* kernel finds 1.) a clean log and 2.) the correct current cycle value.
* Therefore, bump up the record size to the max to use larger I/Os and
* improve performance.
*/
cycle--;
blk = start + len;
if (dptr)
dptr += BBTOB(len);
end_blk = start + length;
len = min(end_blk - blk, len);
while (blk < end_blk) {
lsn = xlog_assign_lsn(cycle, blk - start);
tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
ptr = dptr;
if (btp) {
bp = libxfs_getbufr_uncached(btp, blk, len);
ptr = bp->b_addr;
}
/*
* Note: pass the full buffer length as the sunit to initialize
* the entire buffer.
*/
libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
tail_lsn, next, bp);
if (bp) {
libxfs_buf_mark_dirty(bp);
libxfs_buf_relse(bp);
}
blk += len;
if (dptr)
dptr += BBTOB(len);
len = min(end_blk - blk, len);
}
return 0;
}
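/*
 * A hedged sketch of how an mkfs/repair style caller might reinitialize the
 * log with the helper above, writing a single XLOG_INIT_CYCLE record and
 * zeroing the rest. An internal log is assumed (m_logdev_targp aliases the
 * data device in that case); XLOG_FMT and the version 2 literal are taken
 * from the caller's configuration, and the exact arguments in real callers
 * may differ:
 *
 *	libxfs_log_clear(mp->m_logdev_targp, NULL,
 *			 XFS_FSB_TO_DADDR(mp, mp->m_sb.sb_logstart),
 *			 (uint)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks),
 *			 &mp->m_sb.sb_uuid, 2, mp->m_sb.sb_logsunit,
 *			 XLOG_FMT, XLOG_INIT_CYCLE, false);
 */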
int
libxfs_log_header(
char *caddr,
uuid_t *fs_uuid,
int version,
int sunit,
int fmt,
xfs_lsn_t lsn,
xfs_lsn_t tail_lsn,
libxfs_get_block_t *nextfunc,
void *private)
{
xlog_rec_header_t *head = (xlog_rec_header_t *)caddr;
char *p = caddr;
__be32 cycle_lsn;
int i, len;
int hdrs = 1;
if (lsn == NULLCOMMITLSN)
lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
if (tail_lsn == NULLCOMMITLSN)
tail_lsn = lsn;
len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
memset(p, 0, BBSIZE);
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
head->h_version = cpu_to_be32(version);
head->h_crc = cpu_to_le32(0);
head->h_prev_block = cpu_to_be32(-1);
head->h_num_logops = cpu_to_be32(1);
head->h_fmt = cpu_to_be32(fmt);
head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
head->h_lsn = cpu_to_be64(lsn);
head->h_tail_lsn = cpu_to_be64(tail_lsn);
memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
/*
* The kernel expects to see either a log record header magic value or
* the LSN cycle at the top of every log block. The first word of each
* non-header block is copied to the record headers and replaced with
* the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
* details).
*
* Even though we only ever write an unmount record (one block), we
* support writing log records up to the max log buffer size of 256k to
* improve log format performance. This means a record can require up
* to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
* data (each header supports 32k of data).
*/
cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
if (sunit % XLOG_HEADER_CYCLE_SIZE)
hdrs++;
}
/*
* A fixed number of extended headers is expected based on h_size. If
* required, format those now so the unmount record is located
* correctly.
*
* Since we only write an unmount record, we only need one h_cycle_data
* entry for the unmount record block. The subsequent record data
* blocks are zeroed, which means we can stamp them directly with the
* cycle and zero the rest of the cycle data in the extended headers.
*/
if (hdrs > 1) {
for (i = 1; i < hdrs; i++) {
p = nextfunc(p, BBSIZE, private);
memset(p, 0, BBSIZE);
/* xlog_rec_ext_header.xh_cycle */
*(__be32 *)p = cycle_lsn;
}
}
/*
* The total length is the max of the stripe unit or 2 basic block
* minimum (1 hdr blk + 1 data blk). The record length is the total
* minus however many header blocks are required.
*/
head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
/*
* Write out the unmount record, pack the first word into the record
* header and stamp the block with the cycle.
*/
p = nextfunc(p, BBSIZE, private);
unmount_record(p);
head->h_cycle_data[0] = *(__be32 *)p;
*(__be32 *)p = cycle_lsn;
/*
* Finally, zero all remaining blocks in the record and stamp each with
* the cycle. We don't need to pack any of these blocks because the
* cycle data in the headers has already been zeroed.
*/
len = max(len, hdrs + 1);
for (i = hdrs + 1; i < len; i++) {
p = nextfunc(p, BBSIZE, private);
memset(p, 0, BBSIZE);
*(__be32 *)p = cycle_lsn;
}
return BBTOB(len);
}
void
libxfs_buf_set_priority(
struct xfs_buf *bp,
int priority)
{
cache_node_set_priority(bp->b_target->bcache, &bp->b_node, priority);
}
int
libxfs_buf_priority(
struct xfs_buf *bp)
{
return cache_node_get_priority(&bp->b_node);
}
/*
* Log a message about and stale a buffer that a caller has decided is corrupt.
*
* This function should be called for the kinds of metadata corruption that
 * cannot be detected by a verifier, such as incorrect inter-block relationship
* data. Do /not/ call this function from a verifier function.
*
* The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
* be marked stale, but b_error will not be set. The caller is responsible for
* releasing the buffer or fixing it.
*/
void
__xfs_buf_mark_corrupt(
struct xfs_buf *bp,
xfs_failaddr_t fa)
{
ASSERT(bp->b_flags & XBF_DONE);
xfs_buf_corruption_error(bp, fa);
xfs_buf_stale(bp);
}
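/*
 * A minimal sketch of a caller that has spotted inter-block corruption after
 * a clean read and verify; __this_address records the callsite reported by
 * xfs_buf_corruption_error(). Releasing the buffer is left to the caller, per
 * the comment above; cross_block_check_fails() is a hypothetical check:
 *
 *	error = libxfs_buf_read(btp, blkno, numblks, 0, &bp, ops);
 *	if (error)
 *		return error;
 *	if (cross_block_check_fails(bp)) {
 *		__xfs_buf_mark_corrupt(bp, __this_address);
 *		error = -EFSCORRUPTED;
 *	}
 *	libxfs_buf_relse(bp);
 *	return error;
 */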