// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "libxfs_priv.h"
#include "init.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
/*
* Important design/architecture note:
*
* The userspace code that uses the buffer cache is much less constrained than
* the kernel code. The userspace code is pretty nasty in places, especially
* when it comes to buffer error handling. Very little of the userspace code
* outside libxfs clears bp->b_error - very little code even checks it - so the
* libxfs code is tripping on stale errors left by the userspace code.
*
* We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
* in the kernel, because those functions are used by the libxfs_readbuf_*
* functions and hence need to leave the buffers unchanged on cache hits. This
* is actually the only way to gather a write error from a libxfs_writebuf()
* call - you need to get the buffer again so you can check bp->b_error field -
* assuming that the buffer is still in the cache when you check, that is.
*
* This is very different to the kernel code which does not release buffers on a
* write so we can wait on IO and check errors. The kernel buffer cache also
* guarantees a buffer of a known initial state from xfs_buf_get() even on a
* cache hit.
*
* IOWs, userspace is behaving quite differently to the kernel and as a result
* it leaks errors from reads, invalidations and writes through
* libxfs_getbuf/libxfs_readbuf.
*
* The result of this is that until the userspace code outside libxfs is cleaned
* up, functions that release buffers from userspace control (i.e.
* libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
* propagation of stale errors into future buffer operations.
*/
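/*
 * Illustrative sketch only (not part of the build): given the behaviour
 * described above, a caller that wants to observe a write error from a
 * libxfs_writebuf() call has to re-acquire the buffer later - after the
 * cache has actually written it back - and inspect b_error itself, along
 * the lines of:
 *
 *	libxfs_writebuf(bp, 0);			releases bp, write is deferred
 *	...
 *	bp = libxfs_getbuf(btp, blkno, len);	re-acquire, if still cached
 *	if (bp->b_error)
 *		handle or clear the stale error here
 *	libxfs_putbuf(bp);
 */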
#define BDSTRAT_SIZE (256 * 1024)
#define IO_BCOMPARE_CHECK
/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
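/*
 * Zero "len" basic blocks of the device behind "btp", starting at daddr
 * "start", by repeatedly writing a zeroed, alignment-safe chunk of up to
 * BDSTRAT_SIZE bytes.
 */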
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
xfs_off_t start_offset, end_offset, offset;
ssize_t zsize, bytes;
char *z;
int fd;
zsize = min(BDSTRAT_SIZE, BBTOB(len));
if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
fprintf(stderr,
_("%s: %s can't memalign %d bytes: %s\n"),
progname, __FUNCTION__, (int)zsize, strerror(errno));
exit(1);
}
memset(z, 0, zsize);
fd = libxfs_device_to_fd(btp->dev);
start_offset = LIBXFS_BBTOOFF64(start);
if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
progname, __FUNCTION__,
(unsigned long long)start_offset, strerror(errno));
exit(1);
}
end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
for (offset = 0; offset < end_offset; ) {
bytes = min((ssize_t)(end_offset - offset), zsize);
if ((bytes = write(fd, z, bytes)) < 0) {
fprintf(stderr, _("%s: %s write failed: %s\n"),
progname, __FUNCTION__, strerror(errno));
exit(1);
} else if (bytes == 0) {
fprintf(stderr, _("%s: %s not progressing?\n"),
progname, __FUNCTION__);
exit(1);
}
offset += bytes;
}
free(z);
return 0;
}
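/*
 * Format an unmount log record into the single basic block at "p": an op
 * header flagged XLOG_UNMOUNT_TRANS followed by the XLOG_UNMOUNT_TYPE magic,
 * padded to keep the data section size aligned.
 */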
static void unmount_record(void *p)
{
xlog_op_header_t *op = (xlog_op_header_t *)p;
/* the data section must be 32 bit size aligned */
struct {
uint16_t magic;
uint16_t pad1;
uint32_t pad2; /* may as well make it 64 bits */
} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
memset(p, 0, BBSIZE);
/* dummy tid to mark this as written from userspace */
op->oh_tid = cpu_to_be32(0xb0c0d0d0);
op->oh_len = cpu_to_be32(sizeof(magic));
op->oh_clientid = XFS_LOG;
op->oh_flags = XLOG_UNMOUNT_TRANS;
op->oh_res2 = 0;
/* and the data for this op */
memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
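/*
 * libxfs_get_block_t callback used by libxfs_log_header(): step "offset"
 * bytes forward through the log buffer, aborting if that would run past the
 * end of the buffer passed in as "private".
 */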
static char *next(
char *ptr,
int offset,
void *private)
{
struct xfs_buf *buf = (struct xfs_buf *)private;
if (buf &&
(buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
abort();
return ptr + offset;
}
/*
* Format the log. The caller provides either a buftarg which is used to access
* the log via buffers or a direct pointer to a buffer that encapsulates the
* entire log.
*/
int
libxfs_log_clear(
struct xfs_buftarg *btp,
char *dptr,
xfs_daddr_t start,
uint length, /* basic blocks */
uuid_t *fs_uuid,
int version,
int sunit, /* bytes */
int fmt,
int cycle,
bool max)
{
struct xfs_buf *bp = NULL;
int len;
xfs_lsn_t lsn;
xfs_lsn_t tail_lsn;
xfs_daddr_t blk;
xfs_daddr_t end_blk;
char *ptr;
if (((btp && dptr) || (!btp && !dptr)) ||
(btp && !btp->dev) || !fs_uuid)
return -EINVAL;
/* first zero the log */
if (btp)
libxfs_device_zero(btp, start, length);
else
memset(dptr, 0, BBTOB(length));
/*
* Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
* special reset case where we only write a single record where the lsn
* and tail_lsn match. Otherwise, the record lsn starts at block 0 of
* the specified cycle and points tail_lsn at the last record of the
* previous cycle.
*/
len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
len = max(len, 2);
lsn = xlog_assign_lsn(cycle, 0);
if (cycle == XLOG_INIT_CYCLE)
tail_lsn = lsn;
else
tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
/* write out the first log record */
ptr = dptr;
if (btp) {
bp = libxfs_getbufr(btp, start, len);
ptr = bp->b_addr;
}
libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
next, bp);
if (bp) {
bp->b_flags |= LIBXFS_B_DIRTY;
libxfs_putbufr(bp);
}
/*
* There's nothing else to do if this is a log reset. The kernel detects
* the rest of the log is zeroed and starts at cycle 1.
*/
if (cycle == XLOG_INIT_CYCLE)
return 0;
/*
* Bump the record size for a full log format if the caller allows it.
* This is primarily for performance reasons and most callers don't care
* about record size since the log is clean after we're done.
*/
if (max)
len = BTOBB(BDSTRAT_SIZE);
/*
* Otherwise, fill everything beyond the initial record with records of
* the previous cycle so the kernel head/tail detection works correctly.
*
* We don't particularly care about the record size or content here.
* It's only important that the headers are in place such that the
* kernel finds 1.) a clean log and 2.) the correct current cycle value.
* Therefore, bump up the record size to the max to use larger I/Os and
* improve performance.
*/
cycle--;
blk = start + len;
if (dptr)
dptr += BBTOB(len);
end_blk = start + length;
len = min(end_blk - blk, len);
while (blk < end_blk) {
lsn = xlog_assign_lsn(cycle, blk - start);
tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
ptr = dptr;
if (btp) {
bp = libxfs_getbufr(btp, blk, len);
ptr = bp->b_addr;
}
/*
* Note: pass the full buffer length as the sunit to initialize
* the entire buffer.
*/
libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
tail_lsn, next, bp);
if (bp) {
bp->b_flags |= LIBXFS_B_DIRTY;
libxfs_putbufr(bp);
}
blk += len;
if (dptr)
dptr += BBTOB(len);
len = min(end_blk - blk, len);
}
return 0;
}
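/*
 * Format a single log record at "caddr": the record header, any extended
 * headers the record size requires, an unmount record, and cycle-stamped
 * padding blocks. Returns the formatted record length in bytes.
 */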
int
libxfs_log_header(
char *caddr,
uuid_t *fs_uuid,
int version,
int sunit,
int fmt,
xfs_lsn_t lsn,
xfs_lsn_t tail_lsn,
libxfs_get_block_t *nextfunc,
void *private)
{
xlog_rec_header_t *head = (xlog_rec_header_t *)caddr;
char *p = caddr;
__be32 cycle_lsn;
int i, len;
int hdrs = 1;
if (lsn == NULLCOMMITLSN)
lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
if (tail_lsn == NULLCOMMITLSN)
tail_lsn = lsn;
len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
memset(p, 0, BBSIZE);
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
head->h_version = cpu_to_be32(version);
head->h_crc = cpu_to_le32(0);
head->h_prev_block = cpu_to_be32(-1);
head->h_num_logops = cpu_to_be32(1);
head->h_fmt = cpu_to_be32(fmt);
head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
head->h_lsn = cpu_to_be64(lsn);
head->h_tail_lsn = cpu_to_be64(tail_lsn);
memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
/*
* The kernel expects to see either a log record header magic value or
* the LSN cycle at the top of every log block. The first word of each
* non-header block is copied to the record headers and replaced with
* the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
* details).
*
* Even though we only ever write an unmount record (one block), we
* support writing log records up to the max log buffer size of 256k to
* improve log format performance. This means a record can require up
* to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
* data (each header supports 32k of data).
*/
cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
if (sunit % XLOG_HEADER_CYCLE_SIZE)
hdrs++;
}
/*
* A fixed number of extended headers is expected based on h_size. If
* required, format those now so the unmount record is located
* correctly.
*
* Since we only write an unmount record, we only need one h_cycle_data
* entry for the unmount record block. The subsequent record data
* blocks are zeroed, which means we can stamp them directly with the
* cycle and zero the rest of the cycle data in the extended headers.
*/
if (hdrs > 1) {
for (i = 1; i < hdrs; i++) {
p = nextfunc(p, BBSIZE, private);
memset(p, 0, BBSIZE);
/* xlog_rec_ext_header.xh_cycle */
*(__be32 *)p = cycle_lsn;
}
}
/*
* The total length is the max of the stripe unit or 2 basic block
* minimum (1 hdr blk + 1 data blk). The record length is the total
* minus however many header blocks are required.
*/
head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
/*
* Write out the unmount record, pack the first word into the record
* header and stamp the block with the cycle.
*/
p = nextfunc(p, BBSIZE, private);
unmount_record(p);
head->h_cycle_data[0] = *(__be32 *)p;
*(__be32 *)p = cycle_lsn;
/*
* Finally, zero all remaining blocks in the record and stamp each with
* the cycle. We don't need to pack any of these blocks because the
* cycle data in the headers has already been zeroed.
*/
len = max(len, hdrs + 1);
for (i = hdrs + 1; i < len; i++) {
p = nextfunc(p, BBSIZE, private);
memset(p, 0, BBSIZE);
*(__be32 *)p = cycle_lsn;
}
return BBTOB(len);
}
/*
* Simple I/O (buffer cache) interface
*/
#ifdef XFS_BUF_TRACING
#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf
xfs_buf_t *libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
const struct xfs_buf_ops *);
xfs_buf_t *libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
int, int, const struct xfs_buf_ops *);
int libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t *libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t *libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
int, int);
xfs_buf_t *libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
unsigned int);
void libxfs_putbuf (xfs_buf_t *);
#define __add_trace(bp, func, file, line) \
do { \
if (bp) { \
(bp)->b_func = (func); \
(bp)->b_file = (file); \
(bp)->b_line = (line); \
} \
} while (0)
xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
const struct xfs_buf_ops *ops)
{
xfs_buf_t *bp = libxfs_readbuf(btp, blkno, len, flags, ops);
__add_trace(bp, func, file, line);
return bp;
}
xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, int flags,
const struct xfs_buf_ops *ops)
{
xfs_buf_t *bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
__add_trace(bp, func, file, line);
return bp;
}
int
libxfs_trace_writebuf(const char *func, const char *file, int line, xfs_buf_t *bp, int flags)
{
__add_trace(bp, func, file, line);
return libxfs_writebuf(bp, flags);
}
xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
xfs_buf_t *bp = libxfs_getbuf(btp, blkno, len);
__add_trace(bp, func, file, line);
return bp;
}
xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
int flags)
{
xfs_buf_t *bp = libxfs_getbuf_map(btp, map, nmaps, flags);
__add_trace(bp, func, file, line);
return bp;
}
xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, unsigned int flags)
{
xfs_buf_t *bp = libxfs_getbuf_flags(btp, blkno, len, flags);
__add_trace(bp, func, file, line);
return bp;
}
void
libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
{
__add_trace(bp, func, file, line);
libxfs_putbuf(bp);
}
#endif
xfs_buf_t *
libxfs_getsb(xfs_mount_t *mp, int flags)
{
return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
}
kmem_zone_t *xfs_buf_zone;
static struct cache_mru xfs_buf_freelist =
{{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
0, PTHREAD_MUTEX_INITIALIZER };
/*
* The bufkey is used to pass the new buffer information to the cache object
* allocation routine. Because discontiguous buffers need to pass different
* information, we need fields to pass that information. However, because the
* blkno and bblen are needed for the initial cache entry lookup (i.e. for
* bcompare), the allocation routine uses a non-null map/nmaps pair to switch
* to discontiguous buffer initialisation instead of a contiguous buffer.
*/
struct xfs_bufkey {
struct xfs_buftarg *buftarg;
xfs_daddr_t blkno;
unsigned int bblen;
struct xfs_buf_map *map;
int nmaps;
};
/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE 64
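/*
 * Hash a buffer cache key into the hash table by mixing the disk block
 * number with GOLDEN_RATIO_PRIME.
 */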
static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
uint64_t hashval = ((struct xfs_bufkey *)key)->blkno;
uint64_t tmp;
tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
return tmp % hashsize;
}
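/*
 * Buffer cache comparison callback: a node matches when device and block
 * number agree. If only the length differs, CACHE_PURGE is returned so the
 * mismatched entry is not reused as-is.
 */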
static int
libxfs_bcompare(struct cache_node *node, cache_key_t key)
{
struct xfs_buf *bp = (struct xfs_buf *)node;
struct xfs_bufkey *bkey = (struct xfs_bufkey *)key;
if (bp->b_target->dev == bkey->buftarg->dev &&
bp->b_bn == bkey->blkno) {
if (bp->b_bcount == BBTOB(bkey->bblen))
return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
fprintf(stderr,
"%lx: Badness in key lookup (length)\n"
"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
pthread_self(),
(unsigned long long)bp->b_bn, (int)bp->b_bcount,
(unsigned long long)bkey->blkno,
BBTOB(bkey->bblen));
}
#endif
return CACHE_PURGE;
}
return CACHE_MISS;
}
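/*
 * (Re)initialise a buffer: reset its state, (re)allocate an aligned data
 * area of "bytes" bytes if none is attached, zero it, and fall back to the
 * inline single-extent map unless a discontiguous map is already set up.
 */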
static void
__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
unsigned int bytes)
{
bp->b_flags = 0;
bp->b_bn = bno;
bp->b_bcount = bytes;
bp->b_length = BTOBB(bytes);
bp->b_target = btp;
bp->b_error = 0;
if (!bp->b_addr)
bp->b_addr = memalign(libxfs_device_alignment(), bytes);
if (!bp->b_addr) {
fprintf(stderr,
_("%s: %s can't memalign %u bytes: %s\n"),
progname, __FUNCTION__, bytes,
strerror(errno));
exit(1);
}
memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
list_head_init(&bp->b_lock_list);
#endif
pthread_mutex_init(&bp->b_lock, NULL);
bp->b_holder = 0;
bp->b_recur = 0;
bp->b_ops = NULL;
if (!bp->b_maps) {
bp->b_nmaps = 1;
bp->b_maps = &bp->__b_map;
bp->b_maps[0].bm_bn = bp->b_bn;
bp->b_maps[0].bm_len = bp->b_length;
}
}
static void
libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
unsigned int bytes)
{
__initbuf(bp, btp, bno, bytes);
}
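/*
 * Initialise a discontiguous buffer: allocate and copy the extent map, size
 * the data area to cover all extents, then initialise it as usual and mark
 * it LIBXFS_B_DISCONTIG.
 */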
static void
libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
struct xfs_buf_map *map, int nmaps)
{
unsigned int bytes = 0;
int i;
bytes = sizeof(struct xfs_buf_map) * nmaps;
bp->b_maps = malloc(bytes);
if (!bp->b_maps) {
fprintf(stderr,
_("%s: %s can't malloc %u bytes: %s\n"),
progname, __FUNCTION__, bytes,
strerror(errno));
exit(1);
}
bp->b_nmaps = nmaps;
bytes = 0;
for (i = 0; i < nmaps; i++) {
bp->b_maps[i].bm_bn = map[i].bm_bn;
bp->b_maps[i].bm_len = map[i].bm_len;
bytes += BBTOB(map[i].bm_len);
}
__initbuf(bp, btp, map[0].bm_bn, bytes);
bp->b_flags |= LIBXFS_B_DISCONTIG;
}
static xfs_buf_t *
__libxfs_getbufr(int blen)
{
xfs_buf_t *bp;
/*
* First look for a buffer of matching size that can be used as-is. If
* none is found, take a buffer from the free list, free its data area
* and set b_addr to NULL before calling libxfs_initbuf so a correctly
* sized data area gets allocated.
*/
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
if (!list_empty(&xfs_buf_freelist.cm_list)) {
list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
if (bp->b_bcount == blen) {
list_del_init(&bp->b_node.cn_mru);
break;
}
}
if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
bp = list_entry(xfs_buf_freelist.cm_list.next,
xfs_buf_t, b_node.cn_mru);
list_del_init(&bp->b_node.cn_mru);
free(bp->b_addr);
bp->b_addr = NULL;
if (bp->b_maps != &bp->__b_map)
free(bp->b_maps);
bp->b_maps = NULL;
}
} else
bp = kmem_zone_zalloc(xfs_buf_zone, 0);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
bp->b_ops = NULL;
if (bp->b_flags & LIBXFS_B_DIRTY)
fprintf(stderr, "found dirty buffer on free list!\n");
return bp;
}
xfs_buf_t *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
xfs_buf_t *bp;
int blen = BBTOB(bblen);
bp = __libxfs_getbufr(blen);
if (bp)
libxfs_initbuf(bp, btp, blkno, blen);
#ifdef IO_DEBUG
printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
pthread_self(), __FUNCTION__, blen,
(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
return bp;
}
static xfs_buf_t *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
struct xfs_buf_map *map, int nmaps)
{
xfs_buf_t *bp;
int blen = BBTOB(bblen);
if (!map || !nmaps) {
fprintf(stderr,
_("%s: %s invalid map %p or nmaps %d\n"),
progname, __FUNCTION__, map, nmaps);
exit(1);
}
if (blkno != map[0].bm_bn) {
fprintf(stderr,
_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
progname, __FUNCTION__, (long long)map[0].bm_bn,
(long long)blkno);
exit(1);
}
bp = __libxfs_getbufr(blen);
if (bp)
libxfs_initbuf_map(bp, btp, map, nmaps);
#ifdef IO_DEBUG
printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
pthread_self(), __FUNCTION__, blen,
(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
return bp;
}
#ifdef XFS_BUF_TRACING
struct list_head lock_buf_list = {&lock_buf_list, &lock_buf_list};
int lock_buf_count = 0;
#endif
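/*
 * Look up (or have the cache allocate) the buffer for "key", optionally
 * taking the buffer lock. Recursive locking by the same thread is tolerated
 * with a warning, and LIBXFS_GETBUF_TRYLOCK callers get NULL back instead of
 * blocking on a contended lock.
 */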
static struct xfs_buf *
__cache_lookup(struct xfs_bufkey *key, unsigned int flags)
{
struct xfs_buf *bp;
cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
if (!bp)
return NULL;
if (use_xfs_buf_lock) {
int ret;
ret = pthread_mutex_trylock(&bp->b_lock);
if (ret) {
ASSERT(ret == EAGAIN);
if (flags & LIBXFS_GETBUF_TRYLOCK)
goto out_put;
if (pthread_equal(bp->b_holder, pthread_self())) {
fprintf(stderr,
_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
key->blkno);
bp->b_recur++;
return bp;
} else {
pthread_mutex_lock(&bp->b_lock);
}
}
bp->b_holder = pthread_self();
}
cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
cache_node_get_priority((struct cache_node *)bp) -
CACHE_PREFETCH_PRIORITY);
#ifdef XFS_BUF_TRACING
pthread_mutex_lock(&libxfs_bcache->c_mutex);
lock_buf_count++;
list_add(&bp->b_lock_list, &lock_buf_list);
pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
#ifdef IO_DEBUG
printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
pthread_self(), __FUNCTION__,
bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
#endif
return bp;
out_put:
cache_node_put(libxfs_bcache, (struct cache_node *)bp);
return NULL;
}
struct xfs_buf *
libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
unsigned int flags)
{
struct xfs_bufkey key = {NULL};
key.buftarg = btp;
key.blkno = blkno;
key.bblen = len;
return __cache_lookup(&key, flags);
}
/*
* Clean the buffer flags for libxfs_getbuf*(), which wants to return
* an unused buffer with clean state. This prevents CRC errors on a
* re-read of a corrupt block that was prefetched and freed. This
* can happen with a massively corrupt directory that is discarded,
* but whose blocks are then recycled into expanding lost+found.
*
* Note however that if the buffer's dirty (prefetch calls getbuf)
* we'll leave the state alone because we don't want to discard blocks
* that have been fixed.
*/
static void
reset_buf_state(
struct xfs_buf *bp)
{
if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
LIBXFS_B_UPTODATE);
}
struct xfs_buf *
libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
struct xfs_buf *bp;
bp = libxfs_getbuf_flags(btp, blkno, len, 0);
reset_buf_state(bp);
return bp;
}
static struct xfs_buf *
__libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
int nmaps, int flags)
{
struct xfs_bufkey key = {NULL};
int i;
if (nmaps == 1)
return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
flags);
key.buftarg = btp;
key.blkno = map[0].bm_bn;
for (i = 0; i < nmaps; i++) {
key.bblen += map[i].bm_len;
}
key.map = map;
key.nmaps = nmaps;
return __cache_lookup(&key, flags);
}
struct xfs_buf *
libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
int nmaps, int flags)
{
struct xfs_buf *bp;
bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
reset_buf_state(bp);
return bp;
}
void
libxfs_putbuf(xfs_buf_t *bp)
{
/*
* ensure that any errors on this use of the buffer don't carry
* over to the next user.
*/
bp->b_error = 0;
#ifdef XFS_BUF_TRACING
pthread_mutex_lock(&libxfs_bcache->c_mutex);
lock_buf_count--;
ASSERT(lock_buf_count >= 0);
list_del_init(&bp->b_lock_list);
pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
if (use_xfs_buf_lock) {
if (bp->b_recur) {
bp->b_recur--;
} else {
bp->b_holder = 0;
pthread_mutex_unlock(&bp->b_lock);
}
}
cache_node_put(libxfs_bcache, (struct cache_node *)bp);
}
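/*
 * Remove a buffer from the buffer cache entirely, rather than just dropping
 * a reference to it.
 */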
void
libxfs_purgebuf(xfs_buf_t *bp)
{
struct xfs_bufkey key = {NULL};
key.buftarg = bp->b_target;
key.blkno = bp->b_bn;
key.bblen = bp->b_length;
cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
}
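/*
 * Cache allocation callback: instantiate a new buffer for the given lookup
 * key, using the discontiguous variant when an extent map was supplied.
 */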
static struct cache_node *
libxfs_balloc(cache_key_t key)
{
struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;
if (bufkey->map)
return (struct cache_node *)
libxfs_getbufr_map(bufkey->buftarg,
bufkey->blkno, bufkey->bblen,
bufkey->map, bufkey->nmaps);
return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
bufkey->blkno, bufkey->bblen);
}
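/*
 * pread() wrapper: report errors and short reads, exiting immediately if the
 * caller passed LIBXFS_EXIT_ON_FAILURE, otherwise returning a negative error
 * code.
 */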
static int
__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
int sts;
sts = pread(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
fprintf(stderr, _("%s: read failed: %s\n"),
progname, strerror(error));
if (flags & LIBXFS_EXIT_ON_FAILURE)
exit(1);
return -error;
} else if (sts != len) {
fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
progname, sts, len);
if (flags & LIBXFS_EXIT_ON_FAILURE)
exit(1);
return -EIO;
}
return 0;
}
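/*
 * Read "len" basic blocks at "blkno" into an already-instantiated buffer.
 * The buffer is only marked up to date if the read succeeds and the buffer
 * identity matches the request.
 */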
int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
int len, int flags)
{
int fd = libxfs_device_to_fd(btp->dev);
int bytes = BBTOB(len);
int error;
ASSERT(BBTOB(len) <= bp->b_bcount);
error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
if (!error &&
bp->b_target->dev == btp->dev &&
bp->b_bn == blkno &&
bp->b_bcount == bytes)
bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
pthread_self(), __FUNCTION__, bytes, error,
(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
return error;
}
void
libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
{
if (!ops)
return;
bp->b_ops = ops;
bp->b_ops->verify_read(bp);
bp->b_flags &= ~LIBXFS_B_UNCHECKED;
}
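/*
 * Return a buffer containing verified data for "blkno". Cached buffers that
 * are up to date or dirty are re-verified only if still unchecked; otherwise
 * the buffer is read from disk and run through the read verifier.
 */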
xfs_buf_t *
libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
const struct xfs_buf_ops *ops)
{
xfs_buf_t *bp;
int error;
bp = libxfs_getbuf_flags(btp, blkno, len, 0);
if (!bp)
return NULL;
/*
* If the buffer was prefetched, it is likely that it was not validated.
* Hence if we are supplied an ops function and the buffer is marked as
* unchecked, we need to validate it now.
*
* We do this verification even if the buffer is dirty - the
* verification is almost certainly going to fail the CRC check in this
* case as a dirty buffer has not had the CRC recalculated. However, we
* should not be dirtying unchecked buffers in the first place, so a buffer
* failing here because it is both dirty and unchecked indicates we've
* screwed up somewhere else.
*/
bp->b_error = 0;
if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
if (bp->b_flags & LIBXFS_B_UNCHECKED)
libxfs_readbuf_verify(bp, ops);
return bp;
}
/*
* Set the ops on a cache miss (i.e. first physical read) as the
* verifier may change the ops to match the type of buffer it contains.
* A cache hit might reset the verifier to the original type if we set
* it again, but it won't get called again and set to match the buffer
* contents. *cough* xfs_da_node_buf_ops *cough*.
*/
error = libxfs_readbufr(btp, blkno, bp, len, flags);
if (error)
bp->b_error = error;
else
libxfs_readbuf_verify(bp, ops);
return bp;
}
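/*
 * Read each extent of a discontiguous buffer in turn; the buffer is marked
 * up to date only if every extent read succeeds.
 */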
int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
int fd;
int error = 0;
void *buf;
int i;
fd = libxfs_device_to_fd(btp->dev);
buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
int len = BBTOB(bp->b_maps[i].bm_len);
error = __read_buf(fd, buf, len, offset, flags);
if (error) {
bp->b_error = error;
break;
}
buf += len;
}
if (!error)
bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
return error;
}
struct xfs_buf *
libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
int flags, const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
int error = 0;
if (nmaps == 1)
return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
flags, ops);
bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
if (!bp)
return NULL;
bp->b_error = 0;
if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
if (bp->b_flags & LIBXFS_B_UNCHECKED)
libxfs_readbuf_verify(bp, ops);
return bp;
}
error = libxfs_readbufr_map(btp, bp, flags);
if (!error)
libxfs_readbuf_verify(bp, ops);
#ifdef IO_DEBUGX
printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
return bp;
}
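/*
 * pwrite() wrapper: report errors and short writes, exiting immediately if
 * LIBXFS_B_EXIT is set in the flags, otherwise returning a negative error
 * code.
 */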
static int
__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
int sts;
sts = pwrite(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
fprintf(stderr, _("%s: pwrite failed: %s\n"),
progname, strerror(error));
if (flags & LIBXFS_B_EXIT)
exit(1);
return -error;
} else if (sts != len) {
fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
progname, sts, len);
if (flags & LIBXFS_B_EXIT)
exit(1);
return -EIO;
}
return 0;
}
int
libxfs_writebufr(xfs_buf_t *bp)
{
int fd = libxfs_device_to_fd(bp->b_target->dev);
/*
* we never write buffers that are marked stale. This indicates they
* contain data that has been invalidated, and even if the buffer is
* dirty it must *never* be written. Verifiers are wonderful for finding
* bugs like this. Make sure the error is obvious as to the cause.
*/
if (bp->b_flags & LIBXFS_B_STALE) {
bp->b_error = -ESTALE;
return bp->b_error;
}
/*
* clear any pre-existing error status on the buffer. This can occur if
* the buffer is corrupt on disk and the repair process doesn't clear
* the error before fixing and writing it back.
*/
bp->b_error = 0;
if (bp->b_ops) {
bp->b_ops->verify_write(bp);
if (bp->b_error) {
fprintf(stderr,
_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
__func__, bp->b_ops->name,
(long long)bp->b_bn, bp->b_bcount);
return bp->b_error;
}
}
if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
} else {
int i;
void *buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
int len = BBTOB(bp->b_maps[i].bm_len);
bp->b_error = __write_buf(fd, buf, len, offset,
bp->b_flags);
if (bp->b_error)
break;
buf += len;
}
}
#ifdef IO_DEBUG
printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
pthread_self(), __FUNCTION__, bp->b_bcount,
(long long)LIBXFS_BBTOOFF64(bp->b_bn),
(long long)bp->b_bn, bp, bp->b_error);
#endif
if (!bp->b_error) {
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
LIBXFS_B_UNCHECKED);
}
return bp->b_error;
}
int
libxfs_writebuf_int(xfs_buf_t *bp, int flags)
{
/*
* Clear any error hanging over from reading the buffer. This prevents
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= (LIBXFS_B_DIRTY | flags);
return 0;
}
int
libxfs_writebuf(xfs_buf_t *bp, int flags)
{
#ifdef IO_DEBUG
printf("%lx: %s: dirty blkno=%llu(%llu)\n",
pthread_self(), __FUNCTION__,
(long long)LIBXFS_BBTOOFF64(bp->b_bn),
(long long)bp->b_bn);
#endif
/*
* Clear any error hanging over from reading the buffer. This prevents
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= (LIBXFS_B_DIRTY | flags);
libxfs_putbuf(bp);
return 0;
}
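/*
 * Zero, copy out of, or copy into a byte range of the buffer's data area
 * according to "flags".
 */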
void
libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
{
#ifdef IO_DEBUG
if (boff + len > bp->b_bcount) {
printf("Badness, iomove out of range!\n"
"bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
(long long)bp->b_bn, bp->b_bcount, boff, len);
abort();
}
#endif
switch (flags) {
case LIBXFS_BZERO:
memset(bp->b_addr + boff, 0, len);
break;
case LIBXFS_BREAD:
memcpy(data, bp->b_addr + boff, len);
break;
case LIBXFS_BWRITE:
memcpy(bp->b_addr + boff, data, len);
break;
}
}
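/*
 * Cache release callback: warn about dirty buffers, then stash the released
 * buffer on the local free list for later reuse instead of freeing its
 * memory immediately.
 */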
static void
libxfs_brelse(
struct cache_node *node)
{
struct xfs_buf *bp = (struct xfs_buf *)node;
if (!bp)
return;
if (bp->b_flags & LIBXFS_B_DIRTY)
fprintf(stderr,
"releasing dirty buffer to free list!\n");
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
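/*
 * Cache bulk-release callback: warn about any dirty buffers, then splice the
 * whole reclaimed list onto the free list and return the number of buffers
 * released.
 */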
static unsigned int
libxfs_bulkrelse(
struct cache *cache,
struct list_head *list)
{
xfs_buf_t *bp;
int count = 0;
if (list_empty(list))
return 0;
list_for_each_entry(bp, list, b_node.cn_mru) {
if (bp->b_flags & LIBXFS_B_DIRTY)
fprintf(stderr,
"releasing dirty buffer (bulk) to free list!\n");
count++;
}
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
list_splice(list, &xfs_buf_freelist.cm_list);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
return count;
}
/*
* Free everything from the xfs_buf_freelist MRU, used at final teardown
*/
void
libxfs_bcache_free(void)
{
struct list_head *cm_list;
xfs_buf_t *bp, *next;
cm_list = &xfs_buf_freelist.cm_list;
list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
free(bp->b_addr);
if (bp->b_maps != &bp->__b_map)
free(bp->b_maps);
kmem_zone_free(xfs_buf_zone, bp);
}
}
/*
* When a buffer is marked dirty, the error is cleared. Hence if we are trying
* to flush a buffer prior to cache reclaim that has an error on it, it means
* we've already tried to flush it and it failed. Prevent repeated corruption
* errors from being reported by skipping such buffers - when the corruption is
* fixed the buffer will be marked dirty again and we can write it again.
*/
static int
libxfs_bflush(
struct cache_node *node)
{
struct xfs_buf *bp = (struct xfs_buf *)node;
if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
return libxfs_writebufr(bp);
return bp->b_error;
}
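/*
 * Release a buffer obtained directly through libxfs_getbufr(): write it back
 * if it is dirty, then return it to the free list.
 */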
void
libxfs_putbufr(xfs_buf_t *bp)
{
if (bp->b_flags & LIBXFS_B_DIRTY)
libxfs_writebufr(bp);
libxfs_brelse((struct cache_node *)bp);
}
void
libxfs_bcache_purge(void)
{
cache_purge(libxfs_bcache);
}
void
libxfs_bcache_flush(void)
{
cache_flush(libxfs_bcache);
}
int
libxfs_bcache_overflowed(void)
{
return cache_overflowed(libxfs_bcache);
}
struct cache_operations libxfs_bcache_operations = {
.hash = libxfs_bhash,
.alloc = libxfs_balloc,
.flush = libxfs_bflush,
.relse = libxfs_brelse,
.compare = libxfs_bcompare,
.bulkrelse = libxfs_bulkrelse
};
/*
* Inode cache stubs.
*/
kmem_zone_t *xfs_inode_zone;
extern kmem_zone_t *xfs_ili_zone;
/*
* If there are inline format data / attr forks attached to this inode,
* make sure they're not corrupt.
*/
bool
libxfs_inode_verify_forks(
struct xfs_inode *ip,
struct xfs_ifork_ops *ops)
{
struct xfs_ifork *ifp;
xfs_failaddr_t fa;
if (!ops)
return true;
fa = xfs_ifork_verify_data(ip, ops);
if (fa) {
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
ifp->if_u1.if_data, ifp->if_bytes, fa);
return false;
}
fa = xfs_ifork_verify_attr(ip, ops);
if (fa) {
ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
ifp ? ifp->if_u1.if_data : NULL,
ifp ? ifp->if_bytes : 0, fa);
return false;
}
return true;
}
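/*
 * "Get" an in-core inode: allocate a fresh inode, read it from disk and
 * verify its forks. There is no real inode cache here, so every call
 * instantiates a new inode.
 */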
int
libxfs_iget(
struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_ino_t ino,
uint lock_flags,
struct xfs_inode **ipp,
struct xfs_ifork_ops *ifork_ops)
{
struct xfs_inode *ip;
int error = 0;
ip = kmem_zone_zalloc(xfs_inode_zone, 0);
if (!ip)
return -ENOMEM;
ip->i_ino = ino;
ip->i_mount = mp;
error = xfs_iread(mp, tp, ip, 0);
if (error) {
kmem_zone_free(xfs_inode_zone, ip);
*ipp = NULL;
return error;
}
if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
libxfs_irele(ip);
return -EFSCORRUPTED;
}
/*
* set up the inode ops structure that the libxfs code relies on
*/
if (XFS_ISDIR(ip))
ip->d_ops = mp->m_dir_inode_ops;
else
ip->d_ops = mp->m_nondir_inode_ops;
*ipp = ip;
return 0;
}
static void
libxfs_idestroy(xfs_inode_t *ip)
{
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
libxfs_idestroy_fork(ip, XFS_DATA_FORK);
break;
}
if (ip->i_afp)
libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
if (ip->i_cowfp)
xfs_idestroy_fork(ip, XFS_COW_FORK);
}
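/*
 * Release an inode obtained from libxfs_iget(): free any attached log item,
 * tear down the forks and free the inode itself.
 */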
void
libxfs_irele(
struct xfs_inode *ip)
{
if (ip->i_itemp)
kmem_zone_free(xfs_ili_zone, ip->i_itemp);
ip->i_itemp = NULL;
libxfs_idestroy(ip);
kmem_zone_free(xfs_inode_zone, ip);
}