blob: 38b2514d71d083a3a9b8355d567bfae956ee103c [file] [log] [blame]
/*
* Network block device - make block devices work over TCP
*
* Note that you can not swap over this thing, yet. Seems to work but
* deadlocks sometimes - you can not swap over TCP in general.
*
* Copyright 1997-2000 Pavel Machek <pavel@ucw.cz>
* Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
*
* (part of code stolen from loop.c)
*
* 97-3-25 compiled 0-th version, not yet tested it
* (it did not work, BTW) (later that day) HEY! it works!
* (bit later) hmm, not that much... 2:00am next day:
* yes, it works, but it gives something like 50kB/sec
* 97-4-01 complete rewrite to make it possible for many requests at
* once to be processed
* 97-4-11 Making protocol independent of endianness etc.
* 97-9-13 Cosmetic changes
* 98-5-13 Attempt to make 64-bit-clean on 64-bit machines
* 99-1-11 Attempt to make 64-bit-clean on 32-bit machines <ankry@mif.pg.gda.pl>
* 01-2-27 Fix to store proper blockcount for kernel (calculated using
* BLOCK_SIZE_BITS, not device blocksize) <aga@permonline.ru>
* 01-3-11 Make nbd work with new Linux block layer code. It now supports
* plugging like all the other block devices. Also added in MSG_MORE to
* reduce number of partial TCP segments sent. <steve@chygwyn.com>
*
* possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
* why not: would need verify_area and friends, would share yet another
* structure with userland
*/
#define PARANOIA
#include <linux/major.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <net/sock.h>
#include <linux/devfs_fs_kernel.h>
#include <asm/segment.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#define MAJOR_NR NBD_MAJOR
#include <linux/nbd.h>
/*
 * Sanity marker: nbd_init() stores this value in every nbd_dev[].magic and
 * the PARANOIA checks compare against it to detect a corrupted table entry.
 */
#define LO_MAGIC 0x68797548
/* Per-minor block size in bytes; published through blksize_size[MAJOR_NR]. */
static int nbd_blksizes[MAX_NBD];
/* log2 of the per-minor block size; used by NBD_SET_SIZE_BLOCKS. */
static int nbd_blksize_bits[MAX_NBD];
/*
 * Per-minor device size in units of (1 << BLOCK_SIZE_BITS) bytes; published
 * through blk_size[MAJOR_NR].
 */
static int nbd_sizes[MAX_NBD];
/* Per-minor device size in bytes. */
static u64 nbd_bytesizes[MAX_NBD];
/* One state record per minor number. */
static struct nbd_device nbd_dev[MAX_NBD];
/* Handle of the "nbd" devfs directory created in nbd_init(). */
static devfs_handle_t devfs_handle;
/* Lock handed to blk_init_queue() for the driver's request queue. */
static spinlock_t nbd_lock = SPIN_LOCK_UNLOCKED;
/* Debug tracing is compiled out; swap in the printk variant below to enable. */
#define DEBUG( s )
/* #define DEBUG( s ) printk( s )
*/
#ifdef PARANOIA
/*
 * Debug counters reported by the NBD_PRINT_DEBUG ioctl. requests_in is
 * bumped in do_nbd_request(); requests_out is not modified in the visible
 * part of this file (presumably updated by a helper in <linux/nbd.h> -
 * TODO confirm).
 */
static int requests_in;
static int requests_out;
#endif
static int nbd_open(struct inode *inode, struct file *file)
{
int dev;
if (!inode)
return -EINVAL;
dev = MINOR(inode->i_rdev);
if (dev >= MAX_NBD)
return -ENODEV;
nbd_dev[dev].refcnt++;
return 0;
}
/*
 * Send or receive a packet.
 *
 * Pushes (send != 0) or pulls (send == 0) `size` bytes between the kernel
 * buffer `buf` and `sock`, re-issuing the socket call for whatever is left
 * after a partial transfer. `msg_flags` is OR-ed with MSG_NOSIGNAL on every
 * call.
 *
 * While the transfer runs, the address limit is switched to get_ds() and
 * every signal is blocked for the current task; both are restored before
 * returning.
 *
 * Returns the value of the last sock_sendmsg()/sock_recvmsg() call: a
 * positive byte count when everything was transferred, or the first
 * result <= 0 that stopped the loop.
 */
static int nbd_xmit(int send, struct socket *sock, char *buf, int size, int msg_flags)
{
	mm_segment_t saved_fs;
	sigset_t saved_mask;
	unsigned long irq_flags;
	struct msghdr msg;
	struct iovec iov;
	int result;

	saved_fs = get_fs();
	set_fs(get_ds());

	/* Block all signals so the socket call is not interrupted. */
	spin_lock_irqsave(&current->sigmask_lock, irq_flags);
	saved_mask = current->blocked;
	sigfillset(&current->blocked);
	recalc_sigpending(current);
	spin_unlock_irqrestore(&current->sigmask_lock, irq_flags);

	for (;;) {
		sock->sk->allocation = GFP_NOIO;

		/* The message header is rebuilt for every attempt. */
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		result = send ? sock_sendmsg(sock, &msg, size)
			      : sock_recvmsg(sock, &msg, size, 0);
		if (result <= 0) {
#ifdef PARANOIA
			printk(KERN_ERR "NBD: %s - sock=%ld at buf=%ld, size=%d returned %d.\n",
			       send ? "send" : "receive", (long) sock, (long) buf, size, result);
#endif
			break;
		}

		/* Advance past what was transferred and stop once done. */
		buf += result;
		size -= result;
		if (size <= 0)
			break;
	}

	/* Put the caller's signal mask and address limit back. */
	spin_lock_irqsave(&current->sigmask_lock, irq_flags);
	current->blocked = saved_mask;
	recalc_sigpending(current);
	spin_unlock_irqrestore(&current->sigmask_lock, irq_flags);

	set_fs(saved_fs);
	return result;
}
/* Log a failure together with the local `result` and jump to the function's error_out label. */
#define FAIL( s ) do { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); goto error_out; } while (0)
/*
 * Transmit one request to the server.
 *
 * Sends the control header built from `req` (magic, req->flags as the type,
 * byte offset, byte length, and the request pointer itself as the handle),
 * followed, for writes, by the data of every bio segment in the request.
 * MSG_MORE is set on every chunk except the last one so that the pieces
 * can be coalesced.
 *
 * No value is returned; on any send failure req->errors is incremented.
 */
void nbd_send_req(struct socket *sock, struct request *req)
{
	int result, rw, i, flags;
	struct nbd_request request;
	unsigned long size = req->nr_sectors << 9;

	DEBUG("NBD: sending control, ");
	/*
	 * Start from an all-zero header: only sizeof(req) bytes of the handle
	 * are filled in below, so if the handle field is wider than a pointer
	 * the remainder would otherwise carry stale stack contents onto the
	 * wire.
	 */
	memset(&request, 0, sizeof(request));
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(req->flags);
	request.from = cpu_to_be64( (u64) req->sector << 9);
	request.len = htonl(size);
	memcpy(request.handle, &req, sizeof(req));

	rw = rq_data_dir(req);
	result = nbd_xmit(1, sock, (char *) &request, sizeof(request), rw & WRITE ? MSG_MORE : 0);
	if (result <= 0)
		FAIL("Sendmsg failed for control.");

	if (rw & WRITE) {
		struct bio *bio;
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
		rq_for_each_bio(bio, req) {
			struct bio_vec *bvec;
			bio_for_each_segment(bvec, bio, i) {
				/* More data follows unless this is the last segment of the last bio. */
				flags = 0;
				if ((i < (bio->bi_vcnt - 1)) || bio->bi_next)
					flags = MSG_MORE;
				DEBUG("data, ");
				result = nbd_xmit(1, sock, page_address(bvec->bv_page) + bvec->bv_offset, bvec->bv_len, flags);
				if (result <= 0)
					FAIL("Send data failed.");
			}
		}
	}
	return;

error_out:
	req->errors++;
}
/* Log a fatal receive problem, record `result` in lo->harderror and return NULL from the caller. */
#define HARDFAIL( s ) do { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); lo->harderror = result; return NULL; } while (0)
/*
 * Read one reply from the server and match it to the oldest queued request.
 *
 * Returns the request the reply belongs to. If the server reported an
 * error, or the handle in the reply is not the one expected, the oldest
 * queued request is still returned but with req->errors incremented.
 *
 * Returns NULL when something went wrong badly enough that userspace must
 * be informed (receive failure, bad reply magic, reply with nothing
 * queued); lo->harderror then holds the value of `result` at that point.
 */
struct request *nbd_read_stat(struct nbd_device *lo)
{
	int result;
	struct nbd_reply reply;
	struct request *xreq, *req;

	DEBUG("reading control, ");
	reply.magic = 0;
	result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0)
		HARDFAIL("Recv control failed.");

	/*
	 * Validate the header before trusting anything in it: a reply with the
	 * wrong magic means the stream is not a valid reply at all, so its
	 * handle must not be used to fail an unrelated request.
	 */
	if (ntohl(reply.magic) != NBD_REPLY_MAGIC)
		HARDFAIL("Not enough magic.");

	/*
	 * With nothing queued, blkdev_entry_prev_request() would hand back a
	 * pointer computed from the list head itself rather than a real
	 * request, and the error path below would write through it.
	 */
	if (list_empty(&lo->queue_head))
		HARDFAIL("Reply received with no request pending.");

	memcpy(&xreq, reply.handle, sizeof(xreq));
	req = blkdev_entry_prev_request(&lo->queue_head);
	if (xreq != req)
		FAIL("Unexpected handle received.\n");
	DEBUG("ok, ");

	if (ntohl(reply.error))
		FAIL("Other side returned error.");

	if (rq_data_dir(req) == READ) {
		struct bio *bio = req->bio;
		DEBUG("data, ");
		/* The payload follows the header; pull it straight into each bio. */
		do {
			result = nbd_xmit(0, lo->sock, bio_data(bio), bio->bi_size, MSG_WAITALL);
			if (result <= 0)
				HARDFAIL("Recv data failed.");
			bio = bio->bi_next;
		} while(bio);
	}
	DEBUG("done.\n");
	return req;

/* Can we get here? Yes, if other side returns error */
error_out:
	req->errors++;
	return req;
}
void nbd_do_it(struct nbd_device *lo)
{
struct request *req;
down (&lo->queue_lock);
while (1) {
up (&lo->queue_lock);
req = nbd_read_stat(lo);
down (&lo->queue_lock);
if (!req) {
printk(KERN_ALERT "req should never be null\n" );
goto out;
}
#ifdef PARANOIA
if (req != blkdev_entry_prev_request(&lo->queue_head)) {
printk(KERN_ALERT "NBD: I have problem...\n");
}
if (lo != &nbd_dev[MINOR(req->rq_dev)]) {
printk(KERN_ALERT "NBD: request corrupted!\n");
continue;
}
if (lo->magic != LO_MAGIC) {
printk(KERN_ALERT "NBD: nbd_dev[] corrupted: Not enough magic\n");
goto out;
}
#endif
blkdev_dequeue_request(req);
up (&lo->queue_lock);
nbd_end_request(req);
down (&lo->queue_lock);
}
out:
up (&lo->queue_lock);
}
/*
 * Fail every request still waiting on the device queue.
 *
 * Each queued request has its error count incremented, is removed from
 * lo->queue_head and is completed with nbd_end_request().
 *
 * The caller must hold lo->queue_lock: the lock is released around each
 * nbd_end_request() call and re-acquired afterwards, so it is held again
 * when this function returns.
 *
 * With PARANOIA the function gives up early (leaving requests queued) if
 * the device magic is wrong or a queued request does not belong to `lo`.
 */
void nbd_clear_que(struct nbd_device *lo)
{
	struct request *req;

#ifdef PARANOIA
	if (lo->magic != LO_MAGIC) {
		printk(KERN_ERR "NBD: nbd_dev[] corrupted: Not enough magic when clearing!\n");
		return;
	}
#endif

	while (!list_empty(&lo->queue_head)) {
		req = blkdev_entry_prev_request(&lo->queue_head);
#ifdef PARANOIA
		if (!req) {
			printk( KERN_ALERT "NBD: panic, panic, panic\n" );
			break;
		}
		if (lo != &nbd_dev[MINOR(req->rq_dev)]) {
			printk(KERN_ALERT "NBD: request corrupted when clearing!\n");
			/*
			 * Stop here: the request is still at the tail of the
			 * list, so looping again would pick the very same entry
			 * and never terminate.
			 */
			break;
		}
#endif
		req->errors++;
		blkdev_dequeue_request(req);
		up(&lo->queue_lock);
		nbd_end_request(req);
		down(&lo->queue_lock);
	}
}
/*
* We always wait for result of write, for now. It would be nice to make it optional
* in future
* if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
* { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
*/
#undef FAIL
/* Log a per-minor failure (uses the local `dev`) and jump to the error_out label. */
#define FAIL( s ) do { printk( KERN_ERR "NBD, minor %d: " s "\n", dev ); goto error_out; } while (0)
/*
 * Request function of the nbd block queue.
 *
 * Drains the queue: each request is validated, moved onto the owning
 * device's private list (lo->queue_head) and transmitted to the server
 * with nbd_send_req(). The queue spinlock is dropped while the request is
 * being sent and re-taken before looking at the next one.
 *
 * Requests that fail validation (not a REQ_CMD request, minor out of
 * range, device not connected, write to a read-only device, bad device
 * magic) are dequeued and completed immediately with an error.
 */
static void do_nbd_request(request_queue_t * q)
{
	struct request *req;
	int dev = 0;
	struct nbd_device *lo;

	while (!QUEUE_EMPTY) {
		req = CURRENT;
#ifdef PARANOIA
		if (!req) {
			/*
			 * There is no request to fail here, so the common
			 * error path (which dereferences req) must not be
			 * taken; report the inconsistency and stop.
			 */
			printk(KERN_ERR "NBD, minor %d: que not empty but no request?\n", dev);
			return;
		}
#endif
		dev = MINOR(req->rq_dev);
#ifdef PARANOIA
		if (dev >= MAX_NBD)
			FAIL("Minor too big."); /* Probably can not happen */
#endif
		if (!(req->flags & REQ_CMD))
			goto error_out;

		lo = &nbd_dev[dev];
		if (!lo->file)
			FAIL("Request when not-ready.");
		if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_READ_ONLY))
			FAIL("Write on read-only");
#ifdef PARANOIA
		if (lo->magic != LO_MAGIC)
			FAIL("nbd[] is not magical!");
		requests_in++;
#endif
		req->errors = 0;
		blkdev_dequeue_request(req);
		spin_unlock_irq(q->queue_lock);

		/* Queue the request on the device before sending it. */
		down (&lo->queue_lock);
		list_add(&req->queuelist, &lo->queue_head);
		nbd_send_req(lo->sock, req); /* Why does this block? */
		up (&lo->queue_lock);

		spin_lock_irq(q->queue_lock);
		continue;

error_out:
		req->errors++;
		blkdev_dequeue_request(req);
		/*
		 * NOTE(review): this path uses the plain spin_unlock/spin_lock
		 * pair while the success path uses the _irq variants - confirm
		 * that the difference is intended.
		 */
		spin_unlock(q->queue_lock);
		nbd_end_request(req);
		spin_lock(q->queue_lock);
	}
	return;
}
/*
 * ioctl handler for nbd devices. Requires CAP_SYS_ADMIN.
 *
 * Commands handled:
 *   NBD_DISCONNECT      - send a REQ_SPECIAL request to the server
 *   NBD_CLEAR_SOCK      - fail queued requests and drop the socket file
 *   NBD_SET_SOCK        - attach the socket given by file descriptor `arg`
 *   NBD_SET_BLKSIZE     - set block size (power of two, 512..PAGE_SIZE)
 *   NBD_SET_SIZE        - set device size in bytes (rounded to block size)
 *   NBD_SET_SIZE_BLOCKS - set device size in blocks
 *   NBD_DO_IT           - run the reply loop; returns lo->harderror
 *   NBD_CLEAR_QUE       - fail all queued requests
 *   NBD_PRINT_DEBUG     - dump queue pointers and counters (PARANOIA only)
 *   BLKGETSIZE          - store size in 512-byte sectors at user pointer `arg`
 *   BLKGETSIZE64        - store size in bytes at user pointer `arg`
 *
 * Returns 0 or a command specific value on success, -EPERM without the
 * capability, -ENODEV for a minor outside nbd_dev[], -EBUSY when a socket
 * is already attached or requests remain queued, and -EINVAL for a missing
 * inode, bad arguments or an unknown command.
 */
static int nbd_ioctl(struct inode *inode, struct file *file,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *lo;
	int dev, error, temp;
	struct request sreq ;

	/* Anyone capable of this syscall can do *real bad* things */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!inode)
		return -EINVAL;
	dev = MINOR(inode->i_rdev);
	if (dev >= MAX_NBD)
		return -ENODEV;
	lo = &nbd_dev[dev];

	switch (cmd) {
	case NBD_DISCONNECT:
		printk("NBD_DISCONNECT\n") ;
		if (!lo->sock)
			return -EINVAL;
		/*
		 * sreq lives on the stack and nbd_send_req() reads its sector,
		 * nr_sectors and errors fields, so clear it completely instead
		 * of transmitting whatever happened to be there.
		 */
		memset(&sreq, 0, sizeof(sreq));
		sreq.flags = REQ_SPECIAL; /* FIXME: interpret as shutdown cmd */
		nbd_send_req(lo->sock, &sreq);
		return 0;

	case NBD_CLEAR_SOCK:
		down(&lo->queue_lock);
		nbd_clear_que(lo);
		if (!list_empty(&lo->queue_head)) {
			up(&lo->queue_lock);
			printk(KERN_ERR "nbd: Some requests are in progress -> can not turn off.\n");
			return -EBUSY;
		}
		up(&lo->queue_lock);
		file = lo->file;
		if (!file)
			return -EINVAL;
		lo->file = NULL;
		lo->sock = NULL;
		fput(file);
		return 0;

	case NBD_SET_SOCK:
		if (lo->file)
			return -EBUSY;
		error = -EINVAL;
		file = fget(arg);
		if (file) {
			inode = file->f_dentry->d_inode;
			/*
			 * Only a socket inode carries a valid u.socket_i;
			 * refuse anything else and give the reference back.
			 */
			if (inode->i_sock) {
				lo->file = file;
				lo->sock = &inode->u.socket_i;
				error = 0;
			} else {
				fput(file);
			}
		}
		return error;

	case NBD_SET_BLKSIZE:
		if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE))
			return -EINVAL;
		nbd_blksizes[dev] = arg;
		/* Derive log2(arg), starting from 512 == 1 << 9. */
		temp = arg >> 9;
		nbd_blksize_bits[dev] = 9;
		while (temp > 1) {
			nbd_blksize_bits[dev]++;
			temp >>= 1;
		}
		/* Keep the byte size a multiple of the new block size. */
		nbd_bytesizes[dev] &= ~(nbd_blksizes[dev]-1);
		nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
		return 0;

	case NBD_SET_SIZE:
		nbd_bytesizes[dev] = arg & ~(nbd_blksizes[dev]-1);
		nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
		return 0;

	case NBD_SET_SIZE_BLOCKS:
		nbd_bytesizes[dev] = ((u64) arg) << nbd_blksize_bits[dev];
		nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
		return 0;

	case NBD_DO_IT:
		if (!lo->file)
			return -EINVAL;
		nbd_do_it(lo);
		return lo->harderror;

	case NBD_CLEAR_QUE:
		/*
		 * nbd_clear_que() releases and re-takes queue_lock around each
		 * completion, so it has to be entered with the lock held.
		 */
		down(&lo->queue_lock);
		nbd_clear_que(lo);
		up(&lo->queue_lock);
		return 0;

#ifdef PARANOIA
	case NBD_PRINT_DEBUG:
		printk(KERN_INFO "NBD device %d: next = %p, prev = %p. Global: in %d, out %d\n",
		       dev, lo->queue_head.next, lo->queue_head.prev, requests_in, requests_out);
		return 0;
#endif

	case BLKGETSIZE:
		return put_user(nbd_bytesizes[dev] >> 9, (unsigned long *) arg);

	case BLKGETSIZE64:
		return put_user((u64)nbd_bytesizes[dev], (u64 *) arg);
	}
	return -EINVAL;
}
static int nbd_release(struct inode *inode, struct file *file)
{
struct nbd_device *lo;
int dev;
if (!inode)
return -ENODEV;
dev = MINOR(inode->i_rdev);
if (dev >= MAX_NBD)
return -ENODEV;
lo = &nbd_dev[dev];
if (lo->refcnt <= 0)
printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt);
lo->refcnt--;
/* N.B. Doesn't lo->file need an fput?? */
return 0;
}
/* Block device entry points registered for the nbd major in nbd_init(). */
static struct block_device_operations nbd_fops = {
	.owner		= THIS_MODULE,
	.open		= nbd_open,
	.release	= nbd_release,
	.ioctl		= nbd_ioctl,
};
/*
* And here should be modules and kernel interface
* (Just smiley confuses emacs :-)
*/
/*
 * Module initialisation.
 *
 * Verifies the wire size of struct nbd_request, registers the block major,
 * publishes the size tables, sets up the request queue, initialises every
 * nbd_dev[] slot with its defaults (1024-byte blocks, 0x7ffffc00 bytes)
 * and creates the devfs entries.
 *
 * Returns 0 on success, -EIO if the structure size is wrong or the major
 * number cannot be registered.
 */
static int __init nbd_init(void)
{
	int minor;

	if (sizeof(struct nbd_request) != 28) {
		printk(KERN_CRIT "Sizeof nbd_request needs to be 28 in order to work!\n" );
		return -EIO;
	}

	if (register_blkdev(MAJOR_NR, "nbd", &nbd_fops) != 0) {
		printk("Unable to get major number %d for NBD\n",
		       MAJOR_NR);
		return -EIO;
	}
#ifdef MODULE
	printk("nbd: registered device at major %d\n", MAJOR_NR);
#endif

	blksize_size[MAJOR_NR] = nbd_blksizes;
	blk_size[MAJOR_NR] = nbd_sizes;
	blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request, &nbd_lock);

	for (minor = 0; minor < MAX_NBD; minor++) {
		struct nbd_device *nbd = &nbd_dev[minor];

		nbd->refcnt = 0;
		nbd->file = NULL;
		nbd->magic = LO_MAGIC;
		nbd->flags = 0;
		INIT_LIST_HEAD(&nbd->queue_head);
		init_MUTEX(&nbd->queue_lock);

		nbd_blksizes[minor] = 1024;
		nbd_blksize_bits[minor] = 10;
		nbd_bytesizes[minor] = 0x7ffffc00; /* 2GB */
		nbd_sizes[minor] = nbd_bytesizes[minor] >> BLOCK_SIZE_BITS;

		register_disk(NULL, MKDEV(MAJOR_NR, minor), 1, &nbd_fops,
			      nbd_bytesizes[minor] >> 9);
	}

	devfs_handle = devfs_mk_dir(NULL, "nbd", NULL);
	devfs_register_series(devfs_handle, "%u", MAX_NBD,
			      DEVFS_FL_DEFAULT, MAJOR_NR, 0,
			      S_IFBLK | S_IRUSR | S_IWUSR,
			      &nbd_fops, NULL);
	return 0;
}
/*
 * Module teardown: removes the devfs directory, tears down the request
 * queue and unregisters the block major, logging whether the
 * unregistration succeeded.
 */
static void __exit nbd_cleanup(void)
{
	devfs_unregister(devfs_handle);
	blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR));

	if (unregister_blkdev(MAJOR_NR, "nbd") == 0)
		printk("nbd: module cleaned up.\n");
	else
		printk("nbd: cleanup_module failed\n");
}
/* Hook nbd_init()/nbd_cleanup() up as the module's entry and exit points. */
module_init(nbd_init);
module_exit(nbd_cleanup);
/* Module metadata. */
MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");