blob: c79ba85329800bd08267a72f9935215782a0b668 [file] [log] [blame]
/*
* Network block device - make block devices work over TCP
*
* Note that you can not swap over this thing, yet. Seems to work but
* deadlocks sometimes - you can not swap over TCP in general.
*
* Copyright 1997-2000 Pavel Machek <pavel@ucw.cz>
* Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
*
* (part of code stolen from loop.c)
*
* 97-3-25 compiled 0-th version, not yet tested it
* (it did not work, BTW) (later that day) HEY! it works!
* (bit later) hmm, not that much... 2:00am next day:
* yes, it works, but it gives something like 50kB/sec
* 97-4-01 complete rewrite to make it possible for many requests at
* once to be processed
* 97-4-11 Making protocol independent of endianness etc.
* 97-9-13 Cosmetic changes
* 98-5-13 Attempt to make 64-bit-clean on 64-bit machines
* 99-1-11 Attempt to make 64-bit-clean on 32-bit machines <ankry@mif.pg.gda.pl>
* 01-2-27 Fix to store proper blockcount for kernel (calculated using
* BLOCK_SIZE_BITS, not device blocksize) <aga@permonline.ru>
* 01-3-11 Make nbd work with new Linux block layer code. It now supports
* plugging like all the other block devices. Also added in MSG_MORE to
* reduce number of partial TCP segments sent. <steve@chygwyn.com>
* 01-12-6 Fix deadlock condition by making queue locks independent of
* the transmit lock. <steve@chygwyn.com>
* 02-10-11 Allow hung xmit to be aborted via SIGKILL & various fixes.
* <Paul.Clements@SteelEye.com> <James.Bottomley@SteelEye.com>
*
* possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
* why not: would need verify_area and friends, would share yet another
* structure with userland
*/
#define PARANOIA
#include <linux/major.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <net/sock.h>
#include <linux/devfs_fs_kernel.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#define MAJOR_NR NBD_MAJOR
#include <linux/nbd.h>
/* Sanity marker stored in nbd_dev[i].magic at init time and re-checked by
 * the PARANOIA tests before a device slot is trusted. */
#define LO_MAGIC 0x68797548
/* Per-minor block size in bytes; installed as blksize_size[MAJOR_NR]. */
static int nbd_blksizes[MAX_NBD];
/* Per-minor log2 of the block size (NBD_SET_BLKSIZE keeps it in step). */
static int nbd_blksize_bits[MAX_NBD];
/* Per-minor device size in units of (1 << BLOCK_SIZE_BITS) bytes;
 * installed as blk_size[MAJOR_NR]. */
static int nbd_sizes[MAX_NBD];
/* Per-minor device size in bytes; the value the size ioctls report from. */
static u64 nbd_bytesizes[MAX_NBD];
/* One state record per minor number. */
static struct nbd_device nbd_dev[MAX_NBD];
/* Handle of the devfs "nbd" directory created in nbd_init(). */
static devfs_handle_t devfs_handle;
/* Debug tracing is compiled out: DEBUG() expands to nothing. */
#define DEBUG( s )
/* To enable tracing, use this definition instead:
 * #define DEBUG( s ) printk( s )
*/
#ifdef PARANOIA
/* Global counters reported by NBD_PRINT_DEBUG: requests accepted by
 * do_nbd_request() and requests completed by nbd_end_request(). */
static int requests_in;
static int requests_out;
#endif
/*
 * Complete a request.
 *
 * Every buffer head still chained on the request is unlinked and handed
 * to its b_end_io handler, flagged up to date only when req->errors is
 * zero; the request structure is then released.  io_request_lock is
 * taken here, and the callers in this file drop it before calling.
 */
static void
nbd_end_request(struct request *req)
{
	struct buffer_head *cur;
	unsigned sectors;
	unsigned long irqflags;
	int ok = !req->errors;

#ifdef PARANOIA
	requests_out++;
#endif
	spin_lock_irqsave(&io_request_lock, irqflags);
	for (cur = req->bh; cur != NULL; cur = req->bh) {
		sectors = cur->b_size >> 9;
		blk_finished_io(sectors);
		/* detach the buffer before its completion handler runs */
		req->bh = cur->b_reqnext;
		cur->b_reqnext = NULL;
		cur->b_end_io(cur, ok);
	}
	blkdev_release_request(req);
	spin_unlock_irqrestore(&io_request_lock, irqflags);
}
static int nbd_open(struct inode *inode, struct file *file)
{
int dev;
if (!inode)
return -EINVAL;
dev = MINOR(inode->i_rdev);
if (dev >= MAX_NBD)
return -ENODEV;
nbd_dev[dev].refcnt++;
return 0;
}
/*
* Send or receive packet.
*/
static int nbd_xmit(int send, struct socket *sock, char *buf, int size, int msg_flags)
{
mm_segment_t oldfs;
int result;
struct msghdr msg;
struct iovec iov;
unsigned long flags;
sigset_t oldset;
oldfs = get_fs();
set_fs(get_ds());
/* Allow interception of SIGKILL only
* Don't allow other signals to interrupt the transmission */
spin_lock_irqsave(&current->sigmask_lock, flags);
oldset = current->blocked;
sigfillset(&current->blocked);
sigdelsetmask(&current->blocked, sigmask(SIGKILL));
recalc_sigpending(current);
spin_unlock_irqrestore(&current->sigmask_lock, flags);
do {
sock->sk->allocation = GFP_NOIO;
iov.iov_base = buf;
iov.iov_len = size;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
if (send)
result = sock_sendmsg(sock, &msg, size);
else
result = sock_recvmsg(sock, &msg, size, 0);
if (signal_pending(current)) {
siginfo_t info;
spin_lock_irqsave(&current->sigmask_lock, flags);
printk(KERN_WARNING "NBD (pid %d: %s) got signal %d\n",
current->pid, current->comm,
dequeue_signal(&current->blocked, &info));
spin_unlock_irqrestore(&current->sigmask_lock, flags);
result = -EINTR;
break;
}
if (result <= 0) {
#ifdef PARANOIA
printk(KERN_ERR "NBD: %s - sock=%ld at buf=%ld, size=%d returned %d.\n",
send ? "send" : "receive", (long) sock, (long) buf, size, result);
#endif
break;
}
size -= result;
buf += result;
} while (size > 0);
spin_lock_irqsave(&current->sigmask_lock, flags);
current->blocked = oldset;
recalc_sigpending(current);
spin_unlock_irqrestore(&current->sigmask_lock, flags);
set_fs(oldfs);
return result;
}
/* Log a failure together with the local `result' and jump to the
 * function's error_out label.  Wrapped in do/while(0) so that it behaves
 * as a single statement after an unbraced `if'. */
#define FAIL( s ) do { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); goto error_out; } while (0)

/*
 * Transmit one request to the server.
 *
 * Builds the on-wire request header from `req' (command, byte offset,
 * byte length, and the request pointer itself as the handle that the
 * reply is matched against) and sends it; for WRITE requests, the data
 * of every buffer head follows.  The whole transmission is serialised by
 * lo->tx_lock.
 *
 * Nothing is returned: failure is signalled by incrementing req->errors.
 */
void nbd_send_req(struct nbd_device *lo, struct request *req)
{
	int result = -1;
	struct nbd_request request;
	unsigned long size = req->nr_sectors << 9;
	struct socket *sock;

	DEBUG("NBD: sending control, ");
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(req->cmd);
	request.from = cpu_to_be64( (u64) req->sector << 9);
	request.len = htonl(size);
	/* The handle field may be wider than a pointer; clear it first so no
	 * uninitialised stack bytes are put on the wire. */
	memset(request.handle, 0, sizeof(request.handle));
	memcpy(request.handle, &req, sizeof(req));

	down(&lo->tx_lock);

	/* lo->sock is cleared under tx_lock by the ioctl paths, so it must
	 * be sampled only once the lock is held. */
	sock = lo->sock;
	if (!sock)
		FAIL("Attempted sendmsg to closed socket.");

	result = nbd_xmit(1, sock, (char *) &request, sizeof(request), req->cmd == WRITE ? MSG_MORE : 0);
	if (result <= 0)
		FAIL("Sendmsg failed for control.");

	if (req->cmd == WRITE) {
		struct buffer_head *bh = req->bh;
		DEBUG("data, ");
		do {
			/* MSG_MORE on every buffer except the last one */
			result = nbd_xmit(1, sock, bh->b_data, bh->b_size, bh->b_reqnext == NULL ? 0 : MSG_MORE);
			if (result <= 0)
				FAIL("Send data failed.");
			bh = bh->b_reqnext;
		} while(bh);
	}
	up(&lo->tx_lock);
	return;

error_out:
	up(&lo->tx_lock);
	req->errors++;
}
/*
 * Look up the in-flight request a reply refers to.
 *
 * `handle' holds the raw bytes of a request pointer.  If that pointer is
 * currently on lo->queue_head the request is unlinked and returned;
 * otherwise NULL is returned.  The list is walked under lo->queue_lock.
 */
static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
{
	struct request *wanted;
	struct request *found = NULL;
	struct list_head *pos;

	memcpy(&wanted, handle, sizeof(wanted));

	spin_lock(&lo->queue_lock);
	list_for_each(pos, &lo->queue_head) {
		struct request *cand = list_entry(pos, struct request, queue);

		if (cand == wanted) {
			list_del(&cand->queue);
			found = cand;
			break;
		}
	}
	spin_unlock(&lo->queue_lock);

	return found;
}
/* Log a fatal receive-side failure, remember `result' in lo->harderror
 * and return NULL from the enclosing function.  Wrapped in do/while(0) so
 * that it behaves as a single statement after an unbraced `if'. */
#define HARDFAIL( s ) do { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); lo->harderror = result; return NULL; } while (0)

/*
 * Receive one reply from the server and match it to its request.
 *
 * Reads the reply header, validates its magic, removes the matching
 * request from lo->queue_head and, for READ requests, receives the data
 * into the request's buffer heads.
 *
 * Returns the request (with req->errors incremented if the server
 * reported an error), or NULL when something went wrong and userspace
 * should be informed; in that case lo->harderror holds the failing
 * result.  A request whose data could not be received is completed with
 * an error here before NULL is returned, because it is no longer on the
 * queue and nobody else could complete it.
 */
struct request *nbd_read_stat(struct nbd_device *lo)
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	DEBUG("reading control, ");
	reply.magic = 0;
	result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0)
		HARDFAIL("Recv control failed.");

	/* Validate the header before dequeuing anything: a request removed
	 * from the queue on the strength of a bogus reply would be lost. */
	if (ntohl(reply.magic) != NBD_REPLY_MAGIC)
		HARDFAIL("Not enough magic.");

	req = nbd_find_request(lo, reply.handle);
	if (req == NULL)
		HARDFAIL("Unexpected reply");

	DEBUG("ok, ");
	if (ntohl(reply.error))
		FAIL("Other side returned error.");

	if (req->cmd == READ) {
		struct buffer_head *bh = req->bh;
		DEBUG("data, ");
		do {
			result = nbd_xmit(0, lo->sock, bh->b_data, bh->b_size, MSG_WAITALL);
			if (result <= 0) {
				/* req is already off the queue, so fail it
				 * here instead of leaking it */
				req->errors++;
				nbd_end_request(req);
				HARDFAIL("Recv data failed.");
			}
			bh = bh->b_reqnext;
		} while(bh);
	}
	DEBUG("done.\n");
	return req;

/* Can we get here? Yes, if other side returns error */
error_out:
	req->errors++;
	return req;
}
void nbd_do_it(struct nbd_device *lo)
{
struct request *req;
while (1) {
req = nbd_read_stat(lo);
if (!req) {
printk(KERN_ALERT "req should never be null\n" );
goto out;
}
#ifdef PARANOIA
if (lo != &nbd_dev[MINOR(req->rq_dev)]) {
printk(KERN_ALERT "NBD: request corrupted!\n");
continue;
}
if (lo->magic != LO_MAGIC) {
printk(KERN_ALERT "NBD: nbd_dev[] corrupted: Not enough magic\n");
goto out;
}
#endif
nbd_end_request(req);
}
out:
return;
}
/*
 * Fail every request still waiting on the device's queue.
 *
 * Requests are unlinked one at a time under lo->queue_lock and then
 * completed with an error outside the lock.  With PARANOIA enabled
 * nothing is touched if the device record's magic is wrong.
 */
void nbd_clear_que(struct nbd_device *lo)
{
	struct request *victim;

#ifdef PARANOIA
	if (lo->magic != LO_MAGIC) {
		printk(KERN_ERR "NBD: nbd_dev[] corrupted: Not enough magic when clearing!\n");
		return;
	}
#endif
	for (;;) {
		spin_lock(&lo->queue_lock);
		if (list_empty(&lo->queue_head)) {
			spin_unlock(&lo->queue_lock);
			break;
		}
		victim = list_entry(lo->queue_head.next, struct request, queue);
		list_del(&victim->queue);
		spin_unlock(&lo->queue_lock);

		victim->errors++;
		nbd_end_request(victim);
	}
}
/*
* We always wait for result of write, for now. It would be nice to make it optional
* in future
* if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
* { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
*/
#undef FAIL
/* Log a per-minor failure and jump to error_out, which fails the current
 * request.  Must only be used once `req' is known to be valid.  Wrapped in
 * do/while(0) so that it behaves as a single statement after an `if'. */
#define FAIL( s ) do { printk( KERN_ERR "NBD, minor %d: " s "\n", dev ); goto error_out; } while (0)

/*
 * Block-layer request function.
 *
 * Drains the request queue: each request is validated, dequeued, put on
 * its device's in-flight list and transmitted with nbd_send_req().
 * Requests that fail validation or transmission are completed with an
 * error straight away; the others are completed later by the reply loop.
 *
 * The code assumes it is entered with io_request_lock held: the lock is
 * dropped around the queueing and network work and re-taken before the
 * next iteration, so it is held again on return.
 */
static void do_nbd_request(request_queue_t * q)
{
	struct request *req;
	int dev = 0;
	struct nbd_device *lo;

	while (!QUEUE_EMPTY) {
		req = CURRENT;
#ifdef PARANOIA
		if (!req) {
			/* error_out dereferences req, so FAIL() cannot be
			 * used for this case; give up on the queue */
			printk( KERN_ERR "NBD, minor %d: que not empty but no request?\n", dev );
			return;
		}
#endif
		dev = MINOR(req->rq_dev);
#ifdef PARANOIA
		if (dev >= MAX_NBD)
			FAIL("Minor too big."); /* Probably can not happen */
#endif
		lo = &nbd_dev[dev];
		if (!lo->file)
			FAIL("Request when not-ready.");
		if ((req->cmd == WRITE) && (lo->flags & NBD_READ_ONLY))
			FAIL("Write on read-only");
#ifdef PARANOIA
		if (lo->magic != LO_MAGIC)
			FAIL("nbd[] is not magical!");
		requests_in++;
#endif
		req->errors = 0;
		blkdev_dequeue_request(req);
		spin_unlock_irq(&io_request_lock);

		spin_lock(&lo->queue_lock);
		/* re-check under queue_lock: the device may have been torn
		 * down since the unlocked test above */
		if (!lo->file) {
			spin_unlock(&lo->queue_lock);
			printk(KERN_ERR "nbd: failed between accept and semaphore, file lost\n");
			req->errors++;
			nbd_end_request(req);
			spin_lock_irq(&io_request_lock);
			continue;
		}
		list_add_tail(&req->queue, &lo->queue_head);
		spin_unlock(&lo->queue_lock);

		nbd_send_req(lo, req);
		if (req->errors) {
			printk(KERN_ERR "nbd: nbd_send_req failed\n");
			/* no reply will arrive for it: take it back off the
			 * in-flight list and fail it now */
			spin_lock(&lo->queue_lock);
			list_del(&req->queue);
			spin_unlock(&lo->queue_lock);
			nbd_end_request(req);
			spin_lock_irq(&io_request_lock);
			continue;
		}
		spin_lock_irq(&io_request_lock);
		continue;

error_out:
		req->errors++;
		blkdev_dequeue_request(req);
		spin_unlock(&io_request_lock);
		nbd_end_request(req);
		spin_lock(&io_request_lock);
	}
	return;
}
/*
 * ioctl handler for an nbd device.
 *
 * BLKGETSIZE and BLKGETSIZE64 are open to everyone and report the device
 * size (in 512-byte sectors and in bytes respectively).  Everything else
 * requires CAP_SYS_ADMIN:
 *
 *   NBD_DISCONNECT       send a shutdown request to the server
 *   NBD_CLEAR_SOCK       detach the socket/file and fail queued requests
 *   NBD_SET_SOCK         attach the socket given by descriptor `arg'
 *   NBD_SET_BLKSIZE      set the block size (power of two, 512..PAGE_SIZE)
 *   NBD_SET_SIZE         set the device size in bytes
 *   NBD_SET_SIZE_BLOCKS  set the device size in blocks
 *   NBD_DO_IT            run the reply loop until it fails, then tear down
 *   NBD_CLEAR_QUE        fail queued requests if no socket is attached
 *   NBD_PRINT_DEBUG      (PARANOIA only) log queue pointers and counters
 *
 * Returns 0 or a negative errno; NBD_DO_IT returns lo->harderror and
 * unknown commands yield -EINVAL.
 */
static int nbd_ioctl(struct inode *inode, struct file *file,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *lo;
	int dev, error, temp;
	struct request sreq ;

	if (!inode)
		return -EINVAL;
	dev = MINOR(inode->i_rdev);
	if (dev >= MAX_NBD)
		return -ENODEV;
	lo = &nbd_dev[dev];

	/* these are innocent, but.... */
	switch (cmd) {
	case BLKGETSIZE:
		return put_user(nbd_bytesizes[dev] >> 9, (unsigned long *) arg);
	case BLKGETSIZE64:
		return put_user((u64)nbd_bytesizes[dev], (u64 *) arg);
	}

	/* ... anyone capable of any of the below ioctls can do *real bad*
	   things */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case NBD_DISCONNECT:
		printk("NBD_DISCONNECT\n");
		if (!lo->sock)
			return -EINVAL;
		sreq.cmd = 2;		/* shutdown command */
		/* nbd_send_req() copies these into the wire header and may
		 * bump errors, so they must not be left as stack garbage */
		sreq.sector = 0;
		sreq.nr_sectors = 0;
		sreq.errors = 0;
		nbd_send_req(lo, &sreq);
		return 0;

	case NBD_CLEAR_SOCK:
		error = 0;
		down(&lo->tx_lock);
		lo->sock = NULL;
		up(&lo->tx_lock);
		spin_lock(&lo->queue_lock);
		file = lo->file;
		lo->file = NULL;
		spin_unlock(&lo->queue_lock);
		nbd_clear_que(lo);
		spin_lock(&lo->queue_lock);
		if (!list_empty(&lo->queue_head)) {
			printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n");
			error = -EBUSY;
		}
		spin_unlock(&lo->queue_lock);
		if (file)
			fput(file);
		return error;

	case NBD_SET_SOCK:
		if (lo->file)
			return -EBUSY;
		file = fget(arg);
		if (!file)
			return -EINVAL;
		inode = file->f_dentry->d_inode;
		/* only a socket inode carries a valid u.socket_i; reject
		 * anything else and drop the reference taken by fget() */
		if (!inode->i_sock) {
			fput(file);
			return -EINVAL;
		}
		lo->file = file;
		lo->sock = &inode->u.socket_i;
		return 0;

	case NBD_SET_BLKSIZE:
		if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE))
			return -EINVAL;
		nbd_blksizes[dev] = arg;
		/* derive log2(arg), starting from 512 == 1 << 9 */
		temp = arg >> 9;
		nbd_blksize_bits[dev] = 9;
		while (temp > 1) {
			nbd_blksize_bits[dev]++;
			temp >>= 1;
		}
		/* keep the byte size a multiple of the new block size */
		nbd_bytesizes[dev] &= ~(nbd_blksizes[dev]-1);
		nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
		return 0;

	case NBD_SET_SIZE:
		nbd_bytesizes[dev] = arg & ~(nbd_blksizes[dev]-1);
		nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
		return 0;

	case NBD_SET_SIZE_BLOCKS:
		nbd_bytesizes[dev] = ((u64) arg) << nbd_blksize_bits[dev];
		nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
		return 0;

	case NBD_DO_IT:
		if (!lo->file)
			return -EINVAL;
		nbd_do_it(lo);
		/* on return tidy up in case we have a signal */
		/* Forcibly shutdown the socket causing all listeners
		 * to error
		 *
		 * FIXME: This code is duplicated from sys_shutdown, but
		 * there should be a more generic interface rather than
		 * calling socket ops directly here */
		down(&lo->tx_lock);
		if (lo->sock) {
			printk(KERN_WARNING "nbd: shutting down socket\n");
			lo->sock->ops->shutdown(lo->sock,
				SEND_SHUTDOWN|RCV_SHUTDOWN);
			lo->sock = NULL;
		}
		up(&lo->tx_lock);
		spin_lock(&lo->queue_lock);
		file = lo->file;
		lo->file = NULL;
		spin_unlock(&lo->queue_lock);
		nbd_clear_que(lo);
		printk(KERN_WARNING "nbd: queue cleared\n");
		if (file)
			fput(file);
		return lo->harderror;

	case NBD_CLEAR_QUE:
		down(&lo->tx_lock);
		if (lo->sock) {
			up(&lo->tx_lock);
			return 0; /* probably should be error, but that would
				   * break "nbd-client -d", so just return 0 */
		}
		up(&lo->tx_lock);
		nbd_clear_que(lo);
		return 0;

#ifdef PARANOIA
	case NBD_PRINT_DEBUG:
		printk(KERN_INFO "NBD device %d: next = %p, prev = %p. Global: in %d, out %d\n",
		       dev, lo->queue_head.next, lo->queue_head.prev, requests_in, requests_out);
		return 0;
#endif
	}
	return -EINVAL;
}
static int nbd_release(struct inode *inode, struct file *file)
{
struct nbd_device *lo;
int dev;
if (!inode)
return -ENODEV;
dev = MINOR(inode->i_rdev);
if (dev >= MAX_NBD)
return -ENODEV;
lo = &nbd_dev[dev];
if (lo->refcnt <= 0)
printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt);
lo->refcnt--;
/* N.B. Doesn't lo->file need an fput?? */
return 0;
}
/*
 * Block device entry points for the nbd major; passed to
 * register_blkdev(), register_disk() and devfs_register_series() in
 * nbd_init().  `owner' names this module.
 */
static struct block_device_operations nbd_fops =
{
owner: THIS_MODULE,
open: nbd_open,
release: nbd_release,
ioctl: nbd_ioctl,
};
/*
* And here should be modules and kernel interface
* (Just smiley confuses emacs :-)
*/
/*
 * Module initialisation.
 *
 * Refuses to load unless struct nbd_request has the 28-byte layout the
 * protocol needs, registers the block major, installs the size tables
 * and request function, resets every device slot to its defaults
 * (1024-byte blocks, no file attached) and creates the devfs entries.
 *
 * Returns 0 on success or -EIO on failure.
 */
static int __init nbd_init(void)
{
	int minor;

	if (sizeof(struct nbd_request) != 28) {
		printk(KERN_CRIT "Sizeof nbd_request needs to be 28 in order to work!\n" );
		return -EIO;
	}

	if (register_blkdev(MAJOR_NR, "nbd", &nbd_fops) != 0) {
		printk("Unable to get major number %d for NBD\n",
		       MAJOR_NR);
		return -EIO;
	}
#ifdef MODULE
	printk("nbd: registered device at major %d\n", MAJOR_NR);
#endif

	blksize_size[MAJOR_NR] = nbd_blksizes;
	blk_size[MAJOR_NR] = nbd_sizes;
	blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request);
	blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0);

	for (minor = 0; minor < MAX_NBD; minor++) {
		struct nbd_device *slot = &nbd_dev[minor];

		/* device record */
		slot->refcnt = 0;
		slot->file = NULL;
		slot->magic = LO_MAGIC;
		slot->flags = 0;
		spin_lock_init(&slot->queue_lock);
		INIT_LIST_HEAD(&slot->queue_head);
		init_MUTEX(&slot->tx_lock);

		/* default geometry */
		nbd_blksizes[minor] = 1024;
		nbd_blksize_bits[minor] = 10;
		nbd_bytesizes[minor] = ((u64)0x7ffffc00) << 10; /* 2TB */
		nbd_sizes[minor] = nbd_bytesizes[minor] >> BLOCK_SIZE_BITS;

		register_disk(NULL, MKDEV(MAJOR_NR,minor), 1, &nbd_fops,
			      nbd_bytesizes[minor]>>9);
	}

	devfs_handle = devfs_mk_dir (NULL, "nbd", NULL);
	devfs_register_series (devfs_handle, "%u", MAX_NBD,
			       DEVFS_FL_DEFAULT, MAJOR_NR, 0,
			       S_IFBLK | S_IRUSR | S_IWUSR,
			       &nbd_fops, NULL);
	return 0;
}
/*
 * Module teardown: remove the devfs entries, tear down the request
 * queue, give the block major back and detach the size tables that
 * nbd_init() installed.
 */
static void __exit nbd_cleanup(void)
{
	devfs_unregister (devfs_handle);
	blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR));

	if (unregister_blkdev(MAJOR_NR, "nbd") != 0)
		printk("nbd: cleanup_module failed\n");
	else
		printk("nbd: module cleaned up.\n");

	/* These pointed at arrays that live in this module; leaving them
	 * set would leave dangling pointers behind after unload. */
	blk_size[MAJOR_NR] = NULL;
	blksize_size[MAJOR_NR] = NULL;
}
module_init(nbd_init);
module_exit(nbd_cleanup);
MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");