/*
* multipath.c : Multiple Devices driver for Linux
*
* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
*
* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
*
* MULTIPATH management functions.
*
* derived from raid1.c.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/multipath.h>
#include <asm/atomic.h>
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY
#define MAX_WORK_PER_DISK 128
#define NR_RESERVED_BUFS 32
/*
* The following can be used to debug the driver
*/
#define MULTIPATH_DEBUG 0
#if MULTIPATH_DEBUG
#define PRINTK(x...) printk(x)
#define inline		/* disable inlining while debugging so functions stay traceable */
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif
static mdk_personality_t multipath_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);
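/*
 * Allocate a multipath_bh for a request.  Try the pre-allocated
 * reserve pool first, fall back to kmalloc(), and as a last resort
 * block until another request returns a buffer to the pool.
 */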
static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
struct multipath_bh *mp_bh = NULL;
do {
md_spin_lock_irq(&conf->device_lock);
if (!conf->freer1_blocked && conf->freer1) {
mp_bh = conf->freer1;
conf->freer1 = mp_bh->next_mp;
conf->freer1_cnt--;
mp_bh->next_mp = NULL;
mp_bh->state = (1 << MPBH_PreAlloc);
mp_bh->bh_req.b_state = 0;
}
md_spin_unlock_irq(&conf->device_lock);
if (mp_bh)
return mp_bh;
mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
GFP_NOIO);
if (mp_bh) {
memset(mp_bh, 0, sizeof(*mp_bh));
return mp_bh;
}
conf->freer1_blocked = 1;
wait_disk_event(conf->wait_buffer,
!conf->freer1_blocked ||
conf->freer1_cnt > NR_RESERVED_BUFS/2
);
conf->freer1_blocked = 0;
} while (1);
}
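/*
 * Release a multipath_bh: pre-allocated buffers go back on the
 * reserve pool (waking any waiters), others are simply kfree()d.
 */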
static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
{
multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
mp_bh->next_mp = conf->freer1;
conf->freer1 = mp_bh;
conf->freer1_cnt++;
spin_unlock_irqrestore(&conf->device_lock, flags);
wake_up(&conf->wait_buffer);
} else {
kfree(mp_bh);
}
}
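/*
 * Grow the reserve pool by 'cnt' pre-allocated buffers.  Returns
 * the number actually added, which may be less if kmalloc fails.
 */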
static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
{
int i = 0;
while (i < cnt) {
struct multipath_bh *mp_bh;
mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
if (!mp_bh)
break;
memset(mp_bh, 0, sizeof(*mp_bh));
set_bit(MPBH_PreAlloc, &mp_bh->state);
mp_bh->mddev = conf->mddev;
multipath_free_mpbh(mp_bh);
i++;
}
return i;
}
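/*
 * Free every buffer in the reserve pool; used on shutdown and on
 * failed initialisation.
 */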
static void multipath_shrink_mpbh(multipath_conf_t *conf)
{
md_spin_lock_irq(&conf->device_lock);
while (conf->freer1) {
struct multipath_bh *mp_bh = conf->freer1;
conf->freer1 = mp_bh->next_mp;
conf->freer1_cnt--;
kfree(mp_bh);
}
md_spin_unlock_irq(&conf->device_lock);
}
static int multipath_map (mddev_t *mddev, kdev_t *rdev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
int i, disks = MD_SB_DISKS;
/*
* Later we will do read balancing on the read side;
* for now we use the first available disk.
*/
for (i = 0; i < disks; i++) {
if (conf->multipaths[i].operational) {
*rdev = conf->multipaths[i].dev;
return (0);
}
}
printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
return (-1);
}
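/*
 * Queue a failed request on the global retry list and wake the
 * multipathd thread so it can redirect the IO to another path.
 */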
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
unsigned long flags;
mddev_t *mddev = mp_bh->mddev;
multipath_conf_t *conf = mddev_to_conf(mddev);
md_spin_lock_irqsave(&retry_list_lock, flags);
if (multipath_retry_list == NULL)
multipath_retry_tail = &multipath_retry_list;
*multipath_retry_tail = mp_bh;
multipath_retry_tail = &mp_bh->next_mp;
mp_bh->next_mp = NULL;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
md_wakeup_thread(conf->thread);
}
/*
* multipath_end_bh_io() is called when we have finished servicing a multipathed
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
{
struct buffer_head *bh = mp_bh->master_bh;
bh->b_end_io(bh, uptodate);
multipath_free_mpbh(mp_bh);
}
void multipath_end_request (struct buffer_head *bh, int uptodate)
{
struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
/*
* this branch is our 'one multipath IO has finished' event handler:
*/
if (!uptodate)
md_error (mp_bh->mddev, bh->b_dev);
else
/*
* Set MPBH_Uptodate in our master buffer_head, so that
* we will return a good error code to the higher
* levels even if IO on some other multipathed buffer fails.
*
* The 'master' represents the composite operation to the
* higher levels. So if something waits for IO, then it will
* wait for the 'master' buffer_head.
*/
set_bit (MPBH_Uptodate, &mp_bh->state);
if (uptodate) {
multipath_end_bh_io(mp_bh, uptodate);
return;
}
/*
* oops, IO error:
*/
printk(KERN_ERR "multipath: %s: rescheduling block %lu\n",
partition_name(bh->b_dev), bh->b_blocknr);
multipath_reschedule_retry(mp_bh);
return;
}
/*
* This routine returns the disk from which the requested read should
* be done.
*/
static int multipath_read_balance (multipath_conf_t *conf)
{
int disk;
for (disk = 0; disk < conf->raid_disks; disk++)
if (conf->multipaths[disk].operational)
return disk;
BUG();
return 0;
}
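/*
 * Main entry point for IO: clone the master buffer_head into a
 * private one, direct it at an operational path and hand it to
 * generic_make_request().
 */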
static int multipath_make_request (mddev_t *mddev, int rw,
struct buffer_head * bh)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
struct buffer_head *bh_req;
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
if (!buffer_locked(bh))
BUG();
/*
* make_request() can abort the operation when READA is being
* used and no empty request is available.
*
* Currently, just replace READA with a plain READ.
*/
if (rw == READA)
rw = READ;
mp_bh = multipath_alloc_mpbh (conf);
mp_bh->master_bh = bh;
mp_bh->mddev = mddev;
mp_bh->cmd = rw;
/*
* read balancing logic:
*/
multipath = conf->multipaths + multipath_read_balance(conf);
bh_req = &mp_bh->bh_req;
memcpy(bh_req, bh, sizeof(*bh));
bh_req->b_blocknr = bh->b_rsector;
bh_req->b_dev = multipath->dev;
bh_req->b_rdev = multipath->dev;
/* bh_req->b_rsector = bh->n_rsector; */
bh_req->b_end_io = multipath_end_request;
bh_req->b_private = mp_bh;
generic_make_request (rw, bh_req);
return 0;
}
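/*
 * Report array state for /proc/mdstat, e.g. " [1/1] [U]".
 */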
static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
int i;
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
conf->working_disks);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->multipaths[i].operational ? "U" : "_");
seq_printf (seq, "]");
}
#define LAST_DISK KERN_ALERT \
"multipath: only one IO path left and IO error.\n"
#define NO_SPARE_DISK KERN_ALERT \
"multipath: no spare IO path left!\n"
#define DISK_FAILED KERN_ALERT \
"multipath: IO failure on %s, disabling IO path. \n" \
" Operation continuing on %d IO paths.\n"
static void mark_disk_bad (mddev_t *mddev, int failed)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
struct multipath_info *multipath = conf->multipaths+failed;
mdp_super_t *sb = mddev->sb;
multipath->operational = 0;
mark_disk_faulty(sb->disks+multipath->number);
mark_disk_nonsync(sb->disks+multipath->number);
mark_disk_inactive(sb->disks+multipath->number);
sb->active_disks--;
sb->working_disks--;
sb->failed_disks++;
mddev->sb_dirty = 1;
md_wakeup_thread(conf->thread);
conf->working_disks--;
printk (DISK_FAILED, partition_name (multipath->dev),
conf->working_disks);
}
/*
* Careful, this can execute in IRQ contexts as well!
*/
static int multipath_error (mddev_t *mddev, kdev_t dev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
struct multipath_info * multipaths = conf->multipaths;
int disks = MD_SB_DISKS;
int other_paths = 1;
int i;
if (conf->working_disks == 1) {
other_paths = 0;
for (i = 0; i < disks; i++) {
if (multipaths[i].spare) {
other_paths = 1;
break;
}
}
}
if (!other_paths) {
/*
* Uh oh, we can do nothing if this is our last path, but
* first check if this is a queued request for a device
* which has just failed.
*/
for (i = 0; i < disks; i++) {
if (multipaths[i].dev==dev && !multipaths[i].operational)
return 0;
}
printk (LAST_DISK);
} else {
/*
* Mark disk as unusable
*/
for (i = 0; i < disks; i++) {
if (multipaths[i].dev==dev && multipaths[i].operational) {
mark_disk_bad(mddev, i);
break;
}
}
if (!conf->working_disks) {
int err = 1;
mdp_disk_t *spare;
mdp_super_t *sb = mddev->sb;
spare = get_spare(mddev);
if (spare) {
err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
}
if (!err && !disk_faulty(spare)) {
multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
mark_disk_sync(spare);
mark_disk_active(spare);
sb->active_disks++;
sb->spare_disks--;
}
}
}
return 0;
}
#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED
static void print_multipath_conf (multipath_conf_t *conf)
{
int i;
struct multipath_info *tmp;
printk("MULTIPATH conf printout:\n");
if (!conf) {
printk("(conf==NULL)\n");
return;
}
printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
conf->raid_disks, conf->nr_disks);
for (i = 0; i < MD_SB_DISKS; i++) {
tmp = conf->multipaths + i;
if (tmp->spare || tmp->operational || tmp->number ||
tmp->raid_disk || tmp->used_slot)
printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
i, tmp->spare,tmp->operational,
tmp->number,tmp->raid_disk,tmp->used_slot,
partition_name(tmp->dev));
}
}
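/*
 * Perform a disk operation (spare activation/deactivation, hot-add,
 * hot-remove) on behalf of the md core.  The first switch locates
 * the affected slot, the second carries out the state change; both
 * run under conf->device_lock.
 */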
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
int err = 0;
int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
multipath_conf_t *conf = mddev->private;
struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
mdp_super_t *sb = mddev->sb;
mdp_disk_t *failed_desc, *spare_desc, *added_desc;
mdk_rdev_t *spare_rdev, *failed_rdev;
print_multipath_conf(conf);
md_spin_lock_irq(&conf->device_lock);
/*
* find the disk ...
*/
switch (state) {
case DISKOP_SPARE_ACTIVE:
/*
* Find the failed disk within the MULTIPATH configuration ...
* (this can only be in the first conf->working_disks part)
*/
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->multipaths + i;
if ((!tmp->operational && !tmp->spare) ||
!tmp->used_slot) {
failed_disk = i;
break;
}
}
/*
* When we activate a spare disk we _must_ have a disk in
* the lower (active) part of the array to replace.
*/
if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
MD_BUG();
err = 1;
goto abort;
}
/* fall through */
case DISKOP_SPARE_WRITE:
case DISKOP_SPARE_INACTIVE:
/*
* Find the spare disk ... (can only be in the 'high'
* area of the array)
*/
for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
tmp = conf->multipaths + i;
if (tmp->spare && tmp->number == (*d)->number) {
spare_disk = i;
break;
}
}
if (spare_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
break;
case DISKOP_HOT_REMOVE_DISK:
for (i = 0; i < MD_SB_DISKS; i++) {
tmp = conf->multipaths + i;
if (tmp->used_slot && (tmp->number == (*d)->number)) {
if (tmp->operational) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number);
err = -EBUSY;
goto abort;
}
removed_disk = i;
break;
}
}
if (removed_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
break;
case DISKOP_HOT_ADD_DISK:
for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
tmp = conf->multipaths + i;
if (!tmp->used_slot) {
added_disk = i;
break;
}
}
if (added_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
break;
}
switch (state) {
/*
* Switch the spare disk to write-only mode:
*/
case DISKOP_SPARE_WRITE:
sdisk = conf->multipaths + spare_disk;
sdisk->operational = 1;
break;
/*
* Deactivate a spare disk:
*/
case DISKOP_SPARE_INACTIVE:
sdisk = conf->multipaths + spare_disk;
sdisk->operational = 0;
break;
/*
* Activate (mark read-write) the (now sync) spare disk,
* which means we switch its 'raid position' (->raid_disk)
* with the failed disk. (only the first 'conf->nr_disks'
* slots are used for 'real' disks and we must preserve this
* property)
*/
case DISKOP_SPARE_ACTIVE:
sdisk = conf->multipaths + spare_disk;
fdisk = conf->multipaths + failed_disk;
spare_desc = &sb->disks[sdisk->number];
failed_desc = &sb->disks[fdisk->number];
if (spare_desc != *d) {
MD_BUG();
err = 1;
goto abort;
}
if (spare_desc->raid_disk != sdisk->raid_disk) {
MD_BUG();
err = 1;
goto abort;
}
if (sdisk->raid_disk != spare_disk) {
MD_BUG();
err = 1;
goto abort;
}
if (failed_desc->raid_disk != fdisk->raid_disk) {
MD_BUG();
err = 1;
goto abort;
}
if (fdisk->raid_disk != failed_disk) {
MD_BUG();
err = 1;
goto abort;
}
/*
* do the switch finally
*/
spare_rdev = find_rdev_nr(mddev, spare_desc->number);
failed_rdev = find_rdev_nr(mddev, failed_desc->number);
xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
spare_rdev->alias_device = 0;
failed_rdev->alias_device = 1;
xchg_values(*spare_desc, *failed_desc);
xchg_values(*fdisk, *sdisk);
/*
* (careful, 'failed' and 'spare' are switched from now on)
*
* we want to preserve linear numbering and we want to
* give the proper raid_disk number to the now activated
* disk. (this means we switch back these values)
*/
xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
xchg_values(sdisk->raid_disk, fdisk->raid_disk);
xchg_values(spare_desc->number, failed_desc->number);
xchg_values(sdisk->number, fdisk->number);
*d = failed_desc;
if (sdisk->dev == MKDEV(0,0))
sdisk->used_slot = 0;
/*
* this really activates the spare.
*/
fdisk->spare = 0;
/*
* if we activate a spare, we definitely replace a
* non-operational disk slot in the 'low' area of
* the disk array.
*/
conf->working_disks++;
break;
case DISKOP_HOT_REMOVE_DISK:
rdisk = conf->multipaths + removed_disk;
if (rdisk->spare && (removed_disk < conf->raid_disks)) {
MD_BUG();
err = 1;
goto abort;
}
rdisk->dev = MKDEV(0,0);
rdisk->used_slot = 0;
conf->nr_disks--;
break;
case DISKOP_HOT_ADD_DISK:
adisk = conf->multipaths + added_disk;
added_desc = *d;
if (added_disk != added_desc->number) {
MD_BUG();
err = 1;
goto abort;
}
adisk->number = added_desc->number;
adisk->raid_disk = added_desc->raid_disk;
adisk->dev = MKDEV(added_desc->major,added_desc->minor);
adisk->operational = 0;
adisk->spare = 1;
adisk->used_slot = 1;
conf->nr_disks++;
break;
default:
MD_BUG();
err = 1;
goto abort;
}
abort:
md_spin_unlock_irq(&conf->device_lock);
print_multipath_conf(conf);
return err;
}
#define IO_ERROR KERN_ALERT \
"multipath: %s: unrecoverable IO read error for block %lu\n"
#define REDIRECT_SECTOR KERN_ERR \
"multipath: %s: redirecting sector %lu to another IO path\n"
/*
* This is a kernel thread which:
*
* 1. Retries failed read operations on working multipaths.
* 2. Updates the raid superblock when problems are encountered.
* 3. Performs writes following reads for array synchronising.
*/
static void multipathd (void *data)
{
struct multipath_bh *mp_bh;
struct buffer_head *bh;
unsigned long flags;
mddev_t *mddev;
kdev_t dev;
for (;;) {
md_spin_lock_irqsave(&retry_list_lock, flags);
mp_bh = multipath_retry_list;
if (!mp_bh)
break;
multipath_retry_list = mp_bh->next_mp;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
mddev = mp_bh->mddev;
if (mddev->sb_dirty)
md_update_sb(mddev);
bh = &mp_bh->bh_req;
dev = bh->b_dev;
multipath_map (mddev, &bh->b_dev);
if (bh->b_dev == dev) {
printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
multipath_end_bh_io(mp_bh, 0);
} else {
printk (REDIRECT_SECTOR,
partition_name(bh->b_dev), bh->b_blocknr);
bh->b_rdev = bh->b_dev;
bh->b_rsector = bh->b_blocknr;
generic_make_request (mp_bh->cmd, bh);
}
}
md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR
/*
* This will catch the scenario in which one of the multipaths was
* mounted as a normal device rather than as a part of a raid set.
*
* check_consistency is very personality-dependent, e.g. RAID5 cannot
* do this check: it uses another method.
*/
static int __check_consistency (mddev_t *mddev, int row)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
int disks = MD_SB_DISKS;
kdev_t dev;
struct buffer_head *bh = NULL;
int i, rc = 0;
char *buffer = NULL;
for (i = 0; i < disks; i++) {
if (!conf->multipaths[i].operational)
continue;
printk("(checking disk %d)\n",i);
dev = conf->multipaths[i].dev;
set_blocksize(dev, 4096);
if ((bh = bread(dev, row / 4, 4096)) == NULL)
break;
if (!buffer) {
buffer = (char *) __get_free_page(GFP_KERNEL);
if (!buffer)
break;
memcpy(buffer, bh->b_data, 4096);
} else if (memcmp(buffer, bh->b_data, 4096)) {
rc = 1;
break;
}
bforget(bh);
fsync_dev(dev);
invalidate_buffers(dev);
bh = NULL;
}
if (buffer)
free_page((unsigned long) buffer);
if (bh) {
dev = bh->b_dev;
bforget(bh);
fsync_dev(dev);
invalidate_buffers(dev);
}
return rc;
}
static int check_consistency (mddev_t *mddev)
{
if (__check_consistency(mddev, 0))
/*
* we do not do this currently, as it's perfectly possible to
* have an inconsistent array when it's freshly created. Only
* newly written data has to be consistent.
*/
return 0;
return 0;
}
#define INVALID_LEVEL KERN_WARNING \
"multipath: md%d: raid level not set to multipath IO (%d)\n"
#define NO_SB KERN_ERR \
"multipath: disabled IO path %s (couldn't access raid superblock)\n"
#define ERRORS KERN_ERR \
"multipath: disabled IO path %s (errors detected)\n"
#define NOT_IN_SYNC KERN_ERR \
"multipath: making IO path %s a spare path (not in sync)\n"
#define INCONSISTENT KERN_ERR \
"multipath: disabled IO path %s (inconsistent descriptor)\n"
#define ALREADY_RUNNING KERN_ERR \
"multipath: disabled IO path %s (multipath %d already operational)\n"
#define OPERATIONAL KERN_INFO \
"multipath: device %s operational as IO path %d\n"
#define MEM_ERROR KERN_ERR \
"multipath: couldn't allocate memory for md%d\n"
#define SPARE KERN_INFO \
"multipath: spare IO path %s\n"
#define NONE_OPERATIONAL KERN_ERR \
"multipath: no operational IO paths for md%d\n"
#define SB_DIFFERENCES KERN_ERR \
"multipath: detected IO path differences!\n"
#define ARRAY_IS_ACTIVE KERN_INFO \
"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"
#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"
static int multipath_run (mddev_t *mddev)
{
multipath_conf_t *conf;
int i, j, disk_idx;
struct multipath_info *disk, *disk2;
mdp_super_t *sb = mddev->sb;
mdp_disk_t *desc, *desc2;
mdk_rdev_t *rdev, *def_rdev = NULL;
struct md_list_head *tmp;
int num_rdevs = 0;
MOD_INC_USE_COUNT;
if (sb->level != -4) {
printk(INVALID_LEVEL, mdidx(mddev), sb->level);
goto out;
}
/*
* copy the already verified devices into our private MULTIPATH
* bookkeeping area. [whatever we allocate in multipath_run()
* should be freed in multipath_stop()]
*/
conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
mddev->private = conf;
if (!conf) {
printk(MEM_ERROR, mdidx(mddev));
goto out;
}
memset(conf, 0, sizeof(*conf));
ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev->faulty) {
/* this is a "should never happen" case and if it */
/* ever does happen, a continue; won't help */
printk(ERRORS, partition_name(rdev->dev));
continue;
} else {
/* this is a "should never happen" case and if it */
/* ever does happen, a continue; won't help */
if (!rdev->sb) {
MD_BUG();
continue;
}
}
if (rdev->desc_nr == -1) {
MD_BUG();
continue;
}
desc = &sb->disks[rdev->desc_nr];
disk_idx = desc->raid_disk;
disk = conf->multipaths + disk_idx;
if (!disk_sync(desc))
printk(NOT_IN_SYNC, partition_name(rdev->dev));
/*
* Mark all disks as spare to start with, then pick our
* active disk. If we have a disk that is marked active
* in the sb, then use it, else use the first rdev.
*/
disk->number = desc->number;
disk->raid_disk = desc->raid_disk;
disk->dev = rdev->dev;
disk->operational = 0;
disk->spare = 1;
disk->used_slot = 1;
mark_disk_sync(desc);
if (disk_active(desc)) {
if(!conf->working_disks) {
printk(OPERATIONAL, partition_name(rdev->dev),
desc->raid_disk);
disk->operational = 1;
disk->spare = 0;
conf->working_disks++;
def_rdev = rdev;
} else {
mark_disk_spare(desc);
}
} else
mark_disk_spare(desc);
if(!num_rdevs++) def_rdev = rdev;
}
if(!conf->working_disks && num_rdevs) {
desc = &sb->disks[def_rdev->desc_nr];
disk = conf->multipaths + desc->raid_disk;
printk(OPERATIONAL, partition_name(def_rdev->dev),
disk->raid_disk);
disk->operational = 1;
disk->spare = 0;
conf->working_disks++;
mark_disk_active(desc);
}
/*
* Make sure our active path is in desc spot 0
*/
if(def_rdev->desc_nr != 0) {
rdev = find_rdev_nr(mddev, 0);
desc = &sb->disks[def_rdev->desc_nr];
desc2 = sb->disks;
disk = conf->multipaths + desc->raid_disk;
disk2 = conf->multipaths + desc2->raid_disk;
xchg_values(*desc2,*desc);
xchg_values(*disk2,*disk);
xchg_values(desc2->number, desc->number);
xchg_values(disk2->number, disk->number);
xchg_values(desc2->raid_disk, desc->raid_disk);
xchg_values(disk2->raid_disk, disk->raid_disk);
if(rdev) {
xchg_values(def_rdev->desc_nr,rdev->desc_nr);
} else {
def_rdev->desc_nr = 0;
}
}
conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
sb->failed_disks = 0;
sb->spare_disks = num_rdevs - 1;
mddev->sb_dirty = 1;
conf->mddev = mddev;
conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
init_waitqueue_head(&conf->wait_buffer);
if (!conf->working_disks) {
printk(NONE_OPERATIONAL, mdidx(mddev));
goto out_free_conf;
}
/* pre-allocate some buffer_head structures.
* As a minimum, 1 mpbh and raid_disks buffer_heads
* would probably get us by in tight memory situations,
* but a few more is probably a good idea.
* For now, try NR_RESERVED_BUFS mpbh and
* NR_RESERVED_BUFS*raid_disks buffer_heads.
* This will allow at least NR_RESERVED_BUFS concurrent
* reads or writes even if kmalloc starts failing.
*/
if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
printk(MEM_ERROR, mdidx(mddev));
goto out_free_conf;
}
if ((sb->state & (1 << MD_SB_CLEAN))) {
/*
* we do sanity checks even if the device says
* it's clean ...
*/
if (check_consistency(mddev)) {
printk(SB_DIFFERENCES);
sb->state &= ~(1 << MD_SB_CLEAN);
}
}
{
const char * name = "multipathd";
conf->thread = md_register_thread(multipathd, conf, name);
if (!conf->thread) {
printk(THREAD_ERROR, mdidx(mddev));
goto out_free_conf;
}
}
/*
* Regenerate the "device is in sync with the raid set" bit for
* each device.
*/
for (i = 0; i < MD_SB_DISKS; i++) {
mark_disk_nonsync(sb->disks+i);
for (j = 0; j < sb->raid_disks; j++) {
if (sb->disks[i].number == conf->multipaths[j].number)
mark_disk_sync(sb->disks+i);
}
}
printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
sb->raid_disks, sb->spare_disks);
/*
* Ok, everything is just fine now
*/
return 0;
out_free_conf:
multipath_shrink_mpbh(conf);
kfree(conf);
mddev->private = NULL;
out:
MOD_DEC_USE_COUNT;
return -EIO;
}
#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef SB_DIFFERENCES
#undef ARRAY_IS_ACTIVE
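/*
 * Tear down everything multipath_run() set up: stop the multipathd
 * thread, free the reserve buffers and the private configuration.
 */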
static int multipath_stop (mddev_t *mddev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
md_unregister_thread(conf->thread);
multipath_shrink_mpbh(conf);
kfree(conf);
mddev->private = NULL;
MOD_DEC_USE_COUNT;
return 0;
}
static mdk_personality_t multipath_personality=
{
name: "multipath",
make_request: multipath_make_request,
run: multipath_run,
stop: multipath_stop,
status: multipath_status,
error_handler: multipath_error,
diskop: multipath_diskop,
};
static int md__init multipath_init (void)
{
return register_md_personality (MULTIPATH, &multipath_personality);
}
static void multipath_exit (void)
{
unregister_md_personality (MULTIPATH);
}
module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");