/*
* raid1.c : Multiple Devices driver for Linux
*
* Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
*
* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
*
* RAID-1 management functions.
*
* Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
*
* Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/raid/raid1.h>
#include <asm/atomic.h>
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY
#define MAX_WORK_PER_DISK 128
#define NR_RESERVED_BUFS 32
/*
* The following can be used to debug the driver
*/
#define RAID1_DEBUG 0
#if RAID1_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif
static mdk_personality_t raid1_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
{
/* return a linked list of "cnt" struct buffer_heads.
* don't take any off the free list unless we know we can
* get all we need, otherwise we could deadlock
*/
struct buffer_head *bh=NULL;
while(cnt) {
struct buffer_head *t;
md_spin_lock_irq(&conf->device_lock);
if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
while (cnt) {
t = conf->freebh;
conf->freebh = t->b_next;
t->b_next = bh;
bh = t;
t->b_state = 0;
conf->freebh_cnt--;
cnt--;
}
md_spin_unlock_irq(&conf->device_lock);
if (cnt == 0)
break;
t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
if (t) {
t->b_next = bh;
bh = t;
cnt--;
} else {
PRINTK("raid1: waiting for %d bh\n", cnt);
conf->freebh_blocked = 1;
wait_disk_event(conf->wait_buffer,
!conf->freebh_blocked ||
conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
conf->freebh_blocked = 0;
}
}
return bh;
}
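/*
 * Free a chain of buffer_heads obtained from raid1_alloc_bh(): heads
 * that belong to the pre-allocated pool (b_pprev set by raid1_grow_bh)
 * go back on conf->freebh, the rest are released to the slab cache;
 * then wake up anyone waiting for buffers.
 */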
static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
while (bh) {
struct buffer_head *t = bh;
bh=bh->b_next;
if (t->b_pprev == NULL)
kmem_cache_free(bh_cachep, t);
else {
t->b_next= conf->freebh;
conf->freebh = t;
conf->freebh_cnt++;
}
}
spin_unlock_irqrestore(&conf->device_lock, flags);
wake_up(&conf->wait_buffer);
}
static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
{
/* allocate cnt buffer_heads, possibly less if kmalloc fails */
int i = 0;
while (i < cnt) {
struct buffer_head *bh;
bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
if (!bh) break;
md_spin_lock_irq(&conf->device_lock);
bh->b_pprev = &conf->freebh;
bh->b_next = conf->freebh;
conf->freebh = bh;
conf->freebh_cnt++;
md_spin_unlock_irq(&conf->device_lock);
i++;
}
return i;
}
static void raid1_shrink_bh(raid1_conf_t *conf)
{
/* discard all buffer_heads */
md_spin_lock_irq(&conf->device_lock);
while (conf->freebh) {
struct buffer_head *bh = conf->freebh;
conf->freebh = bh->b_next;
kmem_cache_free(bh_cachep, bh);
conf->freebh_cnt--;
}
md_spin_unlock_irq(&conf->device_lock);
}
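/*
 * Allocate a raid1_bh for a request: prefer the pre-allocated pool,
 * fall back to kmalloc(GFP_NOIO), and as a last resort wait for the
 * pool to be refilled by raid1_free_r1bh().
 */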
static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
{
struct raid1_bh *r1_bh = NULL;
do {
md_spin_lock_irq(&conf->device_lock);
if (!conf->freer1_blocked && conf->freer1) {
r1_bh = conf->freer1;
conf->freer1 = r1_bh->next_r1;
conf->freer1_cnt--;
r1_bh->next_r1 = NULL;
r1_bh->state = (1 << R1BH_PreAlloc);
r1_bh->bh_req.b_state = 0;
}
md_spin_unlock_irq(&conf->device_lock);
if (r1_bh)
return r1_bh;
r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
if (r1_bh) {
memset(r1_bh, 0, sizeof(*r1_bh));
return r1_bh;
}
conf->freer1_blocked = 1;
wait_disk_event(conf->wait_buffer,
!conf->freer1_blocked ||
conf->freer1_cnt > NR_RESERVED_BUFS/2
);
conf->freer1_blocked = 0;
} while (1);
}
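/*
 * Release a raid1_bh: pre-allocated ones go back on conf->freer1,
 * kmalloc'ed ones are kfree'd; the attached mirror bh chain is freed
 * via raid1_free_bh(), which also wakes wait_buffer.
 */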
static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
{
struct buffer_head *bh = r1_bh->mirror_bh_list;
raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
r1_bh->mirror_bh_list = NULL;
if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
r1_bh->next_r1 = conf->freer1;
conf->freer1 = r1_bh;
conf->freer1_cnt++;
spin_unlock_irqrestore(&conf->device_lock, flags);
/* don't need to wake up wait_buffer because
* raid1_free_bh below will do that
*/
} else {
kfree(r1_bh);
}
raid1_free_bh(conf, bh);
}
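/*
 * Pre-allocate "cnt" raid1_bh structures (marked R1BH_PreAlloc) and
 * put them on the free list via raid1_free_r1bh(); returns how many
 * were actually allocated.
 */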
static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
{
int i = 0;
while (i < cnt) {
struct raid1_bh *r1_bh;
r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
if (!r1_bh)
break;
memset(r1_bh, 0, sizeof(*r1_bh));
set_bit(R1BH_PreAlloc, &r1_bh->state);
r1_bh->mddev = conf->mddev;
raid1_free_r1bh(r1_bh);
i++;
}
return i;
}
static void raid1_shrink_r1bh(raid1_conf_t *conf)
{
md_spin_lock_irq(&conf->device_lock);
while (conf->freer1) {
struct raid1_bh *r1_bh = conf->freer1;
conf->freer1 = r1_bh->next_r1;
conf->freer1_cnt--;
kfree(r1_bh);
}
md_spin_unlock_irq(&conf->device_lock);
}
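/*
 * Return a page-backed resync raid1_bh to conf->freebuf and free its
 * mirror bh chain.
 */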
static inline void raid1_free_buf(struct raid1_bh *r1_bh)
{
unsigned long flags;
struct buffer_head *bh = r1_bh->mirror_bh_list;
raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
r1_bh->mirror_bh_list = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
r1_bh->next_r1 = conf->freebuf;
conf->freebuf = r1_bh;
spin_unlock_irqrestore(&conf->device_lock, flags);
raid1_free_bh(conf, bh);
}
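/*
 * Take a pre-allocated resync buffer off conf->freebuf, sleeping
 * until one becomes available.
 */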
static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
{
struct raid1_bh *r1_bh;
md_spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
r1_bh = conf->freebuf;
conf->freebuf = r1_bh->next_r1;
r1_bh->next_r1= NULL;
md_spin_unlock_irq(&conf->device_lock);
return r1_bh;
}
static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
{
int i = 0;
struct raid1_bh *head = NULL, **tail;
tail = &head;
while (i < cnt) {
struct raid1_bh *r1_bh;
struct page *page;
page = alloc_page(GFP_KERNEL);
if (!page)
break;
r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
if (!r1_bh) {
__free_page(page);
break;
}
memset(r1_bh, 0, sizeof(*r1_bh));
r1_bh->bh_req.b_page = page;
r1_bh->bh_req.b_data = page_address(page);
*tail = r1_bh;
r1_bh->next_r1 = NULL;
tail = & r1_bh->next_r1;
i++;
}
/* this lock probably isn't needed, as at the time when
* we are allocating buffers, nobody else will be touching the
* freebuf list. But it doesn't hurt....
*/
md_spin_lock_irq(&conf->device_lock);
*tail = conf->freebuf;
conf->freebuf = head;
md_spin_unlock_irq(&conf->device_lock);
return i;
}
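/*
 * Free all pre-allocated resync buffers and their pages.
 */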
static void raid1_shrink_buffers (raid1_conf_t *conf)
{
struct raid1_bh *head;
md_spin_lock_irq(&conf->device_lock);
head = conf->freebuf;
conf->freebuf = NULL;
md_spin_unlock_irq(&conf->device_lock);
while (head) {
struct raid1_bh *r1_bh = head;
head = r1_bh->next_r1;
__free_page(r1_bh->bh_req.b_page);
kfree(r1_bh);
}
}
static int raid1_map (mddev_t *mddev, kdev_t *rdev)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
int i, disks = MD_SB_DISKS;
unsigned long flags;
/*
* Later we could do read balancing on the read side;
* for now we use the first available disk.
*/
md_spin_lock_irqsave(&conf->device_lock, flags);
for (i = 0; i < disks; i++) {
if (conf->mirrors[i].operational) {
*rdev = conf->mirrors[i].dev;
md_spin_unlock_irqrestore(&conf->device_lock, flags);
return (0);
}
}
md_spin_unlock_irqrestore(&conf->device_lock, flags);
printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
return (-1);
}
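/*
 * Queue a request for the raid1d thread on the global retry list and
 * wake it up: raid1d retries failed reads on another mirror and turns
 * completed sync reads (SPECIAL) into the mirror writes.
 */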
static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
{
unsigned long flags;
mddev_t *mddev = r1_bh->mddev;
raid1_conf_t *conf = mddev_to_conf(mddev);
md_spin_lock_irqsave(&retry_list_lock, flags);
if (raid1_retry_list == NULL)
raid1_retry_tail = &raid1_retry_list;
*raid1_retry_tail = r1_bh;
raid1_retry_tail = &r1_bh->next_r1;
r1_bh->next_r1 = NULL;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
md_wakeup_thread(conf->thread);
}
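/*
 * Account for a completed normal I/O request: decrement the counter of
 * the segment the sector falls in, and wake wait_ready when the PENDING
 * count drains (see the segment scheme above raid1_sync_request).
 */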
static inline void io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
{
unsigned long flags;
spin_lock_irqsave(&conf->segment_lock, flags);
if (sector < conf->start_active)
conf->cnt_done--;
else if (sector >= conf->start_future && conf->phase == phase)
conf->cnt_future--;
else if (!--conf->cnt_pending)
wake_up(&conf->wait_ready);
spin_unlock_irqrestore(&conf->segment_lock, flags);
}
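/*
 * Account for a completed sync request: when the ACTIVE segment drains,
 * advance start_active and wake wait_done.
 */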
static inline void sync_request_done (unsigned long sector, raid1_conf_t *conf)
{
unsigned long flags;
spin_lock_irqsave(&conf->segment_lock, flags);
if (sector >= conf->start_ready)
--conf->cnt_ready;
else if (sector >= conf->start_active) {
if (!--conf->cnt_active) {
conf->start_active = conf->start_ready;
wake_up(&conf->wait_done);
}
}
spin_unlock_irqrestore(&conf->segment_lock, flags);
}
/*
* raid1_end_bh_io() is called when we have finished servicing a mirrored
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
{
struct buffer_head *bh = r1_bh->master_bh;
io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
test_bit(R1BH_SyncPhase, &r1_bh->state));
bh->b_end_io(bh, uptodate);
raid1_free_r1bh(r1_bh);
}
void raid1_end_request (struct buffer_head *bh, int uptodate)
{
struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
if (!uptodate)
md_error (r1_bh->mddev, bh->b_dev);
else
/*
* Set R1BH_Uptodate in our master buffer_head, so that
* we will return a good error code to the higher
* levels even if IO on some other mirrored buffer fails.
*
* The 'master' represents the complex operation to
* user-side. So if something waits for IO, then it will
* wait for the 'master' buffer_head.
*/
set_bit (R1BH_Uptodate, &r1_bh->state);
/*
* We split up the read and write side, imho they are
* conceptually different.
*/
if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
/*
* we have only one buffer_head on the read side
*/
if (uptodate) {
raid1_end_bh_io(r1_bh, uptodate);
return;
}
/*
* oops, read error:
*/
printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
partition_name(bh->b_dev), bh->b_blocknr);
raid1_reschedule_retry(r1_bh);
return;
}
/*
* WRITE:
*
* Let's see if all mirrored write operations have finished
* already.
*/
if (atomic_dec_and_test(&r1_bh->remaining))
raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
}
/*
* This routine returns the disk from which the requested read should
* be done. It keeps track of the last read position for every disk
* in the array and, when a new read request comes in, chooses the
* disk whose last position is nearest to the request.
*
* TODO: if there are 2 mirrors on the same 2 devices, performance
* degrades dramatically because the position is tracked per mirror,
* not per device. This should be changed to be device based. Also,
* atomic sequential reads should be somehow balanced.
*/
static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
{
int new_disk = conf->last_used;
const int sectors = bh->b_size >> 9;
const unsigned long this_sector = bh->b_rsector;
int disk = new_disk;
unsigned long new_distance;
unsigned long current_distance;
/*
* Check if it is sane at all to balance
*/
if (conf->resync_mirrors)
goto rb_out;
#if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
/* Work around a compiler bug in older gcc */
new_disk = *(volatile int *)&new_disk;
#endif
/* make sure that disk is operational */
while( !conf->mirrors[new_disk].operational) {
if (new_disk <= 0) new_disk = conf->raid_disks;
new_disk--;
if (new_disk == disk) {
/*
* This means no working disk was found
* Nothing much to do, lets not change anything
* and hope for the best...
*/
new_disk = conf->last_used;
goto rb_out;
}
}
disk = new_disk;
/* now disk == new_disk == starting point for search */
/*
* Don't touch anything for sequential reads.
*/
if (this_sector == conf->mirrors[new_disk].head_position)
goto rb_out;
/*
* If reads have been done only on a single disk
* for a while, let's give another disk a chance.
* This is for kicking those idling disks so that
* they can find work near some hotspot.
*/
if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
conf->sect_count = 0;
#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
/* Work around a compiler bug in egcs-2.92.11 19980921 */
new_disk = *(volatile int *)&new_disk;
#endif
do {
if (new_disk<=0)
new_disk = conf->raid_disks;
new_disk--;
if (new_disk == disk)
break;
} while ((conf->mirrors[new_disk].write_only) ||
(!conf->mirrors[new_disk].operational));
goto rb_out;
}
current_distance = abs(this_sector -
conf->mirrors[disk].head_position);
/* Find the disk which is closest */
#if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
/* Work around a compiler bug in older gcc */
disk = *(volatile int *)&disk;
#endif
do {
if (disk <= 0)
disk = conf->raid_disks;
disk--;
if ((conf->mirrors[disk].write_only) ||
(!conf->mirrors[disk].operational))
continue;
new_distance = abs(this_sector -
conf->mirrors[disk].head_position);
if (new_distance < current_distance) {
conf->sect_count = 0;
current_distance = new_distance;
new_disk = disk;
}
} while (disk != conf->last_used);
rb_out:
conf->mirrors[new_disk].head_position = this_sector + sectors;
conf->last_used = new_disk;
conf->sect_count += sectors;
return new_disk;
}
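/*
 * Main I/O entry point: reads are sent to a single mirror chosen by
 * raid1_read_balance(); writes are cloned onto one buffer_head per
 * operational mirror and submitted to all of them.
 */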
static int raid1_make_request (mddev_t *mddev, int rw,
struct buffer_head * bh)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
struct buffer_head *bh_req, *bhl;
struct raid1_bh * r1_bh;
int disks = MD_SB_DISKS;
int i, sum_bhs = 0;
struct mirror_info *mirror;
kdev_t dev;
if (!buffer_locked(bh))
BUG();
/*
* make_request() can abort the operation when READA is being
* used and no empty request is available.
*
* Currently, just replace the command with READ/WRITE.
*/
if (rw == READA)
rw = READ;
r1_bh = raid1_alloc_r1bh (conf);
spin_lock_irq(&conf->segment_lock);
wait_event_lock_irq(conf->wait_done,
bh->b_rsector < conf->start_active ||
bh->b_rsector >= conf->start_future,
conf->segment_lock);
if (bh->b_rsector < conf->start_active)
conf->cnt_done++;
else {
conf->cnt_future++;
if (conf->phase)
set_bit(R1BH_SyncPhase, &r1_bh->state);
}
spin_unlock_irq(&conf->segment_lock);
/*
* I think the read and write branches should be separated completely,
* since we want to do read balancing on the read side for example.
* Alternative implementations? :) --mingo
*/
r1_bh->master_bh = bh;
r1_bh->mddev = mddev;
r1_bh->cmd = rw;
if (rw == READ) {
/*
* read balancing logic:
*/
spin_lock_irq(&conf->device_lock);
mirror = conf->mirrors + raid1_read_balance(conf, bh);
dev = mirror->dev;
spin_unlock_irq(&conf->device_lock);
bh_req = &r1_bh->bh_req;
memcpy(bh_req, bh, sizeof(*bh));
bh_req->b_blocknr = bh->b_rsector;
bh_req->b_dev = dev;
bh_req->b_rdev = dev;
/* bh_req->b_rsector = bh->n_rsector; */
bh_req->b_end_io = raid1_end_request;
bh_req->b_private = r1_bh;
generic_make_request (rw, bh_req);
return 0;
}
/*
* WRITE:
*/
bhl = raid1_alloc_bh(conf, conf->raid_disks);
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks; i++) {
struct buffer_head *mbh;
if (!conf->mirrors[i].operational)
continue;
/*
* We should use a private pool (size depending on NR_REQUEST),
* to avoid writes filling up the memory with bhs
*
* Such pools are much faster than kmalloc anyway (so we waste
* almost nothing by not using the master bh when writing and
* win a lot of cleanness) but for now we are cool enough. --mingo
*
* It's safe to sleep here, buffer heads cannot be used in a shared
* manner in the write branch. Look how we lock the buffer at the
* beginning of this function to grok the difference ;)
*/
mbh = bhl;
if (mbh == NULL) {
MD_BUG();
break;
}
bhl = mbh->b_next;
mbh->b_next = NULL;
mbh->b_this_page = (struct buffer_head *)1;
/*
* prepare mirrored mbh (fields ordered for max mem throughput):
*/
mbh->b_blocknr = bh->b_rsector;
mbh->b_dev = conf->mirrors[i].dev;
mbh->b_rdev = conf->mirrors[i].dev;
mbh->b_rsector = bh->b_rsector;
mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
(1<<BH_Mapped) | (1<<BH_Lock);
atomic_set(&mbh->b_count, 1);
mbh->b_size = bh->b_size;
mbh->b_page = bh->b_page;
mbh->b_data = bh->b_data;
mbh->b_list = BUF_LOCKED;
mbh->b_end_io = raid1_end_request;
mbh->b_private = r1_bh;
mbh->b_next = r1_bh->mirror_bh_list;
r1_bh->mirror_bh_list = mbh;
sum_bhs++;
}
spin_unlock_irq(&conf->device_lock);
if (bhl) raid1_free_bh(conf,bhl);
if (!sum_bhs) {
/* Gag - all mirrors non-operational.. */
raid1_end_bh_io(r1_bh, 0);
return 0;
}
md_atomic_set(&r1_bh->remaining, sum_bhs);
/*
* We have to be a bit careful about the semaphore above, that's
* why we start the requests separately. Since kmalloc() can
* fail and sleep, and make_request() can sleep too, this is the
* safer solution. Imagine end_request decreasing the semaphore
* before we could have set it up ... We could play tricks with
* the semaphore (presetting it and correcting at the end if
* sum_bhs is not 'n'), but then we would have to do end_request by
* hand if all requests finished before we had a chance to set up
* the semaphore correctly ... lots of races.
*/
bh = r1_bh->mirror_bh_list;
while(bh) {
struct buffer_head *bh2 = bh;
bh = bh->b_next;
generic_make_request(rw, bh2);
}
return (0);
}
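/*
 * Report array status (shown in /proc/mdstat): [raid/working] disk
 * counts, plus one 'U' (up) or '_' (failed) character per mirror.
 */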
static void raid1_status(struct seq_file *seq, mddev_t *mddev)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
int i;
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
conf->working_disks);
for (i = 0; i < conf->raid_disks; i++)
seq_printf(seq, "%s",
conf->mirrors[i].operational ? "U" : "_");
seq_printf(seq, "]");
}
#define LAST_DISK KERN_ALERT \
"raid1: only one disk left and IO error.\n"
#define NO_SPARE_DISK KERN_ALERT \
"raid1: no spare disk left, degrading mirror level by one.\n"
#define DISK_FAILED KERN_ALERT \
"raid1: Disk failure on %s, disabling device. \n" \
" Operation continuing on %d devices\n"
#define START_SYNCING KERN_ALERT \
"raid1: start syncing spare disk.\n"
#define ALREADY_SYNCING KERN_INFO \
"raid1: syncing already in progress.\n"
static void mark_disk_bad (mddev_t *mddev, int failed)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
struct mirror_info *mirror = conf->mirrors+failed;
mdp_super_t *sb = mddev->sb;
mirror->operational = 0;
mark_disk_faulty(sb->disks+mirror->number);
mark_disk_nonsync(sb->disks+mirror->number);
mark_disk_inactive(sb->disks+mirror->number);
if (!mirror->write_only)
sb->active_disks--;
else
sb->spare_disks--;
sb->working_disks--;
sb->failed_disks++;
mddev->sb_dirty = 1;
md_wakeup_thread(conf->thread);
if (!mirror->write_only)
conf->working_disks--;
printk (DISK_FAILED, partition_name (mirror->dev),
conf->working_disks);
}
static int raid1_error (mddev_t *mddev, kdev_t dev)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
struct mirror_info * mirrors = conf->mirrors;
int disks = MD_SB_DISKS;
int i;
unsigned long flags;
/* Find the drive.
* If it is not operational, then we have already marked it as dead
* else if it is the last working disks, ignore the error, let the
* next level up know.
* else mark the drive as failed
*/
for (i = 0; i < disks; i++)
if (mirrors[i].dev==dev && mirrors[i].operational)
break;
if (i == disks)
return 0;
if (i < conf->raid_disks && conf->working_disks == 1) {
/* Don't fail the drive, act as though we were just a
* normal single drive
*/
return 1;
}
md_spin_lock_irqsave(&conf->device_lock, flags);
mark_disk_bad(mddev, i);
md_spin_unlock_irqrestore(&conf->device_lock, flags);
return 0;
}
#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED
#undef START_SYNCING
static void print_raid1_conf (raid1_conf_t *conf)
{
int i;
struct mirror_info *tmp;
printk("RAID1 conf printout:\n");
if (!conf) {
printk("(conf==NULL)\n");
return;
}
printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
conf->raid_disks, conf->nr_disks);
for (i = 0; i < MD_SB_DISKS; i++) {
tmp = conf->mirrors + i;
printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
i, tmp->spare,tmp->operational,
tmp->number,tmp->raid_disk,tmp->used_slot,
partition_name(tmp->dev));
}
}
static void close_sync(raid1_conf_t *conf)
{
mddev_t *mddev = conf->mddev;
/* If reconstruction was interrupted, we need to close the "active" and "pending"
* holes.
* we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
*/
/* this is really needed when recovery stops too... */
spin_lock_irq(&conf->segment_lock);
conf->start_active = conf->start_pending;
conf->start_ready = conf->start_pending;
wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
conf->start_future = (mddev->sb->size<<1)+1;
conf->cnt_pending = conf->cnt_future;
conf->cnt_future = 0;
conf->phase = conf->phase ^1;
wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
conf->phase = 0;
conf->cnt_future = conf->cnt_done;
conf->cnt_done = 0;
spin_unlock_irq(&conf->segment_lock);
wake_up(&conf->wait_done);
}
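/*
 * Handle disk operations requested by the md core: activate or
 * deactivate a spare, hot-add a disk, or hot-remove a failed one,
 * updating both the mirror table and the superblock descriptors.
 */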
static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
int err = 0;
int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
raid1_conf_t *conf = mddev->private;
struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
mdp_super_t *sb = mddev->sb;
mdp_disk_t *failed_desc, *spare_desc, *added_desc;
mdk_rdev_t *spare_rdev, *failed_rdev;
if (conf->resync_mirrors)
return 1; /* Cannot do any diskops during a resync */
switch (state) {
case DISKOP_SPARE_ACTIVE:
case DISKOP_SPARE_INACTIVE:
/* need to wait for pending sync io before locking device */
close_sync(conf);
}
md_spin_lock_irq(&conf->device_lock);
/*
* Need the conf lock when printing out state else we get BUG()s
*/
print_raid1_conf(conf);
/*
* find the disk ...
*/
switch (state) {
case DISKOP_SPARE_ACTIVE:
/*
* Find the failed disk within the RAID1 configuration ...
* (this can only be in the first conf->working_disks part)
*/
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i;
if ((!tmp->operational && !tmp->spare) ||
!tmp->used_slot) {
failed_disk = i;
break;
}
}
/*
* When we activate a spare disk we _must_ have a disk in
* the lower (active) part of the array to replace.
*/
if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
MD_BUG();
err = 1;
goto abort;
}
/* fall through */
case DISKOP_SPARE_WRITE:
case DISKOP_SPARE_INACTIVE:
/*
* Find the spare disk ... (can only be in the 'high'
* area of the array)
*/
for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
tmp = conf->mirrors + i;
if (tmp->spare && tmp->number == (*d)->number) {
spare_disk = i;
break;
}
}
if (spare_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
break;
case DISKOP_HOT_REMOVE_DISK:
for (i = 0; i < MD_SB_DISKS; i++) {
tmp = conf->mirrors + i;
if (tmp->used_slot && (tmp->number == (*d)->number)) {
if (tmp->operational) {
err = -EBUSY;
goto abort;
}
removed_disk = i;
break;
}
}
if (removed_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
break;
case DISKOP_HOT_ADD_DISK:
for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
tmp = conf->mirrors + i;
if (!tmp->used_slot) {
added_disk = i;
break;
}
}
if (added_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
break;
}
switch (state) {
/*
* Switch the spare disk to write-only mode:
*/
case DISKOP_SPARE_WRITE:
sdisk = conf->mirrors + spare_disk;
sdisk->operational = 1;
sdisk->write_only = 1;
break;
/*
* Deactivate a spare disk:
*/
case DISKOP_SPARE_INACTIVE:
if (conf->start_future > 0) {
MD_BUG();
err = -EBUSY;
break;
}
sdisk = conf->mirrors + spare_disk;
sdisk->operational = 0;
sdisk->write_only = 0;
break;
/*
* Activate (mark read-write) the (now sync) spare disk,
* which means we switch its 'raid position' (->raid_disk)
* with the failed disk. (only the first 'conf->nr_disks'
* slots are used for 'real' disks and we must preserve this
* property)
*/
case DISKOP_SPARE_ACTIVE:
if (conf->start_future > 0) {
MD_BUG();
err = -EBUSY;
break;
}
sdisk = conf->mirrors + spare_disk;
fdisk = conf->mirrors + failed_disk;
spare_desc = &sb->disks[sdisk->number];
failed_desc = &sb->disks[fdisk->number];
if (spare_desc != *d) {
MD_BUG();
err = 1;
goto abort;
}
if (spare_desc->raid_disk != sdisk->raid_disk) {
MD_BUG();
err = 1;
goto abort;
}
if (sdisk->raid_disk != spare_disk) {
MD_BUG();
err = 1;
goto abort;
}
if (failed_desc->raid_disk != fdisk->raid_disk) {
MD_BUG();
err = 1;
goto abort;
}
if (fdisk->raid_disk != failed_disk) {
MD_BUG();
err = 1;
goto abort;
}
/*
* do the switch finally
*/
spare_rdev = find_rdev_nr(mddev, spare_desc->number);
failed_rdev = find_rdev_nr(mddev, failed_desc->number);
/* There must be a spare_rdev, but there may not be a
* failed_rdev. That slot might be empty...
*/
spare_rdev->desc_nr = failed_desc->number;
if (failed_rdev)
failed_rdev->desc_nr = spare_desc->number;
xchg_values(*spare_desc, *failed_desc);
xchg_values(*fdisk, *sdisk);
/*
* (careful, 'failed' and 'spare' are switched from now on)
*
* we want to preserve linear numbering and we want to
* give the proper raid_disk number to the now activated
* disk. (this means we switch back these values)
*/
xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
xchg_values(sdisk->raid_disk, fdisk->raid_disk);
xchg_values(spare_desc->number, failed_desc->number);
xchg_values(sdisk->number, fdisk->number);
*d = failed_desc;
if (sdisk->dev == MKDEV(0,0))
sdisk->used_slot = 0;
/*
* this really activates the spare.
*/
fdisk->spare = 0;
fdisk->write_only = 0;
/*
* if we activate a spare, we definitely replace a
* non-operational disk slot in the 'low' area of
* the disk array.
*/
conf->working_disks++;
break;
case DISKOP_HOT_REMOVE_DISK:
rdisk = conf->mirrors + removed_disk;
if (rdisk->spare && (removed_disk < conf->raid_disks)) {
MD_BUG();
err = 1;
goto abort;
}
rdisk->dev = MKDEV(0,0);
rdisk->used_slot = 0;
conf->nr_disks--;
break;
case DISKOP_HOT_ADD_DISK:
adisk = conf->mirrors + added_disk;
added_desc = *d;
if (added_disk != added_desc->number) {
MD_BUG();
err = 1;
goto abort;
}
adisk->number = added_desc->number;
adisk->raid_disk = added_desc->raid_disk;
adisk->dev = MKDEV(added_desc->major,added_desc->minor);
adisk->operational = 0;
adisk->write_only = 0;
adisk->spare = 1;
adisk->used_slot = 1;
adisk->head_position = 0;
conf->nr_disks++;
break;
default:
MD_BUG();
err = 1;
goto abort;
}
abort:
print_raid1_conf(conf);
md_spin_unlock_irq(&conf->device_lock);
if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
/* should move to "END_REBUILD" when such exists */
raid1_shrink_buffers(conf);
return err;
}
#define IO_ERROR KERN_ALERT \
"raid1: %s: unrecoverable I/O read error for block %lu\n"
#define REDIRECT_SECTOR KERN_ERR \
"raid1: %s: redirecting sector %lu to another mirror\n"
/*
* This is a kernel thread which:
*
* 1. Retries failed read operations on working mirrors.
* 2. Updates the raid superblock when problems are encountered.
* 3. Performs writes following reads for array synchronising.
*/
static void end_sync_write(struct buffer_head *bh, int uptodate);
static void end_sync_read(struct buffer_head *bh, int uptodate);
static void raid1d (void *data)
{
struct raid1_bh *r1_bh;
struct buffer_head *bh;
unsigned long flags;
raid1_conf_t *conf = data;
mddev_t *mddev = conf->mddev;
kdev_t dev;
if (mddev->sb_dirty)
md_update_sb(mddev);
for (;;) {
md_spin_lock_irqsave(&retry_list_lock, flags);
r1_bh = raid1_retry_list;
if (!r1_bh)
break;
raid1_retry_list = r1_bh->next_r1;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
mddev = r1_bh->mddev;
bh = &r1_bh->bh_req;
switch(r1_bh->cmd) {
case SPECIAL:
/* have to allocate lots of bh structures and
* schedule writes
*/
if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
int i, sum_bhs = 0;
int disks = MD_SB_DISKS;
struct buffer_head *bhl, *mbh;
conf = mddev_to_conf(mddev);
bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks ; i++) {
if (!conf->mirrors[i].operational)
continue;
if (i==conf->last_used)
/* we read from here, no need to write */
continue;
if (i < conf->raid_disks
&& !conf->resync_mirrors)
/* don't need to write this,
* we are just rebuilding */
continue;
mbh = bhl;
if (!mbh) {
MD_BUG();
break;
}
bhl = mbh->b_next;
mbh->b_this_page = (struct buffer_head *)1;
/*
* prepare mirrored bh (fields ordered for max mem throughput):
*/
mbh->b_blocknr = bh->b_blocknr;
mbh->b_dev = conf->mirrors[i].dev;
mbh->b_rdev = conf->mirrors[i].dev;
mbh->b_rsector = bh->b_blocknr;
mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
(1<<BH_Mapped) | (1<<BH_Lock);
atomic_set(&mbh->b_count, 1);
mbh->b_size = bh->b_size;
mbh->b_page = bh->b_page;
mbh->b_data = bh->b_data;
mbh->b_list = BUF_LOCKED;
mbh->b_end_io = end_sync_write;
mbh->b_private = r1_bh;
mbh->b_next = r1_bh->mirror_bh_list;
r1_bh->mirror_bh_list = mbh;
sum_bhs++;
}
spin_unlock_irq(&conf->device_lock);
md_atomic_set(&r1_bh->remaining, sum_bhs);
if (bhl) raid1_free_bh(conf, bhl);
mbh = r1_bh->mirror_bh_list;
if (!sum_bhs) {
/* nowhere to write this to ... I guess we
* must be done
*/
sync_request_done(bh->b_blocknr, conf);
md_done_sync(mddev, bh->b_size>>9, 0);
raid1_free_buf(r1_bh);
} else
while (mbh) {
struct buffer_head *bh1 = mbh;
mbh = mbh->b_next;
generic_make_request(WRITE, bh1);
md_sync_acct(bh1->b_dev, bh1->b_size/512);
}
} else {
/* There is no point trying a read-for-reconstruct
* as the reconstruction is about to be aborted
*/
printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
md_done_sync(mddev, bh->b_size>>9, 0);
}
break;
case READ:
case READA:
dev = bh->b_dev;
raid1_map (mddev, &bh->b_dev);
if (bh->b_dev == dev) {
printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
raid1_end_bh_io(r1_bh, 0);
} else {
printk (REDIRECT_SECTOR,
partition_name(bh->b_dev), bh->b_blocknr);
bh->b_rdev = bh->b_dev;
bh->b_rsector = bh->b_blocknr;
generic_make_request (r1_bh->cmd, bh);
}
break;
}
}
md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR
/*
* Private kernel thread to reconstruct mirrors after an unclean
* shutdown.
*/
static void raid1syncd (void *data)
{
raid1_conf_t *conf = data;
mddev_t *mddev = conf->mddev;
if (!conf->resync_mirrors)
return;
if (conf->resync_mirrors == 2)
return;
down(&mddev->recovery_sem);
if (!md_do_sync(mddev, NULL)) {
/*
* Only if everything went Ok.
*/
conf->resync_mirrors = 0;
}
close_sync(conf);
up(&mddev->recovery_sem);
raid1_shrink_buffers(conf);
md_recover_arrays(); /* in case we are degraded and a spare is available */
}
/*
* perform a "sync" on one "block"
*
* We need to make sure that no normal I/O request - particularly write
* requests - conflict with active sync requests.
* This is achieved by conceptually dividing the device space into a
* number of sections:
* DONE: 0 .. a-1 These blocks are in-sync
* ACTIVE: a.. b-1 These blocks may have active sync requests, but
* no normal IO requests
* READY: b .. c-1 These blocks have no normal IO requests - sync
* request may be happening
* PENDING: c .. d-1 These blocks may have IO requests, but no new
* ones will be added
* FUTURE: d .. end These blocks are not to be considered yet. IO may
* be happening, but not sync
*
* We keep a
* phase which flips (0 or 1) each time d moves and
* a count of:
* z = active io requests in FUTURE since d moved - marked with
* current phase
* y = active io requests in FUTURE before d moved, or PENDING -
* marked with previous phase
* x = active sync requests in READY
* w = active sync requests in ACTIVE
* v = active io requests in DONE
*
* Normally, a=b=c=d=0 and z= active io requests
* or a=b=c=d=END and v= active io requests
* Allowed changes to a,b,c,d:
* A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
* B: y==0 -> c=d
* C: b=c, w+=x, x=0
* D: w==0 -> a=b
* E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
*
* At start of sync we apply A.
* When y reaches 0, we apply B then A, then begin sync requests.
* When the sync point reaches c-1, we wait for y==0 and w==0, and
* then apply B then A then D then C.
* Finally, we apply E.
*
* The sync request simply issues a "read" against a working drive
* This is marked so that on completion the raid1d thread is woken to
* issue suitable write requests
*/
static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
struct mirror_info *mirror;
struct raid1_bh *r1_bh;
struct buffer_head *bh;
int bsize;
int disk;
int block_nr;
int buffs;
kdev_t dev;
if (!sector_nr) {
/* we want enough buffers to hold twice the window of 128 */
buffs = 128 *2 / (PAGE_SIZE>>9);
buffs = raid1_grow_buffers(conf, buffs);
if (buffs < 2)
goto nomem;
conf->window = buffs*(PAGE_SIZE>>9)/2;
}
spin_lock_irq(&conf->segment_lock);
if (!sector_nr) {
/* initialize ...*/
conf->start_active = 0;
conf->start_ready = 0;
conf->start_pending = 0;
conf->start_future = 0;
conf->phase = 0;
conf->cnt_future += conf->cnt_done+conf->cnt_pending;
conf->cnt_done = conf->cnt_pending = 0;
if (conf->cnt_ready || conf->cnt_active)
MD_BUG();
}
while (sector_nr >= conf->start_pending) {
PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
wait_event_lock_irq(conf->wait_done,
!conf->cnt_active,
conf->segment_lock);
wait_event_lock_irq(conf->wait_ready,
!conf->cnt_pending,
conf->segment_lock);
conf->start_active = conf->start_ready;
conf->start_ready = conf->start_pending;
conf->start_pending = conf->start_future;
conf->start_future = conf->start_future+conf->window;
// Note: falling off the end is not a problem
conf->phase = conf->phase ^1;
conf->cnt_active = conf->cnt_ready;
conf->cnt_ready = 0;
conf->cnt_pending = conf->cnt_future;
conf->cnt_future = 0;
wake_up(&conf->wait_done);
}
conf->cnt_ready++;
spin_unlock_irq(&conf->segment_lock);
/* If reconstructing, and >1 working disc,
* could dedicate one to rebuild and others to
* service read requests ..
*/
spin_lock_irq(&conf->device_lock);
disk = conf->last_used;
/* make sure disk is operational */
while (!conf->mirrors[disk].operational) {
if (disk <= 0) disk = conf->raid_disks;
disk--;
if (disk == conf->last_used)
break;
}
conf->last_used = disk;
mirror = conf->mirrors+conf->last_used;
dev = mirror->dev;
spin_unlock_irq(&conf->device_lock);
r1_bh = raid1_alloc_buf (conf);
r1_bh->master_bh = NULL;
r1_bh->mddev = mddev;
r1_bh->cmd = SPECIAL;
bh = &r1_bh->bh_req;
block_nr = sector_nr;
bsize = 512;
while (!(block_nr & 1) && bsize < PAGE_SIZE
&& (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
block_nr >>= 1;
bsize <<= 1;
}
bh->b_size = bsize;
bh->b_list = BUF_LOCKED;
bh->b_dev = dev;
bh->b_rdev = dev;
bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
if (!bh->b_page)
BUG();
if (!bh->b_data)
BUG();
if (bh->b_data != page_address(bh->b_page))
BUG();
bh->b_end_io = end_sync_read;
bh->b_private = r1_bh;
bh->b_blocknr = sector_nr;
bh->b_rsector = sector_nr;
init_waitqueue_head(&bh->b_wait);
generic_make_request(READ, bh);
md_sync_acct(bh->b_dev, bh->b_size/512);
return (bsize >> 9);
nomem:
raid1_shrink_buffers(conf);
return -ENOMEM;
}
static void end_sync_read(struct buffer_head *bh, int uptodate)
{
struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
/* we have read a block, now it needs to be re-written,
* or re-read if the read failed.
* We don't do much here, just schedule handling by raid1d
*/
if (!uptodate)
md_error (r1_bh->mddev, bh->b_dev);
else
set_bit(R1BH_Uptodate, &r1_bh->state);
raid1_reschedule_retry(r1_bh);
}
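/*
 * Completion handler for the resync writes scheduled by raid1d: when
 * the last mirror write finishes, update the window bookkeeping and
 * report the result to the md layer via md_done_sync().
 */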
static void end_sync_write(struct buffer_head *bh, int uptodate)
{
struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
if (!uptodate)
md_error (r1_bh->mddev, bh->b_dev);
if (atomic_dec_and_test(&r1_bh->remaining)) {
mddev_t *mddev = r1_bh->mddev;
unsigned long sect = bh->b_blocknr;
int size = bh->b_size;
raid1_free_buf(r1_bh);
sync_request_done(sect, mddev_to_conf(mddev));
md_done_sync(mddev,size>>9, uptodate);
}
}
#define INVALID_LEVEL KERN_WARNING \
"raid1: md%d: raid level not set to mirroring (%d)\n"
#define NO_SB KERN_ERR \
"raid1: disabled mirror %s (couldn't access raid superblock)\n"
#define ERRORS KERN_ERR \
"raid1: disabled mirror %s (errors detected)\n"
#define NOT_IN_SYNC KERN_ERR \
"raid1: disabled mirror %s (not in sync)\n"
#define INCONSISTENT KERN_ERR \
"raid1: disabled mirror %s (inconsistent descriptor)\n"
#define ALREADY_RUNNING KERN_ERR \
"raid1: disabled mirror %s (mirror %d already operational)\n"
#define OPERATIONAL KERN_INFO \
"raid1: device %s operational as mirror %d\n"
#define MEM_ERROR KERN_ERR \
"raid1: couldn't allocate memory for md%d\n"
#define SPARE KERN_INFO \
"raid1: spare disk %s\n"
#define NONE_OPERATIONAL KERN_ERR \
"raid1: no operational mirrors for md%d\n"
#define ARRAY_IS_ACTIVE KERN_INFO \
"raid1: raid set md%d active with %d out of %d mirrors\n"
#define THREAD_ERROR KERN_ERR \
"raid1: couldn't allocate thread for md%d\n"
#define START_RESYNC KERN_WARNING \
"raid1: raid set md%d not clean; reconstructing mirrors\n"
static int raid1_run (mddev_t *mddev)
{
raid1_conf_t *conf;
int i, j, disk_idx;
struct mirror_info *disk;
mdp_super_t *sb = mddev->sb;
mdp_disk_t *descriptor;
mdk_rdev_t *rdev;
struct md_list_head *tmp;
int start_recovery = 0;
MOD_INC_USE_COUNT;
if (sb->level != 1) {
printk(INVALID_LEVEL, mdidx(mddev), sb->level);
goto out;
}
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in raid1_run(),
* should be freed in raid1_stop()]
*/
conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
mddev->private = conf;
if (!conf) {
printk(MEM_ERROR, mdidx(mddev));
goto out;
}
memset(conf, 0, sizeof(*conf));
ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev->faulty) {
printk(ERRORS, partition_name(rdev->dev));
} else {
if (!rdev->sb) {
MD_BUG();
continue;
}
}
if (rdev->desc_nr == -1) {
MD_BUG();
continue;
}
descriptor = &sb->disks[rdev->desc_nr];
disk_idx = descriptor->raid_disk;
disk = conf->mirrors + disk_idx;
if (disk_faulty(descriptor)) {
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = rdev->dev;
disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
disk->head_position = 0;
continue;
}
if (disk_active(descriptor)) {
if (!disk_sync(descriptor)) {
printk(NOT_IN_SYNC,
partition_name(rdev->dev));
continue;
}
if ((descriptor->number > MD_SB_DISKS) ||
(disk_idx > sb->raid_disks)) {
printk(INCONSISTENT,
partition_name(rdev->dev));
continue;
}
if (disk->operational) {
printk(ALREADY_RUNNING,
partition_name(rdev->dev),
disk_idx);
continue;
}
printk(OPERATIONAL, partition_name(rdev->dev),
disk_idx);
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = rdev->dev;
disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 1;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
disk->head_position = 0;
conf->working_disks++;
} else {
/*
* Must be a spare disk ..
*/
printk(SPARE, partition_name(rdev->dev));
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = rdev->dev;
disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 1;
disk->used_slot = 1;
disk->head_position = 0;
}
}
conf->raid_disks = sb->raid_disks;
conf->nr_disks = sb->nr_disks;
conf->mddev = mddev;
conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
init_waitqueue_head(&conf->wait_buffer);
init_waitqueue_head(&conf->wait_done);
init_waitqueue_head(&conf->wait_ready);
if (!conf->working_disks) {
printk(NONE_OPERATIONAL, mdidx(mddev));
goto out_free_conf;
}
/* pre-allocate some buffer_head structures.
* As a minimum, 1 r1bh and raid_disks buffer_heads
* would probably get us by in tight memory situations,
* but a few more is probably a good idea.
* For now, try NR_RESERVED_BUFS r1bh and
* NR_RESERVED_BUFS*raid_disks buffer_heads.
* This will allow at least NR_RESERVED_BUFS concurrent
* reads or writes even if kmalloc starts failing.
*/
if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
< NR_RESERVED_BUFS*conf->raid_disks) {
printk(MEM_ERROR, mdidx(mddev));
goto out_free_conf;
}
for (i = 0; i < MD_SB_DISKS; i++) {
descriptor = sb->disks+i;
disk_idx = descriptor->raid_disk;
disk = conf->mirrors + disk_idx;
if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
!disk->used_slot) {
disk->number = descriptor->number;
disk->raid_disk = disk_idx;
disk->dev = MKDEV(0,0);
disk->operational = 0;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
disk->head_position = 0;
}
}
/*
* find the first working one and use it as a starting point
* for read balancing.
*/
for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
/* nothing */;
conf->last_used = j;
{
const char * name = "raid1d";
conf->thread = md_register_thread(raid1d, conf, name);
if (!conf->thread) {
printk(THREAD_ERROR, mdidx(mddev));
goto out_free_conf;
}
}
if (!(sb->state & (1 << MD_SB_CLEAN)) &&
(conf->working_disks > 1)) {
const char * name = "raid1syncd";
conf->resync_thread = md_register_thread(raid1syncd, conf,name);
if (!conf->resync_thread) {
printk(THREAD_ERROR, mdidx(mddev));
goto out_free_conf;
}
printk(START_RESYNC, mdidx(mddev));
conf->resync_mirrors = 1;
md_wakeup_thread(conf->resync_thread);
} else if (conf->working_disks != sb->raid_disks) {
printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
start_recovery = 1;
}
/*
* Regenerate the "device is in sync with the raid set" bit for
* each device.
*/
for (i = 0; i < MD_SB_DISKS; i++) {
mark_disk_nonsync(sb->disks+i);
for (j = 0; j < sb->raid_disks; j++) {
if (!conf->mirrors[j].operational)
continue;
if (sb->disks[i].number == conf->mirrors[j].number)
mark_disk_sync(sb->disks+i);
}
}
sb->active_disks = conf->working_disks;
if (start_recovery)
md_recover_arrays();
printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
/*
* Ok, everything is just fine now
*/
return 0;
out_free_conf:
raid1_shrink_r1bh(conf);
raid1_shrink_bh(conf);
raid1_shrink_buffers(conf);
kfree(conf);
mddev->private = NULL;
out:
MOD_DEC_USE_COUNT;
return -EIO;
}
#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef ARRAY_IS_ACTIVE
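/*
 * Called by the md core to interrupt a running resync: mark it as
 * interrupted (resync_mirrors = 2) and signal the resync thread, so
 * the resync can be restarted later by raid1_restart_resync().
 */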
static int raid1_stop_resync (mddev_t *mddev)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
if (conf->resync_thread) {
if (conf->resync_mirrors) {
conf->resync_mirrors = 2;
md_interrupt_thread(conf->resync_thread);
printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
return 1;
}
return 0;
}
return 0;
}
static int raid1_restart_resync (mddev_t *mddev)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
if (conf->resync_mirrors) {
if (!conf->resync_thread) {
MD_BUG();
return 0;
}
conf->resync_mirrors = 1;
md_wakeup_thread(conf->resync_thread);
return 1;
}
return 0;
}
static int raid1_stop (mddev_t *mddev)
{
raid1_conf_t *conf = mddev_to_conf(mddev);
md_unregister_thread(conf->thread);
if (conf->resync_thread)
md_unregister_thread(conf->resync_thread);
raid1_shrink_r1bh(conf);
raid1_shrink_bh(conf);
raid1_shrink_buffers(conf);
kfree(conf);
mddev->private = NULL;
MOD_DEC_USE_COUNT;
return 0;
}
static mdk_personality_t raid1_personality=
{
name: "raid1",
make_request: raid1_make_request,
run: raid1_run,
stop: raid1_stop,
status: raid1_status,
error_handler: raid1_error,
diskop: raid1_diskop,
stop_resync: raid1_stop_resync,
restart_resync: raid1_restart_resync,
sync_request: raid1_sync_request
};
static int md__init raid1_init (void)
{
return register_md_personality (RAID1, &raid1_personality);
}
static void raid1_exit (void)
{
unregister_md_personality (RAID1);
}
module_init(raid1_init);
module_exit(raid1_exit);
MODULE_LICENSE("GPL");