| /* |
| * raid1.c : Multiple Devices driver for Linux |
| * |
| * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat |
| * |
| * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman |
| * |
| * RAID-1 management functions. |
| * |
| * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 |
| * |
| * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk> |
| * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2, or (at your option) |
| * any later version. |
| * |
| * You should have received a copy of the GNU General Public License |
| * (for example /usr/src/linux/COPYING); if not, write to the Free |
| * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| */ |
| |
| #include <linux/module.h> |
| #include <linux/config.h> |
| #include <linux/slab.h> |
| #include <linux/raid/raid1.h> |
| #include <asm/atomic.h> |
| |
| #define MAJOR_NR MD_MAJOR |
| #define MD_DRIVER |
| #define MD_PERSONALITY |
| |
| #define MAX_WORK_PER_DISK 128 |
| |
| #define NR_RESERVED_BUFS 32 |
| |
| |
| /* |
| * The following can be used to debug the driver |
| */ |
| #define RAID1_DEBUG 0 |
| |
| #if RAID1_DEBUG |
| #define PRINTK(x...) printk(x) |
| #define inline |
| #define __inline__ |
| #else |
| #define PRINTK(x...) do { } while (0) |
| #endif |
| |
| |
| static mdk_personality_t raid1_personality; |
| static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; |
| struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; |
| |
| static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) |
| { |
| /* return a linked list of "cnt" struct buffer_heads. |
| * don't take any off the free list unless we know we can |
| * get all we need, otherwise we could deadlock |
| */ |
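| /* (If we handed out buffer_heads from the free list piecemeal, two |
| * writers could each hold part of the reserve while sleeping for |
| * the rest, and neither would ever finish and return its buffers.) |
| */ |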
| struct buffer_head *bh=NULL; |
| |
| while(cnt) { |
| struct buffer_head *t; |
| md_spin_lock_irq(&conf->device_lock); |
| if (!conf->freebh_blocked && conf->freebh_cnt >= cnt) |
| while (cnt) { |
| t = conf->freebh; |
| conf->freebh = t->b_next; |
| t->b_next = bh; |
| bh = t; |
| t->b_state = 0; |
| conf->freebh_cnt--; |
| cnt--; |
| } |
| md_spin_unlock_irq(&conf->device_lock); |
| if (cnt == 0) |
| break; |
| t = kmem_cache_alloc(bh_cachep, SLAB_NOIO); |
| if (t) { |
| t->b_next = bh; |
| bh = t; |
| cnt--; |
| } else { |
| PRINTK("raid1: waiting for %d bh\n", cnt); |
| conf->freebh_blocked = 1; |
| wait_disk_event(conf->wait_buffer, |
| !conf->freebh_blocked || |
| conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2); |
| conf->freebh_blocked = 0; |
| } |
| } |
| return bh; |
| } |
| |
| static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) |
| { |
| unsigned long flags; |
| spin_lock_irqsave(&conf->device_lock, flags); |
| while (bh) { |
| struct buffer_head *t = bh; |
| bh=bh->b_next; |
| if (t->b_pprev == NULL) |
| kmem_cache_free(bh_cachep, t); |
| else { |
| t->b_next= conf->freebh; |
| conf->freebh = t; |
| conf->freebh_cnt++; |
| } |
| } |
| spin_unlock_irqrestore(&conf->device_lock, flags); |
| wake_up(&conf->wait_buffer); |
| } |
| |
| static int raid1_grow_bh(raid1_conf_t *conf, int cnt) |
| { |
| /* allocate cnt buffer_heads, possibly fewer if kmalloc fails */ |
| int i = 0; |
| |
| while (i < cnt) { |
| struct buffer_head *bh; |
| bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); |
| if (!bh) break; |
| |
| md_spin_lock_irq(&conf->device_lock); |
| bh->b_pprev = &conf->freebh; |
| bh->b_next = conf->freebh; |
| conf->freebh = bh; |
| conf->freebh_cnt++; |
| md_spin_unlock_irq(&conf->device_lock); |
| |
| i++; |
| } |
| return i; |
| } |
| |
| static void raid1_shrink_bh(raid1_conf_t *conf) |
| { |
| /* discard all buffer_heads */ |
| |
| md_spin_lock_irq(&conf->device_lock); |
| while (conf->freebh) { |
| struct buffer_head *bh = conf->freebh; |
| conf->freebh = bh->b_next; |
| kmem_cache_free(bh_cachep, bh); |
| conf->freebh_cnt--; |
| } |
| md_spin_unlock_irq(&conf->device_lock); |
| } |
| |
| |
| static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) |
| { |
| struct raid1_bh *r1_bh = NULL; |
| |
| do { |
| md_spin_lock_irq(&conf->device_lock); |
| if (!conf->freer1_blocked && conf->freer1) { |
| r1_bh = conf->freer1; |
| conf->freer1 = r1_bh->next_r1; |
| conf->freer1_cnt--; |
| r1_bh->next_r1 = NULL; |
| r1_bh->state = (1 << R1BH_PreAlloc); |
| r1_bh->bh_req.b_state = 0; |
| } |
| md_spin_unlock_irq(&conf->device_lock); |
| if (r1_bh) |
| return r1_bh; |
| r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); |
| if (r1_bh) { |
| memset(r1_bh, 0, sizeof(*r1_bh)); |
| return r1_bh; |
| } |
| conf->freer1_blocked = 1; |
| wait_disk_event(conf->wait_buffer, |
| !conf->freer1_blocked || |
| conf->freer1_cnt > NR_RESERVED_BUFS/2 |
| ); |
| conf->freer1_blocked = 0; |
| } while (1); |
| } |
| |
| static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) |
| { |
| struct buffer_head *bh = r1_bh->mirror_bh_list; |
| raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); |
| |
| r1_bh->mirror_bh_list = NULL; |
| |
| if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { |
| unsigned long flags; |
| spin_lock_irqsave(&conf->device_lock, flags); |
| r1_bh->next_r1 = conf->freer1; |
| conf->freer1 = r1_bh; |
| conf->freer1_cnt++; |
| spin_unlock_irqrestore(&conf->device_lock, flags); |
| /* don't need to wakeup wait_buffer because |
| * raid1_free_bh below will do that |
| */ |
| } else { |
| kfree(r1_bh); |
| } |
| raid1_free_bh(conf, bh); |
| } |
| |
| static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) |
| { |
| int i = 0; |
| |
| while (i < cnt) { |
| struct raid1_bh *r1_bh; |
| r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); |
| if (!r1_bh) |
| break; |
| memset(r1_bh, 0, sizeof(*r1_bh)); |
| set_bit(R1BH_PreAlloc, &r1_bh->state); |
| r1_bh->mddev = conf->mddev; |
| |
| raid1_free_r1bh(r1_bh); |
| i++; |
| } |
| return i; |
| } |
| |
| static void raid1_shrink_r1bh(raid1_conf_t *conf) |
| { |
| md_spin_lock_irq(&conf->device_lock); |
| while (conf->freer1) { |
| struct raid1_bh *r1_bh = conf->freer1; |
| conf->freer1 = r1_bh->next_r1; |
| conf->freer1_cnt--; |
| kfree(r1_bh); |
| } |
| md_spin_unlock_irq(&conf->device_lock); |
| } |
| |
| |
| |
| static inline void raid1_free_buf(struct raid1_bh *r1_bh) |
| { |
| unsigned long flags; |
| struct buffer_head *bh = r1_bh->mirror_bh_list; |
| raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); |
| r1_bh->mirror_bh_list = NULL; |
| |
| spin_lock_irqsave(&conf->device_lock, flags); |
| r1_bh->next_r1 = conf->freebuf; |
| conf->freebuf = r1_bh; |
| spin_unlock_irqrestore(&conf->device_lock, flags); |
| raid1_free_bh(conf, bh); |
| } |
| |
| static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) |
| { |
| struct raid1_bh *r1_bh; |
| |
| md_spin_lock_irq(&conf->device_lock); |
| wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); |
| r1_bh = conf->freebuf; |
| conf->freebuf = r1_bh->next_r1; |
| r1_bh->next_r1= NULL; |
| md_spin_unlock_irq(&conf->device_lock); |
| |
| return r1_bh; |
| } |
| |
| static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) |
| { |
| int i = 0; |
| struct raid1_bh *head = NULL, **tail; |
| tail = &head; |
| |
| while (i < cnt) { |
| struct raid1_bh *r1_bh; |
| struct page *page; |
| |
| page = alloc_page(GFP_KERNEL); |
| if (!page) |
| break; |
| |
| r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); |
| if (!r1_bh) { |
| __free_page(page); |
| break; |
| } |
| memset(r1_bh, 0, sizeof(*r1_bh)); |
| r1_bh->bh_req.b_page = page; |
| r1_bh->bh_req.b_data = page_address(page); |
| *tail = r1_bh; |
| r1_bh->next_r1 = NULL; |
| tail = & r1_bh->next_r1; |
| i++; |
| } |
| /* this lock probably isn't needed, as at the time when |
| * we are allocating buffers, nobody else will be touching the |
| * freebuf list. But it doesn't hurt.... |
| */ |
| md_spin_lock_irq(&conf->device_lock); |
| *tail = conf->freebuf; |
| conf->freebuf = head; |
| md_spin_unlock_irq(&conf->device_lock); |
| return i; |
| } |
| |
| static void raid1_shrink_buffers (raid1_conf_t *conf) |
| { |
| struct raid1_bh *head; |
| md_spin_lock_irq(&conf->device_lock); |
| head = conf->freebuf; |
| conf->freebuf = NULL; |
| md_spin_unlock_irq(&conf->device_lock); |
| |
| while (head) { |
| struct raid1_bh *r1_bh = head; |
| head = r1_bh->next_r1; |
| __free_page(r1_bh->bh_req.b_page); |
| kfree(r1_bh); |
| } |
| } |
| |
| static int raid1_map (mddev_t *mddev, kdev_t *rdev) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| int i, disks = MD_SB_DISKS; |
| unsigned long flags; |
| |
| /* |
| * Later we do read balancing on the read side |
| * now we use the first available disk. |
| */ |
| |
| md_spin_lock_irqsave(&conf->device_lock, flags); |
| for (i = 0; i < disks; i++) { |
| if (conf->mirrors[i].operational) { |
| *rdev = conf->mirrors[i].dev; |
| md_spin_unlock_irqrestore(&conf->device_lock, flags); |
| return (0); |
| } |
| } |
| md_spin_unlock_irqrestore(&conf->device_lock, flags); |
| |
| printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); |
| return (-1); |
| } |
| |
| static void raid1_reschedule_retry (struct raid1_bh *r1_bh) |
| { |
| unsigned long flags; |
| mddev_t *mddev = r1_bh->mddev; |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| |
| md_spin_lock_irqsave(&retry_list_lock, flags); |
| if (raid1_retry_list == NULL) |
| raid1_retry_tail = &raid1_retry_list; |
| *raid1_retry_tail = r1_bh; |
| raid1_retry_tail = &r1_bh->next_r1; |
| r1_bh->next_r1 = NULL; |
| md_spin_unlock_irqrestore(&retry_list_lock, flags); |
| md_wakeup_thread(conf->thread); |
| } |
| |
| |
| static inline void io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) |
| { |
| unsigned long flags; |
| spin_lock_irqsave(&conf->segment_lock, flags); |
| if (sector < conf->start_active) |
| conf->cnt_done--; |
| else if (sector >= conf->start_future && conf->phase == phase) |
| conf->cnt_future--; |
| else if (!--conf->cnt_pending) |
| wake_up(&conf->wait_ready); |
| |
| spin_unlock_irqrestore(&conf->segment_lock, flags); |
| } |
| |
| static inline void sync_request_done (unsigned long sector, raid1_conf_t *conf) |
| { |
| unsigned long flags; |
| spin_lock_irqsave(&conf->segment_lock, flags); |
| if (sector >= conf->start_ready) |
| --conf->cnt_ready; |
| else if (sector >= conf->start_active) { |
| if (!--conf->cnt_active) { |
| conf->start_active = conf->start_ready; |
| wake_up(&conf->wait_done); |
| } |
| } |
| spin_unlock_irqrestore(&conf->segment_lock, flags); |
| } |
| |
| /* |
| * raid1_end_bh_io() is called when we have finished servicing a mirrored |
| * operation and are ready to return a success/failure code to the buffer |
| * cache layer. |
| */ |
| static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) |
| { |
| struct buffer_head *bh = r1_bh->master_bh; |
| |
| io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), |
| test_bit(R1BH_SyncPhase, &r1_bh->state)); |
| |
| bh->b_end_io(bh, uptodate); |
| raid1_free_r1bh(r1_bh); |
| } |
| void raid1_end_request (struct buffer_head *bh, int uptodate) |
| { |
| struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); |
| |
| /* |
| * this branch is our 'one mirror IO has finished' event handler: |
| */ |
| if (!uptodate) |
| md_error (r1_bh->mddev, bh->b_dev); |
| else |
| /* |
| * Set R1BH_Uptodate in our master buffer_head, so that |
| * we will return a good error code to the higher |
| * levels even if IO on some other mirrored buffer fails. |
| * |
| * The 'master' buffer_head represents the complex operation to |
| * the user side. So if something waits for IO, then it will |
| * wait for the 'master' buffer_head. |
| */ |
| set_bit (R1BH_Uptodate, &r1_bh->state); |
| |
| /* |
| * We split up the read and write side, imho they are |
| * conceptually different. |
| */ |
| |
| if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { |
| /* |
| * we have only one buffer_head on the read side |
| */ |
| |
| if (uptodate) { |
| raid1_end_bh_io(r1_bh, uptodate); |
| return; |
| } |
| /* |
| * oops, read error: |
| */ |
| printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", |
| partition_name(bh->b_dev), bh->b_blocknr); |
| raid1_reschedule_retry(r1_bh); |
| return; |
| } |
| |
| /* |
| * WRITE: |
| * |
| * Let's see if all mirrored write operations have finished |
| * already. |
| */ |
| |
| if (atomic_dec_and_test(&r1_bh->remaining)) |
| raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); |
| } |
| |
| /* |
| * This routine returns the disk from which the requested read should |
| * be done. It keeps track of the last read position for every disk |
| * in the array, and when a new read request comes in, the disk whose |
| * last position is nearest to the request is chosen. |
| * |
| * TODO: if there are 2 mirrors on the same 2 devices, performance |
| * degrades dramatically because the position is tracked per mirror, |
| * not per device. This should be changed to be device based. Also, |
| * atomic sequential reads should be balanced somehow. |
| */ |
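| /* |
| * For illustration (numbers made up): if mirror 0 last left its head |
| * at sector 1000 and mirror 1 at sector 5200, a read at sector 5192 |
| * goes to mirror 1, since abs(5192 - 5200) < abs(5192 - 1000). |
| * Purely sequential reads stay on the same mirror, because the |
| * request sector then matches that mirror's recorded head_position. |
| */ |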
| |
| static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) |
| { |
| int new_disk = conf->last_used; |
| const int sectors = bh->b_size >> 9; |
| const unsigned long this_sector = bh->b_rsector; |
| int disk = new_disk; |
| unsigned long new_distance; |
| unsigned long current_distance; |
| |
| /* |
| * Check if it is sane at all to balance |
| */ |
| |
| if (conf->resync_mirrors) |
| goto rb_out; |
| |
| |
| #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \ |
| ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3))) |
| /* Work around a compiler bug in older gcc */ |
| new_disk = *(volatile int *)&new_disk; |
| #endif |
| |
| /* make sure that disk is operational */ |
| while( !conf->mirrors[new_disk].operational) { |
| if (new_disk <= 0) new_disk = conf->raid_disks; |
| new_disk--; |
| if (new_disk == disk) { |
| /* |
| * This means no working disk was found |
| * Nothing much to do, let's not change anything |
| * and hope for the best... |
| */ |
| |
| new_disk = conf->last_used; |
| |
| goto rb_out; |
| } |
| } |
| disk = new_disk; |
| /* now disk == new_disk == starting point for search */ |
| |
| /* |
| * Don't touch anything for sequential reads. |
| */ |
| |
| if (this_sector == conf->mirrors[new_disk].head_position) |
| goto rb_out; |
| |
| /* |
| * If reads have been done only on a single disk |
| * for a while, let's give another disk a chance. |
| * This kicks the idling disks so that |
| * they can find work near some hotspot. |
| */ |
| |
| if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { |
| conf->sect_count = 0; |
| |
| #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92) |
| /* Work around a compiler bug in egcs-2.92.11 19980921 */ |
| new_disk = *(volatile int *)&new_disk; |
| #endif |
| do { |
| if (new_disk<=0) |
| new_disk = conf->raid_disks; |
| new_disk--; |
| if (new_disk == disk) |
| break; |
| } while ((conf->mirrors[new_disk].write_only) || |
| (!conf->mirrors[new_disk].operational)); |
| |
| goto rb_out; |
| } |
| |
| current_distance = abs(this_sector - |
| conf->mirrors[disk].head_position); |
| |
| /* Find the disk which is closest */ |
| |
| #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \ |
| ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3))) |
| /* Work around a compiler bug in older gcc */ |
| disk = *(volatile int *)&disk; |
| #endif |
| do { |
| if (disk <= 0) |
| disk = conf->raid_disks; |
| disk--; |
| |
| if ((conf->mirrors[disk].write_only) || |
| (!conf->mirrors[disk].operational)) |
| continue; |
| |
| new_distance = abs(this_sector - |
| conf->mirrors[disk].head_position); |
| |
| if (new_distance < current_distance) { |
| conf->sect_count = 0; |
| current_distance = new_distance; |
| new_disk = disk; |
| } |
| } while (disk != conf->last_used); |
| |
| rb_out: |
| conf->mirrors[new_disk].head_position = this_sector + sectors; |
| |
| conf->last_used = new_disk; |
| conf->sect_count += sectors; |
| |
| return new_disk; |
| } |
| |
| static int raid1_make_request (mddev_t *mddev, int rw, |
| struct buffer_head * bh) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| struct buffer_head *bh_req, *bhl; |
| struct raid1_bh * r1_bh; |
| int disks = MD_SB_DISKS; |
| int i, sum_bhs = 0; |
| struct mirror_info *mirror; |
| kdev_t dev; |
| |
| if (!buffer_locked(bh)) |
| BUG(); |
| |
| /* |
| * make_request() can abort the operation when READA is being |
| * used and no empty request is available. |
| * |
| * Currently, just replace the command with READ/WRITE. |
| */ |
| if (rw == READA) |
| rw = READ; |
| |
| r1_bh = raid1_alloc_r1bh (conf); |
| |
| spin_lock_irq(&conf->segment_lock); |
| wait_event_lock_irq(conf->wait_done, |
| bh->b_rsector < conf->start_active || |
| bh->b_rsector >= conf->start_future, |
| conf->segment_lock); |
| if (bh->b_rsector < conf->start_active) |
| conf->cnt_done++; |
| else { |
| conf->cnt_future++; |
| if (conf->phase) |
| set_bit(R1BH_SyncPhase, &r1_bh->state); |
| } |
| spin_unlock_irq(&conf->segment_lock); |
| |
| /* |
| * I think the read and write branches should be separated completely, |
| * since we want to do read balancing on the read side for example. |
| * Alternative implementations? :) --mingo |
| */ |
| |
| r1_bh->master_bh = bh; |
| r1_bh->mddev = mddev; |
| r1_bh->cmd = rw; |
| |
| if (rw == READ) { |
| /* |
| * read balancing logic: |
| */ |
| spin_lock_irq(&conf->device_lock); |
| mirror = conf->mirrors + raid1_read_balance(conf, bh); |
| dev = mirror->dev; |
| spin_unlock_irq(&conf->device_lock); |
| |
| bh_req = &r1_bh->bh_req; |
| memcpy(bh_req, bh, sizeof(*bh)); |
| bh_req->b_blocknr = bh->b_rsector; |
| bh_req->b_dev = dev; |
| bh_req->b_rdev = dev; |
| /* bh_req->b_rsector = bh->n_rsector; */ |
| bh_req->b_end_io = raid1_end_request; |
| bh_req->b_private = r1_bh; |
| generic_make_request (rw, bh_req); |
| return 0; |
| } |
| |
| /* |
| * WRITE: |
| */ |
| |
| bhl = raid1_alloc_bh(conf, conf->raid_disks); |
| spin_lock_irq(&conf->device_lock); |
| for (i = 0; i < disks; i++) { |
| struct buffer_head *mbh; |
| if (!conf->mirrors[i].operational) |
| continue; |
| |
| /* |
| * We should use a private pool (size depending on NR_REQUEST), |
| * to avoid writes filling up the memory with bhs |
| * |
| * Such pools are much faster than kmalloc anyway (so we waste |
| * almost nothing by not using the master bh when writing and |
| * win a lot of cleanness) but for now we are cool enough. --mingo |
| * |
| * It's safe to sleep here, buffer heads cannot be used in a shared |
| * manner in the write branch. Look how we lock the buffer at the |
| * beginning of this function to grok the difference ;) |
| */ |
| mbh = bhl; |
| if (mbh == NULL) { |
| MD_BUG(); |
| break; |
| } |
| bhl = mbh->b_next; |
| mbh->b_next = NULL; |
| mbh->b_this_page = (struct buffer_head *)1; |
| |
| /* |
| * prepare mirrored mbh (fields ordered for max mem throughput): |
| */ |
| mbh->b_blocknr = bh->b_rsector; |
| mbh->b_dev = conf->mirrors[i].dev; |
| mbh->b_rdev = conf->mirrors[i].dev; |
| mbh->b_rsector = bh->b_rsector; |
| mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | |
| (1<<BH_Mapped) | (1<<BH_Lock); |
| |
| atomic_set(&mbh->b_count, 1); |
| mbh->b_size = bh->b_size; |
| mbh->b_page = bh->b_page; |
| mbh->b_data = bh->b_data; |
| mbh->b_list = BUF_LOCKED; |
| mbh->b_end_io = raid1_end_request; |
| mbh->b_private = r1_bh; |
| |
| mbh->b_next = r1_bh->mirror_bh_list; |
| r1_bh->mirror_bh_list = mbh; |
| sum_bhs++; |
| } |
| spin_unlock_irq(&conf->device_lock); |
| if (bhl) raid1_free_bh(conf,bhl); |
| if (!sum_bhs) { |
| /* Gag - all mirrors non-operational.. */ |
| raid1_end_bh_io(r1_bh, 0); |
| return 0; |
| } |
| md_atomic_set(&r1_bh->remaining, sum_bhs); |
| |
| /* |
| * We have to be a bit careful about the semaphore above, that's |
| * why we start the requests separately. Since kmalloc() could |
| * fail or sleep, and make_request() can sleep too, this is the |
| * safer solution. Imagine end_request decreasing the semaphore |
| * before we could have set it up ... We could play tricks with |
| * the semaphore (presetting it and correcting at the end if |
| * sum_bhs is not 'n'), but we would have to do end_request by hand |
| * if all requests finished before we had a chance to set up the |
| * semaphore correctly ... lots of races. |
| */ |
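| /* (Concretely: if each mirror write were submitted as soon as it was |
| * built, the first one could complete and decrement 'remaining' |
| * before 'remaining' had been set to the final count, ending the |
| * master IO while later mirror bh's were still being prepared.) |
| */ |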
| bh = r1_bh->mirror_bh_list; |
| while(bh) { |
| struct buffer_head *bh2 = bh; |
| bh = bh->b_next; |
| generic_make_request(rw, bh2); |
| } |
| return (0); |
| } |
| |
| static void raid1_status(struct seq_file *seq, mddev_t *mddev) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| int i; |
| |
| seq_printf(seq, " [%d/%d] [", conf->raid_disks, |
| conf->working_disks); |
| for (i = 0; i < conf->raid_disks; i++) |
| seq_printf(seq, "%s", |
| conf->mirrors[i].operational ? "U" : "_"); |
| seq_printf(seq, "]"); |
| } |
| |
| #define LAST_DISK KERN_ALERT \ |
| "raid1: only one disk left and IO error.\n" |
| |
| #define NO_SPARE_DISK KERN_ALERT \ |
| "raid1: no spare disk left, degrading mirror level by one.\n" |
| |
| #define DISK_FAILED KERN_ALERT \ |
| "raid1: Disk failure on %s, disabling device. \n" \ |
| " Operation continuing on %d devices\n" |
| |
| #define START_SYNCING KERN_ALERT \ |
| "raid1: start syncing spare disk.\n" |
| |
| #define ALREADY_SYNCING KERN_INFO \ |
| "raid1: syncing already in progress.\n" |
| |
| static void mark_disk_bad (mddev_t *mddev, int failed) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| struct mirror_info *mirror = conf->mirrors+failed; |
| mdp_super_t *sb = mddev->sb; |
| |
| mirror->operational = 0; |
| mark_disk_faulty(sb->disks+mirror->number); |
| mark_disk_nonsync(sb->disks+mirror->number); |
| mark_disk_inactive(sb->disks+mirror->number); |
| if (!mirror->write_only) |
| sb->active_disks--; |
| else |
| sb->spare_disks--; |
| sb->working_disks--; |
| sb->failed_disks++; |
| mddev->sb_dirty = 1; |
| md_wakeup_thread(conf->thread); |
| if (!mirror->write_only) |
| conf->working_disks--; |
| printk (DISK_FAILED, partition_name (mirror->dev), |
| conf->working_disks); |
| } |
| |
| static int raid1_error (mddev_t *mddev, kdev_t dev) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| struct mirror_info * mirrors = conf->mirrors; |
| int disks = MD_SB_DISKS; |
| int i; |
| unsigned long flags; |
| |
| /* Find the drive. |
| * If it is not operational, then we have already marked it as dead |
| * else if it is the last working disk, ignore the error, let the |
| * next level up know. |
| * else mark the drive as failed |
| */ |
| |
| for (i = 0; i < disks; i++) |
| if (mirrors[i].dev==dev && mirrors[i].operational) |
| break; |
| if (i == disks) |
| return 0; |
| |
| if (i < conf->raid_disks && conf->working_disks == 1) { |
| /* Don't fail the drive, act as though we were just a |
| * normal single drive |
| */ |
| |
| return 1; |
| } |
| md_spin_lock_irqsave(&conf->device_lock, flags); |
| mark_disk_bad(mddev, i); |
| md_spin_unlock_irqrestore(&conf->device_lock, flags); |
| return 0; |
| } |
| |
| #undef LAST_DISK |
| #undef NO_SPARE_DISK |
| #undef DISK_FAILED |
| #undef START_SYNCING |
| #undef ALREADY_SYNCING |
| |
| |
| static void print_raid1_conf (raid1_conf_t *conf) |
| { |
| int i; |
| struct mirror_info *tmp; |
| |
| printk("RAID1 conf printout:\n"); |
| if (!conf) { |
| printk("(conf==NULL)\n"); |
| return; |
| } |
| printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, |
| conf->raid_disks, conf->nr_disks); |
| |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| tmp = conf->mirrors + i; |
| printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", |
| i, tmp->spare,tmp->operational, |
| tmp->number,tmp->raid_disk,tmp->used_slot, |
| partition_name(tmp->dev)); |
| } |
| } |
| |
| static void close_sync(raid1_conf_t *conf) |
| { |
| mddev_t *mddev = conf->mddev; |
| /* If reconstruction was interrupted, we need to close the "active" and "pending" |
| * holes. |
| * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0 |
| */ |
| /* this is really needed when recovery stops too... */ |
| spin_lock_irq(&conf->segment_lock); |
| conf->start_active = conf->start_pending; |
| conf->start_ready = conf->start_pending; |
| wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); |
| conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; |
| conf->start_future = (mddev->sb->size<<1)+1; |
| conf->cnt_pending = conf->cnt_future; |
| conf->cnt_future = 0; |
| conf->phase = conf->phase ^1; |
| wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); |
| conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; |
| conf->phase = 0; |
| conf->cnt_future = conf->cnt_done; |
| conf->cnt_done = 0; |
| spin_unlock_irq(&conf->segment_lock); |
| wake_up(&conf->wait_done); |
| } |
| |
| static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) |
| { |
| int err = 0; |
| int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; |
| raid1_conf_t *conf = mddev->private; |
| struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; |
| mdp_super_t *sb = mddev->sb; |
| mdp_disk_t *failed_desc, *spare_desc, *added_desc; |
| mdk_rdev_t *spare_rdev, *failed_rdev; |
| |
| if (conf->resync_mirrors) |
| return 1; /* Cannot do any diskops during a resync */ |
| |
| switch (state) { |
| case DISKOP_SPARE_ACTIVE: |
| case DISKOP_SPARE_INACTIVE: |
| /* need to wait for pending sync io before locking device */ |
| close_sync(conf); |
| } |
| |
| md_spin_lock_irq(&conf->device_lock); |
| /* |
| * Need the conf lock when printing out state else we get BUG()s |
| */ |
| print_raid1_conf(conf); |
| /* |
| * find the disk ... |
| */ |
| switch (state) { |
| |
| case DISKOP_SPARE_ACTIVE: |
| |
| /* |
| * Find the failed disk within the RAID1 configuration ... |
| * (this can only be in the first conf->working_disks part) |
| */ |
| for (i = 0; i < conf->raid_disks; i++) { |
| tmp = conf->mirrors + i; |
| if ((!tmp->operational && !tmp->spare) || |
| !tmp->used_slot) { |
| failed_disk = i; |
| break; |
| } |
| } |
| /* |
| * When we activate a spare disk we _must_ have a disk in |
| * the lower (active) part of the array to replace. |
| */ |
| if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| /* fall through */ |
| |
| case DISKOP_SPARE_WRITE: |
| case DISKOP_SPARE_INACTIVE: |
| |
| /* |
| * Find the spare disk ... (can only be in the 'high' |
| * area of the array) |
| */ |
| for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { |
| tmp = conf->mirrors + i; |
| if (tmp->spare && tmp->number == (*d)->number) { |
| spare_disk = i; |
| break; |
| } |
| } |
| if (spare_disk == -1) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| break; |
| |
| case DISKOP_HOT_REMOVE_DISK: |
| |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| tmp = conf->mirrors + i; |
| if (tmp->used_slot && (tmp->number == (*d)->number)) { |
| if (tmp->operational) { |
| err = -EBUSY; |
| goto abort; |
| } |
| removed_disk = i; |
| break; |
| } |
| } |
| if (removed_disk == -1) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| break; |
| |
| case DISKOP_HOT_ADD_DISK: |
| |
| for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { |
| tmp = conf->mirrors + i; |
| if (!tmp->used_slot) { |
| added_disk = i; |
| break; |
| } |
| } |
| if (added_disk == -1) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| break; |
| } |
| |
| switch (state) { |
| /* |
| * Switch the spare disk to write-only mode: |
| */ |
| case DISKOP_SPARE_WRITE: |
| sdisk = conf->mirrors + spare_disk; |
| sdisk->operational = 1; |
| sdisk->write_only = 1; |
| break; |
| /* |
| * Deactivate a spare disk: |
| */ |
| case DISKOP_SPARE_INACTIVE: |
| if (conf->start_future > 0) { |
| MD_BUG(); |
| err = -EBUSY; |
| break; |
| } |
| sdisk = conf->mirrors + spare_disk; |
| sdisk->operational = 0; |
| sdisk->write_only = 0; |
| break; |
| /* |
| * Activate (mark read-write) the (now sync) spare disk, |
| * which means we switch its 'raid position' (->raid_disk) |
| * with the failed disk. (only the first 'conf->nr_disks' |
| * slots are used for 'real' disks and we must preserve this |
| * property) |
| */ |
| case DISKOP_SPARE_ACTIVE: |
| if (conf->start_future > 0) { |
| MD_BUG(); |
| err = -EBUSY; |
| break; |
| } |
| sdisk = conf->mirrors + spare_disk; |
| fdisk = conf->mirrors + failed_disk; |
| |
| spare_desc = &sb->disks[sdisk->number]; |
| failed_desc = &sb->disks[fdisk->number]; |
| |
| if (spare_desc != *d) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (spare_desc->raid_disk != sdisk->raid_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (sdisk->raid_disk != spare_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (failed_desc->raid_disk != fdisk->raid_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (fdisk->raid_disk != failed_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| /* |
| * do the switch finally |
| */ |
| spare_rdev = find_rdev_nr(mddev, spare_desc->number); |
| failed_rdev = find_rdev_nr(mddev, failed_desc->number); |
| |
| /* There must be a spare_rdev, but there may not be a |
| * failed_rdev. That slot might be empty... |
| */ |
| spare_rdev->desc_nr = failed_desc->number; |
| if (failed_rdev) |
| failed_rdev->desc_nr = spare_desc->number; |
| |
| xchg_values(*spare_desc, *failed_desc); |
| xchg_values(*fdisk, *sdisk); |
| |
| /* |
| * (careful, 'failed' and 'spare' are switched from now on) |
| * |
| * we want to preserve linear numbering and we want to |
| * give the proper raid_disk number to the now activated |
| * disk. (this means we switch back these values) |
| */ |
| |
| xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); |
| xchg_values(sdisk->raid_disk, fdisk->raid_disk); |
| xchg_values(spare_desc->number, failed_desc->number); |
| xchg_values(sdisk->number, fdisk->number); |
| |
| *d = failed_desc; |
| |
| if (sdisk->dev == MKDEV(0,0)) |
| sdisk->used_slot = 0; |
| /* |
| * this really activates the spare. |
| */ |
| fdisk->spare = 0; |
| fdisk->write_only = 0; |
| |
| /* |
| * if we activate a spare, we definitely replace a |
| * non-operational disk slot in the 'low' area of |
| * the disk array. |
| */ |
| |
| conf->working_disks++; |
| |
| break; |
| |
| case DISKOP_HOT_REMOVE_DISK: |
| rdisk = conf->mirrors + removed_disk; |
| |
| if (rdisk->spare && (removed_disk < conf->raid_disks)) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| rdisk->dev = MKDEV(0,0); |
| rdisk->used_slot = 0; |
| conf->nr_disks--; |
| break; |
| |
| case DISKOP_HOT_ADD_DISK: |
| adisk = conf->mirrors + added_disk; |
| added_desc = *d; |
| |
| if (added_disk != added_desc->number) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| adisk->number = added_desc->number; |
| adisk->raid_disk = added_desc->raid_disk; |
| adisk->dev = MKDEV(added_desc->major,added_desc->minor); |
| |
| adisk->operational = 0; |
| adisk->write_only = 0; |
| adisk->spare = 1; |
| adisk->used_slot = 1; |
| adisk->head_position = 0; |
| conf->nr_disks++; |
| |
| break; |
| |
| default: |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| abort: |
| print_raid1_conf(conf); |
| md_spin_unlock_irq(&conf->device_lock); |
| if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) |
| /* should move to "END_REBUILD" when such exists */ |
| raid1_shrink_buffers(conf); |
| |
| return err; |
| } |
| |
| |
| #define IO_ERROR KERN_ALERT \ |
| "raid1: %s: unrecoverable I/O read error for block %lu\n" |
| |
| #define REDIRECT_SECTOR KERN_ERR \ |
| "raid1: %s: redirecting sector %lu to another mirror\n" |
| |
| /* |
| * This is a kernel thread which: |
| * |
| * 1. Retries failed read operations on working mirrors. |
| * 2. Updates the raid superblock when problems are encountered. |
| * 3. Performs writes following reads for array synchronising. |
| */ |
| static void end_sync_write(struct buffer_head *bh, int uptodate); |
| static void end_sync_read(struct buffer_head *bh, int uptodate); |
| |
| static void raid1d (void *data) |
| { |
| struct raid1_bh *r1_bh; |
| struct buffer_head *bh; |
| unsigned long flags; |
| raid1_conf_t *conf = data; |
| mddev_t *mddev = conf->mddev; |
| kdev_t dev; |
| |
| if (mddev->sb_dirty) |
| md_update_sb(mddev); |
| |
| for (;;) { |
| md_spin_lock_irqsave(&retry_list_lock, flags); |
| r1_bh = raid1_retry_list; |
| if (!r1_bh) |
| break; |
| raid1_retry_list = r1_bh->next_r1; |
| md_spin_unlock_irqrestore(&retry_list_lock, flags); |
| |
| mddev = r1_bh->mddev; |
| bh = &r1_bh->bh_req; |
| switch(r1_bh->cmd) { |
| case SPECIAL: |
| /* have to allocate lots of bh structures and |
| * schedule writes |
| */ |
| if (test_bit(R1BH_Uptodate, &r1_bh->state)) { |
| int i, sum_bhs = 0; |
| int disks = MD_SB_DISKS; |
| struct buffer_head *bhl, *mbh; |
| |
| conf = mddev_to_conf(mddev); |
| bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ |
| spin_lock_irq(&conf->device_lock); |
| for (i = 0; i < disks ; i++) { |
| if (!conf->mirrors[i].operational) |
| continue; |
| if (i==conf->last_used) |
| /* we read from here, no need to write */ |
| continue; |
| if (i < conf->raid_disks |
| && !conf->resync_mirrors) |
| /* don't need to write this, |
| * we are just rebuilding */ |
| continue; |
| mbh = bhl; |
| if (!mbh) { |
| MD_BUG(); |
| break; |
| } |
| bhl = mbh->b_next; |
| mbh->b_this_page = (struct buffer_head *)1; |
| |
| |
| /* |
| * prepare mirrored bh (fields ordered for max mem throughput): |
| */ |
| mbh->b_blocknr = bh->b_blocknr; |
| mbh->b_dev = conf->mirrors[i].dev; |
| mbh->b_rdev = conf->mirrors[i].dev; |
| mbh->b_rsector = bh->b_blocknr; |
| mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | |
| (1<<BH_Mapped) | (1<<BH_Lock); |
| atomic_set(&mbh->b_count, 1); |
| mbh->b_size = bh->b_size; |
| mbh->b_page = bh->b_page; |
| mbh->b_data = bh->b_data; |
| mbh->b_list = BUF_LOCKED; |
| mbh->b_end_io = end_sync_write; |
| mbh->b_private = r1_bh; |
| |
| mbh->b_next = r1_bh->mirror_bh_list; |
| r1_bh->mirror_bh_list = mbh; |
| |
| sum_bhs++; |
| } |
| spin_unlock_irq(&conf->device_lock); |
| md_atomic_set(&r1_bh->remaining, sum_bhs); |
| if (bhl) raid1_free_bh(conf, bhl); |
| mbh = r1_bh->mirror_bh_list; |
| |
| if (!sum_bhs) { |
| /* nowhere to write this to ... I guess we |
| * must be done |
| */ |
| sync_request_done(bh->b_blocknr, conf); |
| md_done_sync(mddev, bh->b_size>>9, 0); |
| raid1_free_buf(r1_bh); |
| } else |
| while (mbh) { |
| struct buffer_head *bh1 = mbh; |
| mbh = mbh->b_next; |
| generic_make_request(WRITE, bh1); |
| md_sync_acct(bh1->b_dev, bh1->b_size/512); |
| } |
| } else { |
| /* There is no point trying a read-for-reconstruct |
| * as reconstruct is about to be aborted |
| */ |
| |
| printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); |
| md_done_sync(mddev, bh->b_size>>9, 0); |
| } |
| |
| break; |
| case READ: |
| case READA: |
| dev = bh->b_dev; |
| raid1_map (mddev, &bh->b_dev); |
| if (bh->b_dev == dev) { |
| printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); |
| raid1_end_bh_io(r1_bh, 0); |
| } else { |
| printk (REDIRECT_SECTOR, |
| partition_name(bh->b_dev), bh->b_blocknr); |
| bh->b_rdev = bh->b_dev; |
| bh->b_rsector = bh->b_blocknr; |
| generic_make_request (r1_bh->cmd, bh); |
| } |
| break; |
| } |
| } |
| md_spin_unlock_irqrestore(&retry_list_lock, flags); |
| } |
| #undef IO_ERROR |
| #undef REDIRECT_SECTOR |
| |
| /* |
| * Private kernel thread to reconstruct mirrors after an unclean |
| * shutdown. |
| */ |
| static void raid1syncd (void *data) |
| { |
| raid1_conf_t *conf = data; |
| mddev_t *mddev = conf->mddev; |
| |
| if (!conf->resync_mirrors) |
| return; |
| if (conf->resync_mirrors == 2) |
| return; |
| down(&mddev->recovery_sem); |
| if (!md_do_sync(mddev, NULL)) { |
| /* |
| * Only if everything went Ok. |
| */ |
| conf->resync_mirrors = 0; |
| } |
| |
| close_sync(conf); |
| |
| up(&mddev->recovery_sem); |
| raid1_shrink_buffers(conf); |
| |
| md_recover_arrays(); /* in case we are degraded and a spare is available */ |
| } |
| |
| /* |
| * perform a "sync" on one "block" |
| * |
| * We need to make sure that no normal I/O request - particularly write |
| * requests - conflict with active sync requests. |
| * This is achieved by conceptually dividing the device space into a |
| * number of sections: |
| * DONE: 0 .. a-1 These blocks are in-sync |
| * ACTIVE: a.. b-1 These blocks may have active sync requests, but |
| * no normal IO requests |
| * READY: b .. c-1 These blocks have no normal IO requests - sync |
| * request may be happening |
| * PENDING: c .. d-1 These blocks may have IO requests, but no new |
| * ones will be added |
| * FUTURE: d .. end These blocks are not to be considered yet. IO may |
| * be happening, but not sync |
| * |
| * We keep a |
| * phase which flips (0 or 1) each time d moves and |
| * a count of: |
| * z = active io requests in FUTURE since d moved - marked with |
| * current phase |
| * y = active io requests in FUTURE before d moved, or PENDING - |
| * marked with previous phase |
| * x = active sync requests in READY |
| * w = active sync requests in ACTIVE |
| * v = active io requests in DONE |
| * |
| * Normally, a=b=c=d=0 and z= active io requests |
| * or a=b=c=d=END and v= active io requests |
| * Allowed changes to a,b,c,d: |
| * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase |
| * B: y==0 -> c=d |
| * C: b=c, w+=x, x=0 |
| * D: w==0 -> a=b |
| * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 |
| * |
| * At start of sync we apply A. |
| * When y reaches 0, we apply B then A, then begin sync requests. |
| * When the sync point reaches c-1, we wait for y==0 and w==0, and |
| * then apply B then A then D then C. |
| * Finally, we apply E. |
| * |
| * The sync request simply issues a "read" against a working drive. |
| * This is marked so that on completion the raid1d thread is woken to |
| * issue suitable write requests. |
| */ |
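| /* |
| * Rough walk-through (window size picked arbitrarily for illustration): |
| * with a 64-sector window and a=b=c=d=0, applying A gives d=64, turns |
| * the outstanding normal IO count z into y and flips the phase. Once |
| * that y drains to 0, B (c=64) and A again (d=128) let sync reads be |
| * issued against the READY range [b,c). When the sync point reaches |
| * c-1 and both y and w have drained, applying B, A, D and C advances |
| * all the boundaries again, sliding the window across the device. |
| */ |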
| |
| static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| struct mirror_info *mirror; |
| struct raid1_bh *r1_bh; |
| struct buffer_head *bh; |
| int bsize; |
| int disk; |
| int block_nr; |
| int buffs; |
| kdev_t dev; |
| |
| if (!sector_nr) { |
| /* we want enough buffers to hold twice the window of 128 */ |
| buffs = 128 *2 / (PAGE_SIZE>>9); |
| buffs = raid1_grow_buffers(conf, buffs); |
| if (buffs < 2) |
| goto nomem; |
| conf->window = buffs*(PAGE_SIZE>>9)/2; |
| } |
| spin_lock_irq(&conf->segment_lock); |
| if (!sector_nr) { |
| /* initialize ... */ |
| conf->start_active = 0; |
| conf->start_ready = 0; |
| conf->start_pending = 0; |
| conf->start_future = 0; |
| conf->phase = 0; |
| |
| conf->cnt_future += conf->cnt_done+conf->cnt_pending; |
| conf->cnt_done = conf->cnt_pending = 0; |
| if (conf->cnt_ready || conf->cnt_active) |
| MD_BUG(); |
| } |
| while (sector_nr >= conf->start_pending) { |
| PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", |
| sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, |
| conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); |
| wait_event_lock_irq(conf->wait_done, |
| !conf->cnt_active, |
| conf->segment_lock); |
| wait_event_lock_irq(conf->wait_ready, |
| !conf->cnt_pending, |
| conf->segment_lock); |
| conf->start_active = conf->start_ready; |
| conf->start_ready = conf->start_pending; |
| conf->start_pending = conf->start_future; |
| conf->start_future = conf->start_future+conf->window; |
| /* Note: falling off the end is not a problem */ |
| conf->phase = conf->phase ^1; |
| conf->cnt_active = conf->cnt_ready; |
| conf->cnt_ready = 0; |
| conf->cnt_pending = conf->cnt_future; |
| conf->cnt_future = 0; |
| wake_up(&conf->wait_done); |
| } |
| conf->cnt_ready++; |
| spin_unlock_irq(&conf->segment_lock); |
| |
| |
| /* If reconstructing, and >1 working disk, we |
| * could dedicate one to the rebuild and the others to |
| * servicing read requests .. |
| */ |
| spin_lock_irq(&conf->device_lock); |
| disk = conf->last_used; |
| /* make sure disk is operational */ |
| while (!conf->mirrors[disk].operational) { |
| if (disk <= 0) disk = conf->raid_disks; |
| disk--; |
| if (disk == conf->last_used) |
| break; |
| } |
| conf->last_used = disk; |
| |
| mirror = conf->mirrors+conf->last_used; |
| dev = mirror->dev; |
| spin_unlock_irq(&conf->device_lock); |
| |
| r1_bh = raid1_alloc_buf (conf); |
| r1_bh->master_bh = NULL; |
| r1_bh->mddev = mddev; |
| r1_bh->cmd = SPECIAL; |
| bh = &r1_bh->bh_req; |
| |
| block_nr = sector_nr; |
| bsize = 512; |
| while (!(block_nr & 1) && bsize < PAGE_SIZE |
| && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) { |
| block_nr >>= 1; |
| bsize <<= 1; |
| } |
| bh->b_size = bsize; |
| bh->b_list = BUF_LOCKED; |
| bh->b_dev = dev; |
| bh->b_rdev = dev; |
| bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock); |
| if (!bh->b_page) |
| BUG(); |
| if (!bh->b_data) |
| BUG(); |
| if (bh->b_data != page_address(bh->b_page)) |
| BUG(); |
| bh->b_end_io = end_sync_read; |
| bh->b_private = r1_bh; |
| bh->b_blocknr = sector_nr; |
| bh->b_rsector = sector_nr; |
| init_waitqueue_head(&bh->b_wait); |
| |
| generic_make_request(READ, bh); |
| md_sync_acct(bh->b_dev, bh->b_size/512); |
| |
| return (bsize >> 9); |
| |
| nomem: |
| raid1_shrink_buffers(conf); |
| return -ENOMEM; |
| } |
| |
| static void end_sync_read(struct buffer_head *bh, int uptodate) |
| { |
| struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); |
| |
| /* we have read a block, now it needs to be re-written, |
| * or re-read if the read failed. |
| * We don't do much here, just schedule handling by raid1d |
| */ |
| if (!uptodate) |
| md_error (r1_bh->mddev, bh->b_dev); |
| else |
| set_bit(R1BH_Uptodate, &r1_bh->state); |
| raid1_reschedule_retry(r1_bh); |
| } |
| |
| static void end_sync_write(struct buffer_head *bh, int uptodate) |
| { |
| struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); |
| |
| if (!uptodate) |
| md_error (r1_bh->mddev, bh->b_dev); |
| if (atomic_dec_and_test(&r1_bh->remaining)) { |
| mddev_t *mddev = r1_bh->mddev; |
| unsigned long sect = bh->b_blocknr; |
| int size = bh->b_size; |
| raid1_free_buf(r1_bh); |
| sync_request_done(sect, mddev_to_conf(mddev)); |
| md_done_sync(mddev,size>>9, uptodate); |
| } |
| } |
| |
| #define INVALID_LEVEL KERN_WARNING \ |
| "raid1: md%d: raid level not set to mirroring (%d)\n" |
| |
| #define NO_SB KERN_ERR \ |
| "raid1: disabled mirror %s (couldn't access raid superblock)\n" |
| |
| #define ERRORS KERN_ERR \ |
| "raid1: disabled mirror %s (errors detected)\n" |
| |
| #define NOT_IN_SYNC KERN_ERR \ |
| "raid1: disabled mirror %s (not in sync)\n" |
| |
| #define INCONSISTENT KERN_ERR \ |
| "raid1: disabled mirror %s (inconsistent descriptor)\n" |
| |
| #define ALREADY_RUNNING KERN_ERR \ |
| "raid1: disabled mirror %s (mirror %d already operational)\n" |
| |
| #define OPERATIONAL KERN_INFO \ |
| "raid1: device %s operational as mirror %d\n" |
| |
| #define MEM_ERROR KERN_ERR \ |
| "raid1: couldn't allocate memory for md%d\n" |
| |
| #define SPARE KERN_INFO \ |
| "raid1: spare disk %s\n" |
| |
| #define NONE_OPERATIONAL KERN_ERR \ |
| "raid1: no operational mirrors for md%d\n" |
| |
| #define ARRAY_IS_ACTIVE KERN_INFO \ |
| "raid1: raid set md%d active with %d out of %d mirrors\n" |
| |
| #define THREAD_ERROR KERN_ERR \ |
| "raid1: couldn't allocate thread for md%d\n" |
| |
| #define START_RESYNC KERN_WARNING \ |
| "raid1: raid set md%d not clean; reconstructing mirrors\n" |
| |
| static int raid1_run (mddev_t *mddev) |
| { |
| raid1_conf_t *conf; |
| int i, j, disk_idx; |
| struct mirror_info *disk; |
| mdp_super_t *sb = mddev->sb; |
| mdp_disk_t *descriptor; |
| mdk_rdev_t *rdev; |
| struct md_list_head *tmp; |
| int start_recovery = 0; |
| |
| MOD_INC_USE_COUNT; |
| |
| if (sb->level != 1) { |
| printk(INVALID_LEVEL, mdidx(mddev), sb->level); |
| goto out; |
| } |
| /* |
| * copy the already verified devices into our private RAID1 |
| * bookkeeping area. [whatever we allocate in raid1_run(), |
| * should be freed in raid1_stop()] |
| */ |
| |
| conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); |
| mddev->private = conf; |
| if (!conf) { |
| printk(MEM_ERROR, mdidx(mddev)); |
| goto out; |
| } |
| memset(conf, 0, sizeof(*conf)); |
| |
| ITERATE_RDEV(mddev,rdev,tmp) { |
| if (rdev->faulty) { |
| printk(ERRORS, partition_name(rdev->dev)); |
| } else { |
| if (!rdev->sb) { |
| MD_BUG(); |
| continue; |
| } |
| } |
| if (rdev->desc_nr == -1) { |
| MD_BUG(); |
| continue; |
| } |
| descriptor = &sb->disks[rdev->desc_nr]; |
| disk_idx = descriptor->raid_disk; |
| disk = conf->mirrors + disk_idx; |
| |
| if (disk_faulty(descriptor)) { |
| disk->number = descriptor->number; |
| disk->raid_disk = disk_idx; |
| disk->dev = rdev->dev; |
| disk->sect_limit = MAX_WORK_PER_DISK; |
| disk->operational = 0; |
| disk->write_only = 0; |
| disk->spare = 0; |
| disk->used_slot = 1; |
| disk->head_position = 0; |
| continue; |
| } |
| if (disk_active(descriptor)) { |
| if (!disk_sync(descriptor)) { |
| printk(NOT_IN_SYNC, |
| partition_name(rdev->dev)); |
| continue; |
| } |
| if ((descriptor->number > MD_SB_DISKS) || |
| (disk_idx > sb->raid_disks)) { |
| |
| printk(INCONSISTENT, |
| partition_name(rdev->dev)); |
| continue; |
| } |
| if (disk->operational) { |
| printk(ALREADY_RUNNING, |
| partition_name(rdev->dev), |
| disk_idx); |
| continue; |
| } |
| printk(OPERATIONAL, partition_name(rdev->dev), |
| disk_idx); |
| disk->number = descriptor->number; |
| disk->raid_disk = disk_idx; |
| disk->dev = rdev->dev; |
| disk->sect_limit = MAX_WORK_PER_DISK; |
| disk->operational = 1; |
| disk->write_only = 0; |
| disk->spare = 0; |
| disk->used_slot = 1; |
| disk->head_position = 0; |
| conf->working_disks++; |
| } else { |
| /* |
| * Must be a spare disk .. |
| */ |
| printk(SPARE, partition_name(rdev->dev)); |
| disk->number = descriptor->number; |
| disk->raid_disk = disk_idx; |
| disk->dev = rdev->dev; |
| disk->sect_limit = MAX_WORK_PER_DISK; |
| disk->operational = 0; |
| disk->write_only = 0; |
| disk->spare = 1; |
| disk->used_slot = 1; |
| disk->head_position = 0; |
| } |
| } |
| conf->raid_disks = sb->raid_disks; |
| conf->nr_disks = sb->nr_disks; |
| conf->mddev = mddev; |
| conf->device_lock = MD_SPIN_LOCK_UNLOCKED; |
| |
| conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; |
| init_waitqueue_head(&conf->wait_buffer); |
| init_waitqueue_head(&conf->wait_done); |
| init_waitqueue_head(&conf->wait_ready); |
| |
| if (!conf->working_disks) { |
| printk(NONE_OPERATIONAL, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| |
| |
| /* pre-allocate some buffer_head structures. |
| * As a minimum, 1 r1bh and raid_disks buffer_heads |
| * would probably get us by in tight memory situations, |
| * but a few more is probably a good idea. |
| * For now, try NR_RESERVED_BUFS r1bh and |
| * NR_RESERVED_BUFS*raid_disks bufferheads |
| * This will allow at least NR_RESERVED_BUFS concurrent |
| * reads or writes even if kmalloc starts failing |
| */ |
| if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || |
| raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) |
| < NR_RESERVED_BUFS*conf->raid_disks) { |
| printk(MEM_ERROR, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| |
| descriptor = sb->disks+i; |
| disk_idx = descriptor->raid_disk; |
| disk = conf->mirrors + disk_idx; |
| |
| if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && |
| !disk->used_slot) { |
| |
| disk->number = descriptor->number; |
| disk->raid_disk = disk_idx; |
| disk->dev = MKDEV(0,0); |
| |
| disk->operational = 0; |
| disk->write_only = 0; |
| disk->spare = 0; |
| disk->used_slot = 1; |
| disk->head_position = 0; |
| } |
| } |
| |
| /* |
| * find the first working one and use it as a starting point |
| * for read balancing. |
| */ |
| for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++) |
| /* nothing */; |
| conf->last_used = j; |
| |
| |
| |
| { |
| const char * name = "raid1d"; |
| |
| conf->thread = md_register_thread(raid1d, conf, name); |
| if (!conf->thread) { |
| printk(THREAD_ERROR, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| } |
| |
| if (!(sb->state & (1 << MD_SB_CLEAN)) && |
| (conf->working_disks > 1)) { |
| const char * name = "raid1syncd"; |
| |
| conf->resync_thread = md_register_thread(raid1syncd, conf,name); |
| if (!conf->resync_thread) { |
| printk(THREAD_ERROR, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| |
| printk(START_RESYNC, mdidx(mddev)); |
| conf->resync_mirrors = 1; |
| md_wakeup_thread(conf->resync_thread); |
| } else if (conf->working_disks != sb->raid_disks) { |
| printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); |
| start_recovery = 1; |
| } |
| |
| /* |
| * Regenerate the "device is in sync with the raid set" bit for |
| * each device. |
| */ |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| mark_disk_nonsync(sb->disks+i); |
| for (j = 0; j < sb->raid_disks; j++) { |
| if (!conf->mirrors[j].operational) |
| continue; |
| if (sb->disks[i].number == conf->mirrors[j].number) |
| mark_disk_sync(sb->disks+i); |
| } |
| } |
| sb->active_disks = conf->working_disks; |
| |
| if (start_recovery) |
| md_recover_arrays(); |
| |
| |
| printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); |
| /* |
| * Ok, everything is just fine now |
| */ |
| return 0; |
| |
| out_free_conf: |
| raid1_shrink_r1bh(conf); |
| raid1_shrink_bh(conf); |
| raid1_shrink_buffers(conf); |
| kfree(conf); |
| mddev->private = NULL; |
| out: |
| MOD_DEC_USE_COUNT; |
| return -EIO; |
| } |
| |
| #undef INVALID_LEVEL |
| #undef NO_SB |
| #undef ERRORS |
| #undef NOT_IN_SYNC |
| #undef INCONSISTENT |
| #undef ALREADY_RUNNING |
| #undef OPERATIONAL |
| #undef SPARE |
| #undef NONE_OPERATIONAL |
| #undef ARRAY_IS_ACTIVE |
| |
| static int raid1_stop_resync (mddev_t *mddev) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| |
| if (conf->resync_thread) { |
| if (conf->resync_mirrors) { |
| conf->resync_mirrors = 2; |
| md_interrupt_thread(conf->resync_thread); |
| |
| printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); |
| return 1; |
| } |
| return 0; |
| } |
| return 0; |
| } |
| |
| static int raid1_restart_resync (mddev_t *mddev) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| |
| if (conf->resync_mirrors) { |
| if (!conf->resync_thread) { |
| MD_BUG(); |
| return 0; |
| } |
| conf->resync_mirrors = 1; |
| md_wakeup_thread(conf->resync_thread); |
| return 1; |
| } |
| return 0; |
| } |
| |
| static int raid1_stop (mddev_t *mddev) |
| { |
| raid1_conf_t *conf = mddev_to_conf(mddev); |
| |
| md_unregister_thread(conf->thread); |
| if (conf->resync_thread) |
| md_unregister_thread(conf->resync_thread); |
| raid1_shrink_r1bh(conf); |
| raid1_shrink_bh(conf); |
| raid1_shrink_buffers(conf); |
| kfree(conf); |
| mddev->private = NULL; |
| MOD_DEC_USE_COUNT; |
| return 0; |
| } |
| |
| static mdk_personality_t raid1_personality= |
| { |
| name: "raid1", |
| make_request: raid1_make_request, |
| run: raid1_run, |
| stop: raid1_stop, |
| status: raid1_status, |
| error_handler: raid1_error, |
| diskop: raid1_diskop, |
| stop_resync: raid1_stop_resync, |
| restart_resync: raid1_restart_resync, |
| sync_request: raid1_sync_request |
| }; |
| |
| static int md__init raid1_init (void) |
| { |
| return register_md_personality (RAID1, &raid1_personality); |
| } |
| |
| static void raid1_exit (void) |
| { |
| unregister_md_personality (RAID1); |
| } |
| |
| module_init(raid1_init); |
| module_exit(raid1_exit); |
| MODULE_LICENSE("GPL"); |