| /* |
| * multipath.c : Multiple Devices driver for Linux |
| * |
| * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat |
| * |
| * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman |
| * |
| * MULTIPATH management functions. |
| * |
| * derived from raid1.c. |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2, or (at your option) |
| * any later version. |
| * |
| * You should have received a copy of the GNU General Public License |
| * (for example /usr/src/linux/COPYING); if not, write to the Free |
| * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| */ |
| |
| #include <linux/module.h> |
| #include <linux/slab.h> |
| #include <linux/raid/multipath.h> |
| #include <asm/atomic.h> |
| |
| #define MAJOR_NR MD_MAJOR |
| #define MD_DRIVER |
| #define MD_PERSONALITY |
| |
| #define MAX_WORK_PER_DISK 128 |
| |
| #define NR_RESERVED_BUFS 32 |
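/*
 * NR_RESERVED_BUFS multipath_bh structures are kept in a per-array pool so
 * that retries can make progress even when kmalloc() starts failing under
 * memory pressure; see multipath_alloc_mpbh() and multipath_grow_mpbh().
 */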
| |
| |
| /* |
| * The following can be used to debug the driver |
| */ |
| #define MULTIPATH_DEBUG 0 |
| |
| #if MULTIPATH_DEBUG |
| #define PRINTK(x...) printk(x) |
| #define inline |
| #define __inline__ |
| #else |
| #define PRINTK(x...) do { } while (0) |
| #endif |
| |
| |
| static mdk_personality_t multipath_personality; |
| static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; |
static struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;
| |
| static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state); |
| |
| |
| |
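/*
 * Allocate a multipath_bh: take one from the pre-allocated pool if
 * possible, otherwise fall back to kmalloc(GFP_NOIO).  If both fail,
 * mark the pool as blocked and sleep until it has been refilled past
 * NR_RESERVED_BUFS/2 entries.
 */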
| static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf) |
| { |
| struct multipath_bh *mp_bh = NULL; |
| |
| do { |
| md_spin_lock_irq(&conf->device_lock); |
| if (!conf->freer1_blocked && conf->freer1) { |
| mp_bh = conf->freer1; |
| conf->freer1 = mp_bh->next_mp; |
| conf->freer1_cnt--; |
| mp_bh->next_mp = NULL; |
| mp_bh->state = (1 << MPBH_PreAlloc); |
| mp_bh->bh_req.b_state = 0; |
| } |
| md_spin_unlock_irq(&conf->device_lock); |
| if (mp_bh) |
| return mp_bh; |
| mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh), |
| GFP_NOIO); |
| if (mp_bh) { |
| memset(mp_bh, 0, sizeof(*mp_bh)); |
| return mp_bh; |
| } |
| conf->freer1_blocked = 1; |
| wait_disk_event(conf->wait_buffer, |
| !conf->freer1_blocked || |
| conf->freer1_cnt > NR_RESERVED_BUFS/2 |
| ); |
| conf->freer1_blocked = 0; |
| } while (1); |
| } |
| |
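/*
 * Free a multipath_bh: pre-allocated ones go back on the pool and wake
 * up any sleeper in multipath_alloc_mpbh(); kmalloc'ed ones are kfree'd.
 */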
| static inline void multipath_free_mpbh(struct multipath_bh *mp_bh) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); |
| |
| if (test_bit(MPBH_PreAlloc, &mp_bh->state)) { |
| unsigned long flags; |
| spin_lock_irqsave(&conf->device_lock, flags); |
| mp_bh->next_mp = conf->freer1; |
| conf->freer1 = mp_bh; |
| conf->freer1_cnt++; |
| spin_unlock_irqrestore(&conf->device_lock, flags); |
| wake_up(&conf->wait_buffer); |
| } else { |
| kfree(mp_bh); |
| } |
| } |
| |
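/*
 * Try to pre-allocate cnt multipath_bh structures into the pool;
 * returns the number actually allocated.
 */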
| static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt) |
| { |
| int i = 0; |
| |
| while (i < cnt) { |
| struct multipath_bh *mp_bh; |
| mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL); |
| if (!mp_bh) |
| break; |
| memset(mp_bh, 0, sizeof(*mp_bh)); |
| set_bit(MPBH_PreAlloc, &mp_bh->state); |
| mp_bh->mddev = conf->mddev; |
| |
| multipath_free_mpbh(mp_bh); |
| i++; |
| } |
| return i; |
| } |
| |
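/*
 * Release every buffer left in the pre-allocated pool (used on the
 * stop and error paths).
 */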
| static void multipath_shrink_mpbh(multipath_conf_t *conf) |
| { |
| md_spin_lock_irq(&conf->device_lock); |
| while (conf->freer1) { |
| struct multipath_bh *mp_bh = conf->freer1; |
| conf->freer1 = mp_bh->next_mp; |
| conf->freer1_cnt--; |
| kfree(mp_bh); |
| } |
| md_spin_unlock_irq(&conf->device_lock); |
| } |
| |
| |
| static int multipath_map (mddev_t *mddev, kdev_t *rdev) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| int i, disks = MD_SB_DISKS; |
| |
| /* |
	 * Later we will do read balancing on the read side;
	 * for now we use the first available disk.
| */ |
| |
| for (i = 0; i < disks; i++) { |
| if (conf->multipaths[i].operational) { |
| *rdev = conf->multipaths[i].dev; |
| return (0); |
| } |
| } |
| |
| printk (KERN_ERR "multipath_map(): no more operational IO paths?\n"); |
| return (-1); |
| } |
| |
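/*
 * Queue a failed request at the tail of the global retry list and wake
 * up the multipathd thread, which will re-map it to a working path.
 */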
| static void multipath_reschedule_retry (struct multipath_bh *mp_bh) |
| { |
| unsigned long flags; |
| mddev_t *mddev = mp_bh->mddev; |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| |
| md_spin_lock_irqsave(&retry_list_lock, flags); |
| if (multipath_retry_list == NULL) |
| multipath_retry_tail = &multipath_retry_list; |
| *multipath_retry_tail = mp_bh; |
| multipath_retry_tail = &mp_bh->next_mp; |
| mp_bh->next_mp = NULL; |
| md_spin_unlock_irqrestore(&retry_list_lock, flags); |
| md_wakeup_thread(conf->thread); |
| } |
| |
| |
| /* |
| * multipath_end_bh_io() is called when we have finished servicing a multipathed |
| * operation and are ready to return a success/failure code to the buffer |
| * cache layer. |
| */ |
| static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate) |
| { |
| struct buffer_head *bh = mp_bh->master_bh; |
| |
| bh->b_end_io(bh, uptodate); |
| multipath_free_mpbh(mp_bh); |
| } |
| |
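/*
 * Completion handler for the per-path request: on success complete the
 * master buffer_head, on error report the path via md_error() and hand
 * the request to multipathd for a retry on another path.
 */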
| void multipath_end_request (struct buffer_head *bh, int uptodate) |
| { |
| struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private); |
| |
| /* |
| * this branch is our 'one multipath IO has finished' event handler: |
| */ |
| if (!uptodate) |
| md_error (mp_bh->mddev, bh->b_dev); |
| else |
| /* |
		 * Set MPBH_Uptodate in our master buffer_head, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other multipathed buffer fails.
| * |
| * The 'master' represents the complex operation to |
| * user-side. So if something waits for IO, then it will |
| * wait for the 'master' buffer_head. |
| */ |
| set_bit (MPBH_Uptodate, &mp_bh->state); |
| |
| |
| if (uptodate) { |
| multipath_end_bh_io(mp_bh, uptodate); |
| return; |
| } |
| /* |
| * oops, IO error: |
| */ |
| printk(KERN_ERR "multipath: %s: rescheduling block %lu\n", |
| partition_name(bh->b_dev), bh->b_blocknr); |
| multipath_reschedule_retry(mp_bh); |
| return; |
| } |
| |
| /* |
| * This routine returns the disk from which the requested read should |
| * be done. |
| */ |
| |
| static int multipath_read_balance (multipath_conf_t *conf) |
| { |
| int disk; |
| |
| for (disk = 0; disk < conf->raid_disks; disk++) |
| if (conf->multipaths[disk].operational) |
| return disk; |
| BUG(); |
| return 0; |
| } |
| |
| static int multipath_make_request (mddev_t *mddev, int rw, |
| struct buffer_head * bh) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| struct buffer_head *bh_req; |
| struct multipath_bh * mp_bh; |
| struct multipath_info *multipath; |
| |
| if (!buffer_locked(bh)) |
| BUG(); |
| |
| /* |
| * make_request() can abort the operation when READA is being |
| * used and no empty request is available. |
| * |
	 * Currently, just replace the command with READ.
| */ |
| if (rw == READA) |
| rw = READ; |
| |
| mp_bh = multipath_alloc_mpbh (conf); |
| |
| mp_bh->master_bh = bh; |
| mp_bh->mddev = mddev; |
| mp_bh->cmd = rw; |
| |
| /* |
| * read balancing logic: |
| */ |
| multipath = conf->multipaths + multipath_read_balance(conf); |
| |
| bh_req = &mp_bh->bh_req; |
| memcpy(bh_req, bh, sizeof(*bh)); |
| bh_req->b_blocknr = bh->b_rsector; |
| bh_req->b_dev = multipath->dev; |
| bh_req->b_rdev = multipath->dev; |
| /* bh_req->b_rsector = bh->n_rsector; */ |
| bh_req->b_end_io = multipath_end_request; |
| bh_req->b_private = mp_bh; |
| generic_make_request (rw, bh_req); |
| return 0; |
| } |
| |
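/*
 * Status printout for /proc/mdstat: "[<raid_disks>/<working_disks>]"
 * followed by one 'U' (operational) or '_' (failed) flag per path,
 * e.g. " [1/1] [U]".
 */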
| static void multipath_status (struct seq_file *seq, mddev_t *mddev) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| int i; |
| |
| seq_printf (seq, " [%d/%d] [", conf->raid_disks, |
| conf->working_disks); |
| for (i = 0; i < conf->raid_disks; i++) |
| seq_printf (seq, "%s", |
| conf->multipaths[i].operational ? "U" : "_"); |
| seq_printf (seq, "]"); |
| } |
| |
| #define LAST_DISK KERN_ALERT \ |
| "multipath: only one IO path left and IO error.\n" |
| |
| #define NO_SPARE_DISK KERN_ALERT \ |
| "multipath: no spare IO path left!\n" |
| |
| #define DISK_FAILED KERN_ALERT \ |
| "multipath: IO failure on %s, disabling IO path. \n" \ |
| " Operation continuing on %d IO paths.\n" |
| |
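/*
 * Mark a path as failed in both the superblock and our private
 * bookkeeping, and poke the daemon so the superblock gets written out.
 */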
| static void mark_disk_bad (mddev_t *mddev, int failed) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| struct multipath_info *multipath = conf->multipaths+failed; |
| mdp_super_t *sb = mddev->sb; |
| |
| multipath->operational = 0; |
| mark_disk_faulty(sb->disks+multipath->number); |
| mark_disk_nonsync(sb->disks+multipath->number); |
| mark_disk_inactive(sb->disks+multipath->number); |
| sb->active_disks--; |
| sb->working_disks--; |
| sb->failed_disks++; |
| mddev->sb_dirty = 1; |
| md_wakeup_thread(conf->thread); |
| conf->working_disks--; |
| printk (DISK_FAILED, partition_name (multipath->dev), |
| conf->working_disks); |
| } |
| |
| /* |
| * Careful, this can execute in IRQ contexts as well! |
| */ |
| static int multipath_error (mddev_t *mddev, kdev_t dev) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| struct multipath_info * multipaths = conf->multipaths; |
| int disks = MD_SB_DISKS; |
| int other_paths = 1; |
| int i; |
| |
| if (conf->working_disks == 1) { |
| other_paths = 0; |
| for (i = 0; i < disks; i++) { |
| if (multipaths[i].spare) { |
| other_paths = 1; |
| break; |
| } |
| } |
| } |
| |
| if (!other_paths) { |
| /* |
| * Uh oh, we can do nothing if this is our last path, but |
| * first check if this is a queued request for a device |
| * which has just failed. |
| */ |
| for (i = 0; i < disks; i++) { |
| if (multipaths[i].dev==dev && !multipaths[i].operational) |
| return 0; |
| } |
| printk (LAST_DISK); |
| } else { |
| /* |
| * Mark disk as unusable |
| */ |
| for (i = 0; i < disks; i++) { |
| if (multipaths[i].dev==dev && multipaths[i].operational) { |
| mark_disk_bad(mddev, i); |
| break; |
| } |
| } |
| if (!conf->working_disks) { |
| int err = 1; |
| mdp_disk_t *spare; |
| mdp_super_t *sb = mddev->sb; |
| |
| spare = get_spare(mddev); |
| if (spare) { |
| err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE); |
| printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare)); |
| } |
| if (!err && !disk_faulty(spare)) { |
| multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); |
| mark_disk_sync(spare); |
| mark_disk_active(spare); |
| sb->active_disks++; |
| sb->spare_disks--; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| #undef LAST_DISK |
| #undef NO_SPARE_DISK |
| #undef DISK_FAILED |
| |
| |
| static void print_multipath_conf (multipath_conf_t *conf) |
| { |
| int i; |
| struct multipath_info *tmp; |
| |
| printk("MULTIPATH conf printout:\n"); |
| if (!conf) { |
| printk("(conf==NULL)\n"); |
| return; |
| } |
| printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, |
| conf->raid_disks, conf->nr_disks); |
| |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| tmp = conf->multipaths + i; |
| if (tmp->spare || tmp->operational || tmp->number || |
| tmp->raid_disk || tmp->used_slot) |
| printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", |
| i, tmp->spare,tmp->operational, |
| tmp->number,tmp->raid_disk,tmp->used_slot, |
| partition_name(tmp->dev)); |
| } |
| } |
| |
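/*
 * Disk-operation dispatcher for the MD core: activates, write-enables or
 * deactivates spare paths (DISKOP_SPARE_*) and hot-adds or hot-removes
 * paths (DISKOP_HOT_*).  Takes conf->device_lock for the duration.
 */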
| static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state) |
| { |
| int err = 0; |
| int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; |
| multipath_conf_t *conf = mddev->private; |
| struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; |
| mdp_super_t *sb = mddev->sb; |
| mdp_disk_t *failed_desc, *spare_desc, *added_desc; |
| mdk_rdev_t *spare_rdev, *failed_rdev; |
| |
| print_multipath_conf(conf); |
| md_spin_lock_irq(&conf->device_lock); |
| /* |
| * find the disk ... |
| */ |
| switch (state) { |
| |
| case DISKOP_SPARE_ACTIVE: |
| |
| /* |
| * Find the failed disk within the MULTIPATH configuration ... |
		 * (this can only be in the first conf->raid_disks part)
| */ |
| for (i = 0; i < conf->raid_disks; i++) { |
| tmp = conf->multipaths + i; |
| if ((!tmp->operational && !tmp->spare) || |
| !tmp->used_slot) { |
| failed_disk = i; |
| break; |
| } |
| } |
| /* |
| * When we activate a spare disk we _must_ have a disk in |
| * the lower (active) part of the array to replace. |
| */ |
| if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| /* fall through */ |
| |
| case DISKOP_SPARE_WRITE: |
| case DISKOP_SPARE_INACTIVE: |
| |
| /* |
| * Find the spare disk ... (can only be in the 'high' |
| * area of the array) |
| */ |
| for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { |
| tmp = conf->multipaths + i; |
| if (tmp->spare && tmp->number == (*d)->number) { |
| spare_disk = i; |
| break; |
| } |
| } |
| if (spare_disk == -1) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| break; |
| |
| case DISKOP_HOT_REMOVE_DISK: |
| |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| tmp = conf->multipaths + i; |
| if (tmp->used_slot && (tmp->number == (*d)->number)) { |
| if (tmp->operational) { |
| printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number); |
| err = -EBUSY; |
| goto abort; |
| } |
| removed_disk = i; |
| break; |
| } |
| } |
| if (removed_disk == -1) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| break; |
| |
| case DISKOP_HOT_ADD_DISK: |
| |
| for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { |
| tmp = conf->multipaths + i; |
| if (!tmp->used_slot) { |
| added_disk = i; |
| break; |
| } |
| } |
| if (added_disk == -1) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| break; |
| } |
| |
| switch (state) { |
| /* |
| * Switch the spare disk to write-only mode: |
| */ |
| case DISKOP_SPARE_WRITE: |
| sdisk = conf->multipaths + spare_disk; |
| sdisk->operational = 1; |
| break; |
| /* |
| * Deactivate a spare disk: |
| */ |
| case DISKOP_SPARE_INACTIVE: |
| sdisk = conf->multipaths + spare_disk; |
| sdisk->operational = 0; |
| break; |
| /* |
| * Activate (mark read-write) the (now sync) spare disk, |
	 * which means we switch its 'raid position' (->raid_disk)
| * with the failed disk. (only the first 'conf->nr_disks' |
| * slots are used for 'real' disks and we must preserve this |
| * property) |
| */ |
| case DISKOP_SPARE_ACTIVE: |
| sdisk = conf->multipaths + spare_disk; |
| fdisk = conf->multipaths + failed_disk; |
| |
| spare_desc = &sb->disks[sdisk->number]; |
| failed_desc = &sb->disks[fdisk->number]; |
| |
| if (spare_desc != *d) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (spare_desc->raid_disk != sdisk->raid_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (sdisk->raid_disk != spare_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (failed_desc->raid_disk != fdisk->raid_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| if (fdisk->raid_disk != failed_disk) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| /* |
| * do the switch finally |
| */ |
| spare_rdev = find_rdev_nr(mddev, spare_desc->number); |
| failed_rdev = find_rdev_nr(mddev, failed_desc->number); |
| xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr); |
| spare_rdev->alias_device = 0; |
| failed_rdev->alias_device = 1; |
| |
| xchg_values(*spare_desc, *failed_desc); |
| xchg_values(*fdisk, *sdisk); |
| |
| /* |
| * (careful, 'failed' and 'spare' are switched from now on) |
| * |
| * we want to preserve linear numbering and we want to |
| * give the proper raid_disk number to the now activated |
| * disk. (this means we switch back these values) |
| */ |
| |
| xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); |
| xchg_values(sdisk->raid_disk, fdisk->raid_disk); |
| xchg_values(spare_desc->number, failed_desc->number); |
| xchg_values(sdisk->number, fdisk->number); |
| |
| *d = failed_desc; |
| |
| if (sdisk->dev == MKDEV(0,0)) |
| sdisk->used_slot = 0; |
| /* |
| * this really activates the spare. |
| */ |
| fdisk->spare = 0; |
| |
| /* |
| * if we activate a spare, we definitely replace a |
| * non-operational disk slot in the 'low' area of |
| * the disk array. |
| */ |
| |
| conf->working_disks++; |
| |
| break; |
| |
| case DISKOP_HOT_REMOVE_DISK: |
| rdisk = conf->multipaths + removed_disk; |
| |
| if (rdisk->spare && (removed_disk < conf->raid_disks)) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| rdisk->dev = MKDEV(0,0); |
| rdisk->used_slot = 0; |
| conf->nr_disks--; |
| break; |
| |
| case DISKOP_HOT_ADD_DISK: |
| adisk = conf->multipaths + added_disk; |
| added_desc = *d; |
| |
| if (added_disk != added_desc->number) { |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| |
| adisk->number = added_desc->number; |
| adisk->raid_disk = added_desc->raid_disk; |
| adisk->dev = MKDEV(added_desc->major,added_desc->minor); |
| |
| adisk->operational = 0; |
| adisk->spare = 1; |
| adisk->used_slot = 1; |
| conf->nr_disks++; |
| |
| break; |
| |
| default: |
| MD_BUG(); |
| err = 1; |
| goto abort; |
| } |
| abort: |
| md_spin_unlock_irq(&conf->device_lock); |
| |
| print_multipath_conf(conf); |
| return err; |
| } |
| |
| |
| #define IO_ERROR KERN_ALERT \ |
| "multipath: %s: unrecoverable IO read error for block %lu\n" |
| |
| #define REDIRECT_SECTOR KERN_ERR \ |
| "multipath: %s: redirecting sector %lu to another IO path\n" |
| |
| /* |
| * This is a kernel thread which: |
| * |
| * 1. Retries failed read operations on working multipaths. |
 * 2. Updates the raid superblock when problems are encountered.
 * 3. Performs writes following reads for array synchronising.
| */ |
| |
| static void multipathd (void *data) |
| { |
| struct multipath_bh *mp_bh; |
| struct buffer_head *bh; |
| unsigned long flags; |
| mddev_t *mddev; |
| kdev_t dev; |
| |
| |
| for (;;) { |
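		/*
		 * Note the locking: the retry list lock is taken at the top
		 * of each iteration; the 'break' on an empty list leaves it
		 * held, and it is dropped after the loop.
		 */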
| md_spin_lock_irqsave(&retry_list_lock, flags); |
| mp_bh = multipath_retry_list; |
| if (!mp_bh) |
| break; |
| multipath_retry_list = mp_bh->next_mp; |
| md_spin_unlock_irqrestore(&retry_list_lock, flags); |
| |
| mddev = mp_bh->mddev; |
| if (mddev->sb_dirty) |
| md_update_sb(mddev); |
| bh = &mp_bh->bh_req; |
| dev = bh->b_dev; |
| |
| multipath_map (mddev, &bh->b_dev); |
| if (bh->b_dev == dev) { |
| printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); |
| multipath_end_bh_io(mp_bh, 0); |
| } else { |
| printk (REDIRECT_SECTOR, |
| partition_name(bh->b_dev), bh->b_blocknr); |
| bh->b_rdev = bh->b_dev; |
| bh->b_rsector = bh->b_blocknr; |
| generic_make_request (mp_bh->cmd, bh); |
| } |
| } |
| md_spin_unlock_irqrestore(&retry_list_lock, flags); |
| } |
| #undef IO_ERROR |
| #undef REDIRECT_SECTOR |
| |
| /* |
| * This will catch the scenario in which one of the multipaths was |
| * mounted as a normal device rather than as a part of a raid set. |
| * |
 * check_consistency is very personality-dependent, e.g. RAID5 cannot
| * do this check, it uses another method. |
| */ |
| static int __check_consistency (mddev_t *mddev, int row) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| int disks = MD_SB_DISKS; |
| kdev_t dev; |
| struct buffer_head *bh = NULL; |
| int i, rc = 0; |
| char *buffer = NULL; |
| |
| for (i = 0; i < disks; i++) { |
| if (!conf->multipaths[i].operational) |
| continue; |
| printk("(checking disk %d)\n",i); |
| dev = conf->multipaths[i].dev; |
| set_blocksize(dev, 4096); |
| if ((bh = bread(dev, row / 4, 4096)) == NULL) |
| break; |
| if (!buffer) { |
| buffer = (char *) __get_free_page(GFP_KERNEL); |
| if (!buffer) |
| break; |
| memcpy(buffer, bh->b_data, 4096); |
| } else if (memcmp(buffer, bh->b_data, 4096)) { |
| rc = 1; |
| break; |
| } |
| bforget(bh); |
| fsync_dev(dev); |
| invalidate_buffers(dev); |
| bh = NULL; |
| } |
| if (buffer) |
| free_page((unsigned long) buffer); |
| if (bh) { |
| dev = bh->b_dev; |
| bforget(bh); |
| fsync_dev(dev); |
| invalidate_buffers(dev); |
| } |
| return rc; |
| } |
| |
static int check_consistency (mddev_t *mddev)
{
	/*
	 * we do not fail the array on an inconsistency currently, as it's
	 * perfectly possible to have an inconsistent array when it's
	 * freshly created. Only newly written data has to be consistent.
	 */
	if (__check_consistency(mddev, 0))
		return 0;

	return 0;
}
| |
| #define INVALID_LEVEL KERN_WARNING \ |
| "multipath: md%d: raid level not set to multipath IO (%d)\n" |
| |
| #define NO_SB KERN_ERR \ |
| "multipath: disabled IO path %s (couldn't access raid superblock)\n" |
| |
| #define ERRORS KERN_ERR \ |
| "multipath: disabled IO path %s (errors detected)\n" |
| |
| #define NOT_IN_SYNC KERN_ERR \ |
| "multipath: making IO path %s a spare path (not in sync)\n" |
| |
| #define INCONSISTENT KERN_ERR \ |
| "multipath: disabled IO path %s (inconsistent descriptor)\n" |
| |
| #define ALREADY_RUNNING KERN_ERR \ |
| "multipath: disabled IO path %s (multipath %d already operational)\n" |
| |
| #define OPERATIONAL KERN_INFO \ |
| "multipath: device %s operational as IO path %d\n" |
| |
| #define MEM_ERROR KERN_ERR \ |
| "multipath: couldn't allocate memory for md%d\n" |
| |
| #define SPARE KERN_INFO \ |
| "multipath: spare IO path %s\n" |
| |
| #define NONE_OPERATIONAL KERN_ERR \ |
| "multipath: no operational IO paths for md%d\n" |
| |
| #define SB_DIFFERENCES KERN_ERR \ |
| "multipath: detected IO path differences!\n" |
| |
| #define ARRAY_IS_ACTIVE KERN_INFO \ |
| "multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n" |
| |
| #define THREAD_ERROR KERN_ERR \ |
| "multipath: couldn't allocate thread for md%d\n" |
| |
| static int multipath_run (mddev_t *mddev) |
| { |
| multipath_conf_t *conf; |
| int i, j, disk_idx; |
| struct multipath_info *disk, *disk2; |
| mdp_super_t *sb = mddev->sb; |
| mdp_disk_t *desc, *desc2; |
| mdk_rdev_t *rdev, *def_rdev = NULL; |
| struct md_list_head *tmp; |
| int num_rdevs = 0; |
| |
| MOD_INC_USE_COUNT; |
| |
| if (sb->level != -4) { |
| printk(INVALID_LEVEL, mdidx(mddev), sb->level); |
| goto out; |
| } |
| /* |
| * copy the already verified devices into our private MULTIPATH |
| * bookkeeping area. [whatever we allocate in multipath_run(), |
| * should be freed in multipath_stop()] |
| */ |
| |
| conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); |
| mddev->private = conf; |
| if (!conf) { |
| printk(MEM_ERROR, mdidx(mddev)); |
| goto out; |
| } |
| memset(conf, 0, sizeof(*conf)); |
| |
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			/* this is a "should never happen" case and if it */
			/* ever does happen, a continue; won't help */
			printk(ERRORS, partition_name(rdev->dev));
			continue;
		}
		if (!rdev->sb) {
			/* this is a "should never happen" case and if it */
			/* ever does happen, a continue; won't help */
			MD_BUG();
			continue;
		}
| if (rdev->desc_nr == -1) { |
| MD_BUG(); |
| continue; |
| } |
| |
| desc = &sb->disks[rdev->desc_nr]; |
| disk_idx = desc->raid_disk; |
| disk = conf->multipaths + disk_idx; |
| |
| if (!disk_sync(desc)) |
| printk(NOT_IN_SYNC, partition_name(rdev->dev)); |
| |
| /* |
| * Mark all disks as spare to start with, then pick our |
| * active disk. If we have a disk that is marked active |
| * in the sb, then use it, else use the first rdev. |
| */ |
| disk->number = desc->number; |
| disk->raid_disk = desc->raid_disk; |
| disk->dev = rdev->dev; |
| disk->operational = 0; |
| disk->spare = 1; |
| disk->used_slot = 1; |
| mark_disk_sync(desc); |
| |
| if (disk_active(desc)) { |
			if (!conf->working_disks) {
| printk(OPERATIONAL, partition_name(rdev->dev), |
| desc->raid_disk); |
| disk->operational = 1; |
| disk->spare = 0; |
| conf->working_disks++; |
| def_rdev = rdev; |
| } else { |
| mark_disk_spare(desc); |
| } |
| } else |
| mark_disk_spare(desc); |
| |
		if (!num_rdevs++)
			def_rdev = rdev;
	}
	if (!conf->working_disks && num_rdevs) {
| desc = &sb->disks[def_rdev->desc_nr]; |
| disk = conf->multipaths + desc->raid_disk; |
| printk(OPERATIONAL, partition_name(def_rdev->dev), |
| disk->raid_disk); |
| disk->operational = 1; |
| disk->spare = 0; |
| conf->working_disks++; |
| mark_disk_active(desc); |
| } |
| /* |
| * Make sure our active path is in desc spot 0 |
| */ |
	if (def_rdev->desc_nr != 0) {
| rdev = find_rdev_nr(mddev, 0); |
| desc = &sb->disks[def_rdev->desc_nr]; |
| desc2 = sb->disks; |
| disk = conf->multipaths + desc->raid_disk; |
| disk2 = conf->multipaths + desc2->raid_disk; |
| xchg_values(*desc2,*desc); |
| xchg_values(*disk2,*disk); |
| xchg_values(desc2->number, desc->number); |
| xchg_values(disk2->number, disk->number); |
| xchg_values(desc2->raid_disk, desc->raid_disk); |
| xchg_values(disk2->raid_disk, disk->raid_disk); |
		if (rdev) {
| xchg_values(def_rdev->desc_nr,rdev->desc_nr); |
| } else { |
| def_rdev->desc_nr = 0; |
| } |
| } |
| conf->raid_disks = sb->raid_disks = sb->active_disks = 1; |
| conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs; |
| sb->failed_disks = 0; |
| sb->spare_disks = num_rdevs - 1; |
| mddev->sb_dirty = 1; |
| conf->mddev = mddev; |
| conf->device_lock = MD_SPIN_LOCK_UNLOCKED; |
| |
| init_waitqueue_head(&conf->wait_buffer); |
| |
| if (!conf->working_disks) { |
| printk(NONE_OPERATIONAL, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| |
| |
| /* pre-allocate some buffer_head structures. |
| * As a minimum, 1 mpbh and raid_disks buffer_heads |
| * would probably get us by in tight memory situations, |
| * but a few more is probably a good idea. |
| * For now, try NR_RESERVED_BUFS mpbh and |
	 * NR_RESERVED_BUFS*raid_disks buffer_heads.
	 * This will allow at least NR_RESERVED_BUFS concurrent
	 * reads or writes even if kmalloc starts failing.
| */ |
| if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) { |
| printk(MEM_ERROR, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| |
| if ((sb->state & (1 << MD_SB_CLEAN))) { |
| /* |
| * we do sanity checks even if the device says |
| * it's clean ... |
| */ |
| if (check_consistency(mddev)) { |
| printk(SB_DIFFERENCES); |
| sb->state &= ~(1 << MD_SB_CLEAN); |
| } |
| } |
| |
| { |
| const char * name = "multipathd"; |
| |
| conf->thread = md_register_thread(multipathd, conf, name); |
| if (!conf->thread) { |
| printk(THREAD_ERROR, mdidx(mddev)); |
| goto out_free_conf; |
| } |
| } |
| |
| /* |
| * Regenerate the "device is in sync with the raid set" bit for |
| * each device. |
| */ |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| mark_disk_nonsync(sb->disks+i); |
| for (j = 0; j < sb->raid_disks; j++) { |
| if (sb->disks[i].number == conf->multipaths[j].number) |
| mark_disk_sync(sb->disks+i); |
| } |
| } |
| |
| printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, |
| sb->raid_disks, sb->spare_disks); |
| /* |
| * Ok, everything is just fine now |
| */ |
| return 0; |
| |
| out_free_conf: |
| multipath_shrink_mpbh(conf); |
| kfree(conf); |
| mddev->private = NULL; |
| out: |
| MOD_DEC_USE_COUNT; |
| return -EIO; |
| } |
| |
| #undef INVALID_LEVEL |
| #undef NO_SB |
| #undef ERRORS |
| #undef NOT_IN_SYNC |
| #undef INCONSISTENT |
| #undef ALREADY_RUNNING |
| #undef OPERATIONAL |
| #undef SPARE |
| #undef NONE_OPERATIONAL |
| #undef SB_DIFFERENCES |
| #undef ARRAY_IS_ACTIVE |
| |
| static int multipath_stop (mddev_t *mddev) |
| { |
| multipath_conf_t *conf = mddev_to_conf(mddev); |
| |
| md_unregister_thread(conf->thread); |
| multipath_shrink_mpbh(conf); |
| kfree(conf); |
| mddev->private = NULL; |
| MOD_DEC_USE_COUNT; |
| return 0; |
| } |
| |
| static mdk_personality_t multipath_personality= |
| { |
| name: "multipath", |
| make_request: multipath_make_request, |
| run: multipath_run, |
| stop: multipath_stop, |
| status: multipath_status, |
| error_handler: multipath_error, |
| diskop: multipath_diskop, |
| }; |
| |
| static int md__init multipath_init (void) |
| { |
| return register_md_personality (MULTIPATH, &multipath_personality); |
| } |
| |
| static void multipath_exit (void) |
| { |
| unregister_md_personality (MULTIPATH); |
| } |
| |
| module_init(multipath_init); |
| module_exit(multipath_exit); |
| MODULE_LICENSE("GPL"); |