~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/drivers/md/md.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2    md.c : Multiple Devices driver for Linux
  3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
  4 
  5      completely rewritten, based on the MD driver code from Marc Zyngier
  6 
  7    Changes:
  8 
  9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
 10    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
 11    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
 12    - kmod support by: Cyrus Durgin
 13    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
 14    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
 15 
 16    - lots of fixes and improvements to the RAID1/RAID5 and generic
 17      RAID code (such as request based resynchronization):
 18 
 19      Neil Brown <neilb@cse.unsw.edu.au>.
 20 
 21    This program is free software; you can redistribute it and/or modify
 22    it under the terms of the GNU General Public License as published by
 23    the Free Software Foundation; either version 2, or (at your option)
 24    any later version.
 25 
 26    You should have received a copy of the GNU General Public License
 27    (for example /usr/src/linux/COPYING); if not, write to the Free
 28    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 29 */
 30 
 31 #include <linux/module.h>
 32 #include <linux/config.h>
 33 #include <linux/raid/md.h>
 34 #include <linux/sysctl.h>
 35 #include <linux/raid/xor.h>
 36 #include <linux/devfs_fs_kernel.h>
 37 
 38 #include <linux/init.h>
 39 
 40 #ifdef CONFIG_KMOD
 41 #include <linux/kmod.h>
 42 #endif
 43 
 44 #define __KERNEL_SYSCALLS__
 45 #include <linux/unistd.h>
 46 
 47 #include <asm/unaligned.h>
 48 
 49 extern asmlinkage int sys_sched_yield(void);
 50 extern asmlinkage long sys_setsid(void);
 51 
 52 #define MAJOR_NR MD_MAJOR
 53 #define MD_DRIVER
 54 
 55 #include <linux/blk.h>
 56 
 57 #define DEBUG 0
 58 #if DEBUG
 59 # define dprintk(x...) printk(x)
 60 #else
 61 # define dprintk(x...) do { } while(0)
 62 #endif
 63 
 64 static mdk_personality_t *pers[MAX_PERSONALITY];
 65 
 66 /*
 67  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 68  * is 100 KB/sec, so the extra system load does not show up that much.
 69  * Increase it if you want to have more _guaranteed_ speed. Note that
 70  * the RAID driver will use the maximum available bandwidth if the IO
 71  * subsystem is idle. There is also an 'absolute maximum' reconstruction
 72  * speed limit - in case reconstruction slows down your system despite
 73  * idle IO detection.
 74  *
 75  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 76  */
 77 
 78 static int sysctl_speed_limit_min = 100;
 79 static int sysctl_speed_limit_max = 100000;
 80 
 81 static struct ctl_table_header *raid_table_header;
 82 
 83 static ctl_table raid_table[] = {
 84         {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
 85          &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
 86         {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
 87          &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
 88         {0}
 89 };
 90 
 91 static ctl_table raid_dir_table[] = {
 92         {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
 93         {0}
 94 };
 95 
 96 static ctl_table raid_root_table[] = {
 97         {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
 98         {0}
 99 };
100 
101 /*
102  * these have to be allocated separately because external
103  * subsystems want to have a pre-defined structure
104  */
105 struct hd_struct md_hd_struct[MAX_MD_DEVS];
106 static int md_blocksizes[MAX_MD_DEVS];
107 static int md_hardsect_sizes[MAX_MD_DEVS];
108 static int md_maxreadahead[MAX_MD_DEVS];
109 static mdk_thread_t *md_recovery_thread;
110 
111 int md_size[MAX_MD_DEVS];
112 
113 extern struct block_device_operations md_fops;
114 static devfs_handle_t devfs_handle;
115 
116 static struct gendisk md_gendisk=
117 {
118         major: MD_MAJOR,
119         major_name: "md",
120         minor_shift: 0,
121         max_p: 1,
122         part: md_hd_struct,
123         sizes: md_size,
124         nr_real: MAX_MD_DEVS,
125         real_devices: NULL,
126         next: NULL,
127         fops: &md_fops,
128 };
129 
130 /*
131  * Enables to iterate over all existing md arrays
132  */
133 static MD_LIST_HEAD(all_mddevs);
134 
135 /*
136  * The mapping between kdev and mddev is not necessarily a simple
137  * one! Eg. HSM uses several sub-devices to implement Logical
138  * Volumes. All these sub-devices map to the same mddev.
139  */
140 dev_mapping_t mddev_map[MAX_MD_DEVS];
141 
142 void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
143 {
144         unsigned int minor = MINOR(dev);
145 
146         if (MAJOR(dev) != MD_MAJOR) {
147                 MD_BUG();
148                 return;
149         }
150         if (mddev_map[minor].mddev != NULL) {
151                 MD_BUG();
152                 return;
153         }
154         mddev_map[minor].mddev = mddev;
155         mddev_map[minor].data = data;
156 }
157 
158 void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
159 {
160         unsigned int minor = MINOR(dev);
161 
162         if (MAJOR(dev) != MD_MAJOR) {
163                 MD_BUG();
164                 return;
165         }
166         if (mddev_map[minor].mddev != mddev) {
167                 MD_BUG();
168                 return;
169         }
170         mddev_map[minor].mddev = NULL;
171         mddev_map[minor].data = NULL;
172 }
173 
174 static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
175 {
176         mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
177 
178         if (mddev && mddev->pers)
179                 return mddev->pers->make_request(mddev, rw, bh);
180         else {
181                 buffer_IO_error(bh);
182                 return 0;
183         }
184 }
185 
186 static mddev_t * alloc_mddev (kdev_t dev)
187 {
188         mddev_t *mddev;
189 
190         if (MAJOR(dev) != MD_MAJOR) {
191                 MD_BUG();
192                 return 0;
193         }
194         mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
195         if (!mddev)
196                 return NULL;
197                 
198         memset(mddev, 0, sizeof(*mddev));
199 
200         mddev->__minor = MINOR(dev);
201         init_MUTEX(&mddev->reconfig_sem);
202         init_MUTEX(&mddev->recovery_sem);
203         init_MUTEX(&mddev->resync_sem);
204         MD_INIT_LIST_HEAD(&mddev->disks);
205         MD_INIT_LIST_HEAD(&mddev->all_mddevs);
206         atomic_set(&mddev->active, 0);
207 
208         /*
209          * The 'base' mddev is the one with data NULL.
210          * personalities can create additional mddevs
211          * if necessary.
212          */
213         add_mddev_mapping(mddev, dev, 0);
214         md_list_add(&mddev->all_mddevs, &all_mddevs);
215 
216         MOD_INC_USE_COUNT;
217 
218         return mddev;
219 }
220 
221 struct gendisk * find_gendisk (kdev_t dev)
222 {
223         struct gendisk *tmp = gendisk_head;
224 
225         while (tmp != NULL) {
226                 if (tmp->major == MAJOR(dev))
227                         return (tmp);
228                 tmp = tmp->next;
229         }
230         return (NULL);
231 }
232 
233 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
234 {
235         mdk_rdev_t * rdev;
236         struct md_list_head *tmp;
237 
238         ITERATE_RDEV(mddev,rdev,tmp) {
239                 if (rdev->desc_nr == nr)
240                         return rdev;
241         }
242         return NULL;
243 }
244 
245 mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
246 {
247         struct md_list_head *tmp;
248         mdk_rdev_t *rdev;
249 
250         ITERATE_RDEV(mddev,rdev,tmp) {
251                 if (rdev->dev == dev)
252                         return rdev;
253         }
254         return NULL;
255 }
256 
257 static MD_LIST_HEAD(device_names);
258 
259 char * partition_name (kdev_t dev)
260 {
261         struct gendisk *hd;
262         static char nomem [] = "<nomem>";
263         dev_name_t *dname;
264         struct md_list_head *tmp = device_names.next;
265 
266         while (tmp != &device_names) {
267                 dname = md_list_entry(tmp, dev_name_t, list);
268                 if (dname->dev == dev)
269                         return dname->name;
270                 tmp = tmp->next;
271         }
272 
273         dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
274 
275         if (!dname)
276                 return nomem;
277         /*
278          * ok, add this new device name to the list
279          */
280         hd = find_gendisk (dev);
281         dname->name = NULL;
282         if (hd)
283                 dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
284         if (!dname->name) {
285                 sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
286                 dname->name = dname->namebuf;
287         }
288 
289         dname->dev = dev;
290         MD_INIT_LIST_HEAD(&dname->list);
291         md_list_add(&dname->list, &device_names);
292 
293         return dname->name;
294 }
295 
296 static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
297                                                 int persistent)
298 {
299         unsigned int size = 0;
300 
301         if (blk_size[MAJOR(dev)])
302                 size = blk_size[MAJOR(dev)][MINOR(dev)];
303         if (persistent)
304                 size = MD_NEW_SIZE_BLOCKS(size);
305         return size;
306 }
307 
308 static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
309 {
310         unsigned int size;
311 
312         size = calc_dev_sboffset(dev, mddev, persistent);
313         if (!mddev->sb) {
314                 MD_BUG();
315                 return size;
316         }
317         if (mddev->sb->chunk_size)
318                 size &= ~(mddev->sb->chunk_size/1024 - 1);
319         return size;
320 }
321 
322 static unsigned int zoned_raid_size (mddev_t *mddev)
323 {
324         unsigned int mask;
325         mdk_rdev_t * rdev;
326         struct md_list_head *tmp;
327 
328         if (!mddev->sb) {
329                 MD_BUG();
330                 return -EINVAL;
331         }
332         /*
333          * do size and offset calculations.
334          */
335         mask = ~(mddev->sb->chunk_size/1024 - 1);
336 
337         ITERATE_RDEV(mddev,rdev,tmp) {
338                 rdev->size &= mask;
339                 md_size[mdidx(mddev)] += rdev->size;
340         }
341         return 0;
342 }
343 
344 /*
345  * We check whether all devices are numbered from 0 to nb_dev-1. The
346  * order is guaranteed even after device name changes.
347  *
348  * Some personalities (raid0, linear) use this. Personalities that
349  * provide data have to be able to deal with loss of individual
350  * disks, so they do their checking themselves.
351  */
352 int md_check_ordering (mddev_t *mddev)
353 {
354         int i, c;
355         mdk_rdev_t *rdev;
356         struct md_list_head *tmp;
357 
358         /*
359          * First, all devices must be fully functional
360          */
361         ITERATE_RDEV(mddev,rdev,tmp) {
362                 if (rdev->faulty) {
363                         printk("md: md%d's device %s faulty, aborting.\n",
364                                 mdidx(mddev), partition_name(rdev->dev));
365                         goto abort;
366                 }
367         }
368 
369         c = 0;
370         ITERATE_RDEV(mddev,rdev,tmp) {
371                 c++;
372         }
373         if (c != mddev->nb_dev) {
374                 MD_BUG();
375                 goto abort;
376         }
377         if (mddev->nb_dev != mddev->sb->raid_disks) {
378                 printk("md: md%d, array needs %d disks, has %d, aborting.\n",
379                         mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
380                 goto abort;
381         }
382         /*
383          * Now the numbering check
384          */
385         for (i = 0; i < mddev->nb_dev; i++) {
386                 c = 0;
387                 ITERATE_RDEV(mddev,rdev,tmp) {
388                         if (rdev->desc_nr == i)
389                                 c++;
390                 }
391                 if (!c) {
392                         printk("md: md%d, missing disk #%d, aborting.\n",
393                                 mdidx(mddev), i);
394                         goto abort;
395                 }
396                 if (c > 1) {
397                         printk("md: md%d, too many disks #%d, aborting.\n",
398                                 mdidx(mddev), i);
399                         goto abort;
400                 }
401         }
402         return 0;
403 abort:
404         return 1;
405 }
406 
407 static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
408 {
409         if (disk_active(disk)) {
410                 sb->working_disks--;
411         } else {
412                 if (disk_spare(disk)) {
413                         sb->spare_disks--;
414                         sb->working_disks--;
415                 } else  {
416                         sb->failed_disks--;
417                 }
418         }
419         sb->nr_disks--;
420         disk->major = 0;
421         disk->minor = 0;
422         mark_disk_removed(disk);
423 }
424 
425 #define BAD_MAGIC KERN_ERR \
426 "md: invalid raid superblock magic on %s\n"
427 
428 #define BAD_MINOR KERN_ERR \
429 "md: %s: invalid raid minor (%x)\n"
430 
431 #define OUT_OF_MEM KERN_ALERT \
432 "md: out of memory.\n"
433 
434 #define NO_SB KERN_ERR \
435 "md: disabled device %s, could not read superblock.\n"
436 
437 #define BAD_CSUM KERN_WARNING \
438 "md: invalid superblock checksum on %s\n"
439 
440 static int alloc_array_sb (mddev_t * mddev)
441 {
442         if (mddev->sb) {
443                 MD_BUG();
444                 return 0;
445         }
446 
447         mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
448         if (!mddev->sb)
449                 return -ENOMEM;
450         md_clear_page(mddev->sb);
451         return 0;
452 }
453 
454 static int alloc_disk_sb (mdk_rdev_t * rdev)
455 {
456         if (rdev->sb)
457                 MD_BUG();
458 
459         rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
460         if (!rdev->sb) {
461                 printk (OUT_OF_MEM);
462                 return -EINVAL;
463         }
464         md_clear_page(rdev->sb);
465 
466         return 0;
467 }
468 
469 static void free_disk_sb (mdk_rdev_t * rdev)
470 {
471         if (rdev->sb) {
472                 free_page((unsigned long) rdev->sb);
473                 rdev->sb = NULL;
474                 rdev->sb_offset = 0;
475                 rdev->size = 0;
476         } else {
477                 if (!rdev->faulty)
478                         MD_BUG();
479         }
480 }
481 
482 static void mark_rdev_faulty (mdk_rdev_t * rdev)
483 {
484         if (!rdev) {
485                 MD_BUG();
486                 return;
487         }
488         free_disk_sb(rdev);
489         rdev->faulty = 1;
490 }
491 
492 static int read_disk_sb (mdk_rdev_t * rdev)
493 {
494         int ret = -EINVAL;
495         struct buffer_head *bh = NULL;
496         kdev_t dev = rdev->dev;
497         mdp_super_t *sb;
498         unsigned long sb_offset;
499 
500         if (!rdev->sb) {
501                 MD_BUG();
502                 goto abort;
503         }       
504         
505         /*
506          * Calculate the position of the superblock,
507          * it's at the end of the disk
508          */
509         sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
510         rdev->sb_offset = sb_offset;
511         printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset);
512         fsync_dev(dev);
513         set_blocksize (dev, MD_SB_BYTES);
514         bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
515 
516         if (bh) {
517                 sb = (mdp_super_t *) bh->b_data;
518                 memcpy (rdev->sb, sb, MD_SB_BYTES);
519         } else {
520                 printk (NO_SB,partition_name(rdev->dev));
521                 goto abort;
522         }
523         printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
524         ret = 0;
525 abort:
526         if (bh)
527                 brelse (bh);
528         return ret;
529 }
530 
531 static unsigned int calc_sb_csum (mdp_super_t * sb)
532 {
533         unsigned int disk_csum, csum;
534 
535         disk_csum = sb->sb_csum;
536         sb->sb_csum = 0;
537         csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
538         sb->sb_csum = disk_csum;
539         return csum;
540 }
541 
542 /*
543  * Check one RAID superblock for generic plausibility
544  */
545 
546 static int check_disk_sb (mdk_rdev_t * rdev)
547 {
548         mdp_super_t *sb;
549         int ret = -EINVAL;
550 
551         sb = rdev->sb;
552         if (!sb) {
553                 MD_BUG();
554                 goto abort;
555         }
556 
557         if (sb->md_magic != MD_SB_MAGIC) {
558                 printk (BAD_MAGIC, partition_name(rdev->dev));
559                 goto abort;
560         }
561 
562         if (sb->md_minor >= MAX_MD_DEVS) {
563                 printk (BAD_MINOR, partition_name(rdev->dev),
564                                                         sb->md_minor);
565                 goto abort;
566         }
567 
568         if (calc_sb_csum(sb) != sb->sb_csum)
569                 printk(BAD_CSUM, partition_name(rdev->dev));
570         ret = 0;
571 abort:
572         return ret;
573 }
574 
575 static kdev_t dev_unit(kdev_t dev)
576 {
577         unsigned int mask;
578         struct gendisk *hd = find_gendisk(dev);
579 
580         if (!hd)
581                 return 0;
582         mask = ~((1 << hd->minor_shift) - 1);
583 
584         return MKDEV(MAJOR(dev), MINOR(dev) & mask);
585 }
586 
587 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
588 {
589         struct md_list_head *tmp;
590         mdk_rdev_t *rdev;
591 
592         ITERATE_RDEV(mddev,rdev,tmp)
593                 if (dev_unit(rdev->dev) == dev_unit(dev))
594                         return rdev;
595 
596         return NULL;
597 }
598 
599 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
600 {
601         struct md_list_head *tmp;
602         mdk_rdev_t *rdev;
603 
604         ITERATE_RDEV(mddev1,rdev,tmp)
605                 if (match_dev_unit(mddev2, rdev->dev))
606                         return 1;
607 
608         return 0;
609 }
610 
611 static MD_LIST_HEAD(all_raid_disks);
612 static MD_LIST_HEAD(pending_raid_disks);
613 
614 static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
615 {
616         mdk_rdev_t *same_pdev;
617 
618         if (rdev->mddev) {
619                 MD_BUG();
620                 return;
621         }
622         same_pdev = match_dev_unit(mddev, rdev->dev);
623         if (same_pdev)
624                 printk( KERN_WARNING
625 "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
626 "     protection against single-disk failure might be compromised.\n",
627                         mdidx(mddev), partition_name(rdev->dev),
628                                 partition_name(same_pdev->dev));
629                 
630         md_list_add(&rdev->same_set, &mddev->disks);
631         rdev->mddev = mddev;
632         mddev->nb_dev++;
633         printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
634 }
635 
636 static void unbind_rdev_from_array (mdk_rdev_t * rdev)
637 {
638         if (!rdev->mddev) {
639                 MD_BUG();
640                 return;
641         }
642         md_list_del(&rdev->same_set);
643         MD_INIT_LIST_HEAD(&rdev->same_set);
644         rdev->mddev->nb_dev--;
645         printk("unbind<%s,%d>\n", partition_name(rdev->dev),
646                                                  rdev->mddev->nb_dev);
647         rdev->mddev = NULL;
648 }
649 
650 /*
651  * prevent the device from being mounted, repartitioned or
652  * otherwise reused by a RAID array (or any other kernel
653  * subsystem), by opening the device. [simply getting an
654  * inode is not enough, the SCSI module usage code needs
655  * an explicit open() on the device]
656  */
657 static int lock_rdev (mdk_rdev_t *rdev)
658 {
659         int err = 0;
660         struct block_device *bdev;
661 
662         bdev = bdget(rdev->dev);
663         if (bdev == NULL)
664                 return -ENOMEM;
665         err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FILE);
666         if (!err) {
667                 rdev->bdev = bdev;
668         }
669         return err;
670 }
671 
672 static void unlock_rdev (mdk_rdev_t *rdev)
673 {
674         if (!rdev->bdev)
675                 MD_BUG();
676         blkdev_put(rdev->bdev, BDEV_FILE);
677         bdput(rdev->bdev);
678         rdev->bdev = NULL;
679 }
680 
681 static void export_rdev (mdk_rdev_t * rdev)
682 {
683         printk("export_rdev(%s)\n",partition_name(rdev->dev));
684         if (rdev->mddev)
685                 MD_BUG();
686         unlock_rdev(rdev);
687         free_disk_sb(rdev);
688         md_list_del(&rdev->all);
689         MD_INIT_LIST_HEAD(&rdev->all);
690         if (rdev->pending.next != &rdev->pending) {
691                 printk("(%s was pending)\n",partition_name(rdev->dev));
692                 md_list_del(&rdev->pending);
693                 MD_INIT_LIST_HEAD(&rdev->pending);
694         }
695         rdev->dev = 0;
696         rdev->faulty = 0;
697         kfree(rdev);
698 }
699 
700 static void kick_rdev_from_array (mdk_rdev_t * rdev)
701 {
702         unbind_rdev_from_array(rdev);
703         export_rdev(rdev);
704 }
705 
706 static void export_array (mddev_t *mddev)
707 {
708         struct md_list_head *tmp;
709         mdk_rdev_t *rdev;
710         mdp_super_t *sb = mddev->sb;
711 
712         if (mddev->sb) {
713                 mddev->sb = NULL;
714                 free_page((unsigned long) sb);
715         }
716 
717         ITERATE_RDEV(mddev,rdev,tmp) {
718                 if (!rdev->mddev) {
719                         MD_BUG();
720                         continue;
721                 }
722                 kick_rdev_from_array(rdev);
723         }
724         if (mddev->nb_dev)
725                 MD_BUG();
726 }
727 
728 static void free_mddev (mddev_t *mddev)
729 {
730         if (!mddev) {
731                 MD_BUG();
732                 return;
733         }
734 
735         export_array(mddev);
736         md_size[mdidx(mddev)] = 0;
737         md_hd_struct[mdidx(mddev)].nr_sects = 0;
738 
739         /*
740          * Make sure nobody else is using this mddev
741          * (careful, we rely on the global kernel lock here)
742          */
743         while (md_atomic_read(&mddev->resync_sem.count) != 1)
744                 schedule();
745         while (md_atomic_read(&mddev->recovery_sem.count) != 1)
746                 schedule();
747 
748         del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
749         md_list_del(&mddev->all_mddevs);
750         MD_INIT_LIST_HEAD(&mddev->all_mddevs);
751         kfree(mddev);
752         MOD_DEC_USE_COUNT;
753 }
754 
755 #undef BAD_CSUM
756 #undef BAD_MAGIC
757 #undef OUT_OF_MEM
758 #undef NO_SB
759 
760 static void print_desc(mdp_disk_t *desc)
761 {
762         printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
763                 partition_name(MKDEV(desc->major,desc->minor)),
764                 desc->major,desc->minor,desc->raid_disk,desc->state);
765 }
766 
767 static void print_sb(mdp_super_t *sb)
768 {
769         int i;
770 
771         printk("  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
772                 sb->major_version, sb->minor_version, sb->patch_version,
773                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
774                 sb->ctime);
775         printk("     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
776                 sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
777                 sb->layout, sb->chunk_size);
778         printk("     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
779                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
780                 sb->failed_disks, sb->spare_disks,
781                 sb->sb_csum, (unsigned long)sb->events_lo);
782 
783         for (i = 0; i < MD_SB_DISKS; i++) {
784                 mdp_disk_t *desc;
785 
786                 desc = sb->disks + i;
787                 printk("     D %2d: ", i);
788                 print_desc(desc);
789         }
790         printk("     THIS: ");
791         print_desc(&sb->this_disk);
792 
793 }
794 
795 static void print_rdev(mdk_rdev_t *rdev)
796 {
797         printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
798                 partition_name(rdev->dev), partition_name(rdev->old_dev),
799                 rdev->size, rdev->faulty, rdev->desc_nr);
800         if (rdev->sb) {
801                 printk("rdev superblock:\n");
802                 print_sb(rdev->sb);
803         } else
804                 printk("no rdev superblock!\n");
805 }
806 
807 void md_print_devices (void)
808 {
809         struct md_list_head *tmp, *tmp2;
810         mdk_rdev_t *rdev;
811         mddev_t *mddev;
812 
813         printk("\n");
814         printk("        **********************************\n");
815         printk("        * <COMPLETE RAID STATE PRINTOUT> *\n");
816         printk("        **********************************\n");
817         ITERATE_MDDEV(mddev,tmp) {
818                 printk("md%d: ", mdidx(mddev));
819 
820                 ITERATE_RDEV(mddev,rdev,tmp2)
821                         printk("<%s>", partition_name(rdev->dev));
822 
823                 if (mddev->sb) {
824                         printk(" array superblock:\n");
825                         print_sb(mddev->sb);
826                 } else
827                         printk(" no array superblock.\n");
828 
829                 ITERATE_RDEV(mddev,rdev,tmp2)
830                         print_rdev(rdev);
831         }
832         printk("        **********************************\n");
833         printk("\n");
834 }
835 
836 static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
837 {
838         int ret;
839         mdp_super_t *tmp1, *tmp2;
840 
841         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
842         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
843 
844         if (!tmp1 || !tmp2) {
845                 ret = 0;
846                 goto abort;
847         }
848 
849         *tmp1 = *sb1;
850         *tmp2 = *sb2;
851 
852         /*
853          * nr_disks is not constant
854          */
855         tmp1->nr_disks = 0;
856         tmp2->nr_disks = 0;
857 
858         if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
859                 ret = 0;
860         else
861                 ret = 1;
862 
863 abort:
864         if (tmp1)
865                 kfree(tmp1);
866         if (tmp2)
867                 kfree(tmp2);
868 
869         return ret;
870 }
871 
872 static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
873 {
874         if (    (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
875                 (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
876                 (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
877                 (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
878 
879                 return 1;
880 
881         return 0;
882 }
883 
884 static mdk_rdev_t * find_rdev_all (kdev_t dev)
885 {
886         struct md_list_head *tmp;
887         mdk_rdev_t *rdev;
888 
889         tmp = all_raid_disks.next;
890         while (tmp != &all_raid_disks) {
891                 rdev = md_list_entry(tmp, mdk_rdev_t, all);
892                 if (rdev->dev == dev)
893                         return rdev;
894                 tmp = tmp->next;
895         }
896         return NULL;
897 }
898 
899 #define GETBLK_FAILED KERN_ERR \
900 "md: getblk failed for device %s\n"
901 
902 static int write_disk_sb(mdk_rdev_t * rdev)
903 {
904         struct buffer_head *bh;
905         kdev_t dev;
906         unsigned long sb_offset, size;
907         mdp_super_t *sb;
908 
909         if (!rdev->sb) {
910                 MD_BUG();
911                 return -1;
912         }
913         if (rdev->faulty) {
914                 MD_BUG();
915                 return -1;
916         }
917         if (rdev->sb->md_magic != MD_SB_MAGIC) {
918                 MD_BUG();
919                 return -1;
920         }
921 
922         dev = rdev->dev;
923         sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
924         if (rdev->sb_offset != sb_offset) {
925                 printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
926                 goto skip;
927         }
928         /*
929          * If the disk went offline meanwhile and it's just a spare, then
930          * it's size has changed to zero silently, and the MD code does
931          * not yet know that it's faulty.
932          */
933         size = calc_dev_size(dev, rdev->mddev, 1);
934         if (size != rdev->size) {
935                 printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size);
936                 goto skip;
937         }
938 
939         printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
940         fsync_dev(dev);
941         set_blocksize(dev, MD_SB_BYTES);
942         bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
943         if (!bh) {
944                 printk(GETBLK_FAILED, partition_name(dev));
945                 return 1;
946         }
947         memset(bh->b_data,0,bh->b_size);
948         sb = (mdp_super_t *) bh->b_data;
949         memcpy(sb, rdev->sb, MD_SB_BYTES);
950 
951         mark_buffer_uptodate(bh, 1);
952         mark_buffer_dirty(bh);
953         ll_rw_block(WRITE, 1, &bh);
954         wait_on_buffer(bh);
955         brelse(bh);
956         fsync_dev(dev);
957 skip:
958         return 0;
959 }
960 #undef GETBLK_FAILED 
961 
962 static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
963 {
964         int i, ok = 0;
965         mdp_disk_t *desc;
966 
967         for (i = 0; i < MD_SB_DISKS; i++) {
968                 desc = mddev->sb->disks + i;
969 #if 0
970                 if (disk_faulty(desc)) {
971                         if (MKDEV(desc->major,desc->minor) == rdev->dev)
972                                 ok = 1;
973                         continue;
974                 }
975 #endif
976                 if (MKDEV(desc->major,desc->minor) == rdev->dev) {
977                         rdev->sb->this_disk = *desc;
978                         rdev->desc_nr = desc->number;
979                         ok = 1;
980                         break;
981                 }
982         }
983 
984         if (!ok) {
985                 MD_BUG();
986         }
987 }
988 
989 static int sync_sbs(mddev_t * mddev)
990 {
991         mdk_rdev_t *rdev;
992         mdp_super_t *sb;
993         struct md_list_head *tmp;
994 
995         ITERATE_RDEV(mddev,rdev,tmp) {
996                 if (rdev->faulty)
997                         continue;
998                 sb = rdev->sb;
999                 *sb = *mddev->sb;
1000                 set_this_disk(mddev, rdev);
1001                 sb->sb_csum = calc_sb_csum(sb);
1002         }
1003         return 0;
1004 }
1005 
1006 int md_update_sb(mddev_t * mddev)
1007 {
1008         int first, err, count = 100;
1009         struct md_list_head *tmp;
1010         mdk_rdev_t *rdev;
1011 
1012 repeat:
1013         mddev->sb->utime = CURRENT_TIME;
1014         if ((++mddev->sb->events_lo)==0)
1015                 ++mddev->sb->events_hi;
1016 
1017         if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
1018                 /*
1019                  * oops, this 64-bit counter should never wrap.
1020                  * Either we are in around ~1 trillion A.C., assuming
1021                  * 1 reboot per second, or we have a bug:
1022                  */
1023                 MD_BUG();
1024                 mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
1025         }
1026         sync_sbs(mddev);
1027 
1028         /*
1029          * do not write anything to disk if using
1030          * nonpersistent superblocks
1031          */
1032         if (mddev->sb->not_persistent)
1033                 return 0;
1034 
1035         printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
1036                                         mdidx(mddev));
1037 
1038         first = 1;
1039         err = 0;
1040         ITERATE_RDEV(mddev,rdev,tmp) {
1041                 if (!first) {
1042                         first = 0;
1043                         printk(", ");
1044                 }
1045                 if (rdev->faulty)
1046                         printk("(skipping faulty ");
1047                 printk("%s ", partition_name(rdev->dev));
1048                 if (!rdev->faulty) {
1049                         printk("[events: %08lx]",
1050                                 (unsigned long)rdev->sb->events_lo);
1051                         err += write_disk_sb(rdev);
1052                 } else
1053                         printk(")\n");
1054         }
1055         printk(".\n");
1056         if (err) {
1057                 printk("errors occured during superblock update, repeating\n");
1058                 if (--count)
1059                         goto repeat;
1060                 printk("excessive errors occured during superblock update, exiting\n");
1061         }
1062         return 0;
1063 }
1064 
1065 /*
1066  * Import a device. If 'on_disk', then sanity check the superblock
1067  *
1068  * mark the device faulty if:
1069  *
1070  *   - the device is nonexistent (zero size)
1071  *   - the device has no valid superblock
1072  *
1073  * a faulty rdev _never_ has rdev->sb set.
1074  */
1075 static int md_import_device (kdev_t newdev, int on_disk)
1076 {
1077         int err;
1078         mdk_rdev_t *rdev;
1079         unsigned int size;
1080 
1081         if (find_rdev_all(newdev))
1082                 return -EEXIST;
1083 
1084         rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1085         if (!rdev) {
1086                 printk("could not alloc mem for %s!\n", partition_name(newdev));
1087                 return -ENOMEM;
1088         }
1089         memset(rdev, 0, sizeof(*rdev));
1090 
1091         if (get_super(newdev)) {
1092                 printk("md: can not import %s, has active inodes!\n",
1093                         partition_name(newdev));
1094                 err = -EBUSY;
1095                 goto abort_free;
1096         }
1097 
1098         if ((err = alloc_disk_sb(rdev)))
1099                 goto abort_free;
1100 
1101         rdev->dev = newdev;
1102         if (lock_rdev(rdev)) {
1103                 printk("md: could not lock %s, zero-size? Marking faulty.\n",
1104                         partition_name(newdev));
1105                 err = -EINVAL;
1106                 goto abort_free;
1107         }
1108         rdev->desc_nr = -1;
1109         rdev->faulty = 0;
1110 
1111         size = 0;
1112         if (blk_size[MAJOR(newdev)])
1113                 size = blk_size[MAJOR(newdev)][MINOR(newdev)];
1114         if (!size) {
1115                 printk("md: %s has zero size, marking faulty!\n",
1116                                 partition_name(newdev));
1117                 err = -EINVAL;
1118                 goto abort_free;
1119         }
1120 
1121         if (on_disk) {
1122                 if ((err = read_disk_sb(rdev))) {
1123                         printk("md: could not read %s's sb, not importing!\n",
1124                                         partition_name(newdev));
1125                         goto abort_free;
1126                 }
1127                 if ((err = check_disk_sb(rdev))) {
1128                         printk("md: %s has invalid sb, not importing!\n",
1129                                         partition_name(newdev));
1130                         goto abort_free;
1131                 }
1132 
1133                 rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
1134                                         rdev->sb->this_disk.minor);
1135                 rdev->desc_nr = rdev->sb->this_disk.number;
1136         }
1137         md_list_add(&rdev->all, &all_raid_disks);
1138         MD_INIT_LIST_HEAD(&rdev->pending);
1139 
1140         if (rdev->faulty && rdev->sb)
1141                 free_disk_sb(rdev);
1142         return 0;
1143 
1144 abort_free:
1145         if (rdev->sb) {
1146                 if (rdev->bdev)
1147                         unlock_rdev(rdev);
1148                 free_disk_sb(rdev);
1149         }
1150         kfree(rdev);
1151         return err;
1152 }
1153 
1154 /*
1155  * Check a full RAID array for plausibility
1156  */
1157 
1158 #define INCONSISTENT KERN_ERR \
1159 "md: fatal superblock inconsistency in %s -- removing from array\n"
1160 
1161 #define OUT_OF_DATE KERN_ERR \
1162 "md: superblock update time inconsistency -- using the most recent one\n"
1163 
1164 #define OLD_VERSION KERN_ALERT \
1165 "md: md%d: unsupported raid array version %d.%d.%d\n"
1166 
1167 #define NOT_CLEAN_IGNORE KERN_ERR \
1168 "md: md%d: raid array is not clean -- starting background reconstruction\n"
1169 
1170 #define UNKNOWN_LEVEL KERN_ERR \
1171 "md: md%d: unsupported raid level %d\n"
1172 
1173 static int analyze_sbs (mddev_t * mddev)
1174 {
1175         int out_of_date = 0, i;
1176         struct md_list_head *tmp, *tmp2;
1177         mdk_rdev_t *rdev, *rdev2, *freshest;
1178         mdp_super_t *sb;
1179 
1180         /*
1181          * Verify the RAID superblock on each real device
1182          */
1183         ITERATE_RDEV(mddev,rdev,tmp) {
1184                 if (rdev->faulty) {
1185                         MD_BUG();
1186                         goto abort;
1187                 }
1188                 if (!rdev->sb) {
1189                         MD_BUG();
1190                         goto abort;
1191                 }
1192                 if (check_disk_sb(rdev))
1193                         goto abort;
1194         }
1195 
1196         /*
1197          * The superblock constant part has to be the same
1198          * for all disks in the array.
1199          */
1200         sb = NULL;
1201 
1202         ITERATE_RDEV(mddev,rdev,tmp) {
1203                 if (!sb) {
1204                         sb = rdev->sb;
1205                         continue;
1206                 }
1207                 if (!sb_equal(sb, rdev->sb)) {
1208                         printk (INCONSISTENT, partition_name(rdev->dev));
1209                         kick_rdev_from_array(rdev);
1210                         continue;
1211                 }
1212         }
1213 
1214         /*
1215          * OK, we have all disks and the array is ready to run. Let's
1216          * find the freshest superblock, that one will be the superblock
1217          * that represents the whole array.
1218          */
1219         if (!mddev->sb)
1220                 if (alloc_array_sb(mddev))
1221                         goto abort;
1222         sb = mddev->sb;
1223         freshest = NULL;
1224 
1225         ITERATE_RDEV(mddev,rdev,tmp) {
1226                 __u64 ev1, ev2;
1227                 /*
1228                  * if the checksum is invalid, use the superblock
1229                  * only as a last resort. (decrease it's age by
1230                  * one event)
1231                  */
1232                 if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
1233                         if (rdev->sb->events_lo || rdev->sb->events_hi)
1234                                 if ((rdev->sb->events_lo--)==0)
1235                                         rdev->sb->events_hi--;
1236                 }
1237 
1238                 printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
1239                         (unsigned long)rdev->sb->events_lo);
1240                 if (!freshest) {
1241                         freshest = rdev;
1242                         continue;
1243                 }
1244                 /*
1245                  * Find the newest superblock version
1246                  */
1247                 ev1 = md_event(rdev->sb);
1248                 ev2 = md_event(freshest->sb);
1249                 if (ev1 != ev2) {
1250                         out_of_date = 1;
1251                         if (ev1 > ev2)
1252                                 freshest = rdev;
1253                 }
1254         }
1255         if (out_of_date) {
1256                 printk(OUT_OF_DATE);
1257                 printk("freshest: %s\n", partition_name(freshest->dev));
1258         }
1259         memcpy (sb, freshest->sb, sizeof(*sb));
1260 
1261         /*
1262          * at this point we have picked the 'best' superblock
1263          * from all available superblocks.
1264          * now we validate this superblock and kick out possibly
1265          * failed disks.
1266          */
1267         ITERATE_RDEV(mddev,rdev,tmp) {
1268                 /*
1269                  * Kick all non-fresh devices faulty
1270                  */
1271                 __u64 ev1, ev2;
1272                 ev1 = md_event(rdev->sb);
1273                 ev2 = md_event(sb);
1274                 ++ev1;
1275                 if (ev1 < ev2) {
1276                         printk("md: kicking non-fresh %s from array!\n",
1277                                                 partition_name(rdev->dev));
1278                         kick_rdev_from_array(rdev);
1279                         continue;
1280                 }
1281         }
1282 
1283         /*
1284          * Fix up changed device names ... but only if this disk has a
1285          * recent update time. Use faulty checksum ones too.
1286          */
1287         ITERATE_RDEV(mddev,rdev,tmp) {
1288                 __u64 ev1, ev2, ev3;
1289                 if (rdev->faulty) { /* REMOVEME */
1290                         MD_BUG();
1291                         goto abort;
1292                 }
1293                 ev1 = md_event(rdev->sb);
1294                 ev2 = md_event(sb);
1295                 ev3 = ev2;
1296                 --ev3;
1297                 if ((rdev->dev != rdev->old_dev) &&
1298                     ((ev1 == ev2) || (ev1 == ev3))) {
1299                         mdp_disk_t *desc;
1300 
1301                         printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
1302                         if (rdev->desc_nr == -1) {
1303                                 MD_BUG();
1304                                 goto abort;
1305                         }
1306                         desc = &sb->disks[rdev->desc_nr];
1307                         if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
1308                                 MD_BUG();
1309                                 goto abort;
1310                         }
1311                         desc->major = MAJOR(rdev->dev);
1312                         desc->minor = MINOR(rdev->dev);
1313                         desc = &rdev->sb->this_disk;
1314                         desc->major = MAJOR(rdev->dev);
1315                         desc->minor = MINOR(rdev->dev);
1316                 }
1317         }
1318 
1319         /*
1320          * Remove unavailable and faulty devices ...
1321          *
1322          * note that if an array becomes completely unrunnable due to
1323          * missing devices, we do not write the superblock back, so the
1324          * administrator has a chance to fix things up. The removal thus
1325          * only happens if it's nonfatal to the contents of the array.
1326          */
1327         for (i = 0; i < MD_SB_DISKS; i++) {
1328                 int found;
1329                 mdp_disk_t *desc;
1330                 kdev_t dev;
1331 
1332                 desc = sb->disks + i;
1333                 dev = MKDEV(desc->major, desc->minor);
1334 
1335                 /*
1336                  * We kick faulty devices/descriptors immediately.
1337                  */
1338                 if (disk_faulty(desc)) {
1339                         found = 0;
1340                         ITERATE_RDEV(mddev,rdev,tmp) {
1341                                 if (rdev->desc_nr != desc->number)
1342                                         continue;
1343                                 printk("md%d: kicking faulty %s!\n",
1344                                         mdidx(mddev),partition_name(rdev->dev));
1345                                 kick_rdev_from_array(rdev);
1346                                 found = 1;
1347                                 break;
1348                         }
1349                         if (!found) {
1350                                 if (dev == MKDEV(0,0))
1351                                         continue;
1352                                 printk("md%d: removing former faulty %s!\n",
1353                                         mdidx(mddev), partition_name(dev));
1354                         }
1355                         remove_descriptor(desc, sb);
1356                         continue;
1357                 }
1358 
1359                 if (dev == MKDEV(0,0))
1360                         continue;
1361                 /*
1362                  * Is this device present in the rdev ring?
1363                  */
1364                 found = 0;
1365                 ITERATE_RDEV(mddev,rdev,tmp) {
1366                         if (rdev->desc_nr == desc->number) {
1367                                 found = 1;
1368                                 break;
1369                         }
1370                 }
1371                 if (found)
1372                         continue;
1373 
1374                 printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
1375                 remove_descriptor(desc, sb);
1376         }
1377 
1378         /*
1379          * Double check wether all devices mentioned in the
1380          * superblock are in the rdev ring.
1381          */
1382         for (i = 0; i < MD_SB_DISKS; i++) {
1383                 mdp_disk_t *desc;
1384                 kdev_t dev;
1385 
1386                 desc = sb->disks + i;
1387                 dev = MKDEV(desc->major, desc->minor);
1388 
1389                 if (dev == MKDEV(0,0))
1390                         continue;
1391 
1392                 if (disk_faulty(desc)) {
1393                         MD_BUG();
1394                         goto abort;
1395                 }
1396 
1397                 rdev = find_rdev(mddev, dev);
1398                 if (!rdev) {
1399                         MD_BUG();
1400                         goto abort;
1401                 }
1402         }
1403 
1404         /*
1405          * Do a final reality check.
1406          */
1407         ITERATE_RDEV(mddev,rdev,tmp) {
1408                 if (rdev->desc_nr == -1) {
1409                         MD_BUG();
1410                         goto abort;
1411                 }
1412                 /*
1413                  * is the desc_nr unique?
1414                  */
1415                 ITERATE_RDEV(mddev,rdev2,tmp2) {
1416                         if ((rdev2 != rdev) &&
1417                                         (rdev2->desc_nr == rdev->desc_nr)) {
1418                                 MD_BUG();
1419                                 goto abort;
1420                         }
1421                 }
1422                 /*
1423                  * is the device unique?
1424                  */
1425                 ITERATE_RDEV(mddev,rdev2,tmp2) {
1426                         if ((rdev2 != rdev) &&
1427                                         (rdev2->dev == rdev->dev)) {
1428                                 MD_BUG();
1429                                 goto abort;
1430                         }
1431                 }
1432         }
1433 
1434         /*
1435          * Check if we can support this RAID array
1436          */
1437         if (sb->major_version != MD_MAJOR_VERSION ||
1438                         sb->minor_version > MD_MINOR_VERSION) {
1439 
1440                 printk (OLD_VERSION, mdidx(mddev), sb->major_version,
1441                                 sb->minor_version, sb->patch_version);
1442                 goto abort;
1443         }
1444 
1445         if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
1446                         (sb->level == 4) || (sb->level == 5)))
1447                 printk (NOT_CLEAN_IGNORE, mdidx(mddev));
1448 
1449         return 0;
1450 abort:
1451         return 1;
1452 }
1453 
1454 #undef INCONSISTENT
1455 #undef OUT_OF_DATE
1456 #undef OLD_VERSION
1457 #undef OLD_LEVEL
1458 
1459 static int device_size_calculation (mddev_t * mddev)
1460 {
1461         int data_disks = 0, persistent;
1462         unsigned int readahead;
1463         mdp_super_t *sb = mddev->sb;
1464         struct md_list_head *tmp;
1465         mdk_rdev_t *rdev;
1466 
1467         /*
1468          * Do device size calculation. Bail out if too small.
1469          * (we have to do this after having validated chunk_size,
1470          * because device size has to be modulo chunk_size)
1471          */
1472         persistent = !mddev->sb->not_persistent;
1473         ITERATE_RDEV(mddev,rdev,tmp) {
1474                 if (rdev->faulty)
1475                         continue;
1476                 if (rdev->size) {
1477                         MD_BUG();
1478                         continue;
1479                 }
1480                 rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
1481                 if (rdev->size < sb->chunk_size / 1024) {
1482                         printk (KERN_WARNING
1483                                 "Dev %s smaller than chunk_size: %ldk < %dk\n",
1484                                 partition_name(rdev->dev),
1485                                 rdev->size, sb->chunk_size / 1024);
1486                         return -EINVAL;
1487                 }
1488         }
1489 
1490         switch (sb->level) {
1491                 case -3:
1492                         data_disks = 1;
1493                         break;
1494                 case -2:
1495                         data_disks = 1;
1496                         break;
1497                 case -1:
1498                         zoned_raid_size(mddev);
1499                         data_disks = 1;
1500                         break;
1501                 case 0:
1502                         zoned_raid_size(mddev);
1503                         data_disks = sb->raid_disks;
1504                         break;
1505                 case 1:
1506                         data_disks = 1;
1507                         break;
1508                 case 4:
1509                 case 5:
1510                         data_disks = sb->raid_disks-1;
1511                         break;
1512                 default:
1513                         printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
1514                         goto abort;
1515         }
1516         if (!md_size[mdidx(mddev)])
1517                 md_size[mdidx(mddev)] = sb->size * data_disks;
1518 
1519         readahead = MD_READAHEAD;
1520         if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
1521                 readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
1522                 if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
1523                         readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
1524         } else {
1525                 if (sb->level == -3)
1526                         readahead = 0;
1527         }
1528         md_maxreadahead[mdidx(mddev)] = readahead;
1529 
1530         printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
1531                 mdidx(mddev), readahead*(PAGE_SIZE/1024));
1532 
1533         printk(KERN_INFO
1534                 "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1535                         mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
1536         return 0;
1537 abort:
1538         return 1;
1539 }
1540 
1541 
1542 #define TOO_BIG_CHUNKSIZE KERN_ERR \
1543 "too big chunk_size: %d > %d\n"
1544 
1545 #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1546 "too small chunk_size: %d < %ld\n"
1547 
1548 #define BAD_CHUNKSIZE KERN_ERR \
1549 "no chunksize specified, see 'man raidtab'\n"
1550 
1551 static int do_md_run (mddev_t * mddev)
1552 {
1553         int pnum, err;
1554         int chunk_size;
1555         struct md_list_head *tmp;
1556         mdk_rdev_t *rdev;
1557 
1558 
1559         if (!mddev->nb_dev) {
1560                 MD_BUG();
1561                 return -EINVAL;
1562         }
1563 
1564         if (mddev->pers)
1565                 return -EBUSY;
1566 
1567         /*
1568          * Resize disks to align partitions size on a given
1569          * chunk size.
1570          */
1571         md_size[mdidx(mddev)] = 0;
1572 
1573         /*
1574          * Analyze all RAID superblock(s)
1575          */
1576         if (analyze_sbs(mddev)) {
1577                 MD_BUG();
1578                 return -EINVAL;
1579         }
1580 
1581         chunk_size = mddev->sb->chunk_size;
1582         pnum = level_to_pers(mddev->sb->level);
1583 
1584         mddev->param.chunk_size = chunk_size;
1585         mddev->param.personality = pnum;
1586 
1587         if (chunk_size > MAX_CHUNK_SIZE) {
1588                 printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
1589                 return -EINVAL;
1590         }
1591         /*
1592          * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1593          */
1594         if ( (1 << ffz(~chunk_size)) != chunk_size) {
1595                 MD_BUG();
1596                 return -EINVAL;
1597         }
1598         if (chunk_size < PAGE_SIZE) {
1599                 printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
1600                 return -EINVAL;
1601         }
1602 
1603         if (pnum >= MAX_PERSONALITY) {
1604                 MD_BUG();
1605                 return -EINVAL;
1606         }
1607 
1608         if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
1609                 /*
1610                  * 'default chunksize' in the old md code used to
1611                  * be PAGE_SIZE, baaad.
1612                  * we abort here to be on the safe side. We dont
1613                  * want to continue the bad practice.
1614                  */
1615                 printk(BAD_CHUNKSIZE);
1616                 return -EINVAL;
1617         }
1618 
1619         if (!pers[pnum])
1620         {
1621 #ifdef CONFIG_KMOD
1622                 char module_name[80];
1623                 sprintf (module_name, "md-personality-%d", pnum);
1624                 request_module (module_name);
1625                 if (!pers[pnum])
1626 #endif
1627                         return -EINVAL;
1628         }
1629 
1630         if (device_size_calculation(mddev))
1631                 return -EINVAL;
1632 
1633         /*
1634          * Drop all container device buffers, from now on
1635          * the only valid external interface is through the md
1636          * device.
1637          * Also find largest hardsector size
1638          */
1639         md_hardsect_sizes[mdidx(mddev)] = 512;
1640         ITERATE_RDEV(mddev,rdev,tmp) {
1641                 if (rdev->faulty)
1642                         continue;
1643                 fsync_dev(rdev->dev);
1644                 invalidate_buffers(rdev->dev);
1645                 if (get_hardsect_size(rdev->dev)
1646                     > md_hardsect_sizes[mdidx(mddev)]) 
1647                         md_hardsect_sizes[mdidx(mddev)] =
1648                                 get_hardsect_size(rdev->dev);
1649         }
1650         md_blocksizes[mdidx(mddev)] = 1024;
1651         if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
1652                 md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
1653         mddev->pers = pers[pnum];
1654 
1655         err = mddev->pers->run(mddev);
1656         if (err) {
1657                 printk("pers->run() failed ...\n");
1658                 mddev->pers = NULL;
1659                 return -EINVAL;
1660         }
1661 
1662         mddev->sb->state &= ~(1 << MD_SB_CLEAN);
1663         md_update_sb(mddev);
1664 
1665         /*
1666          * md_size has units of 1K blocks, which are
1667          * twice as large as sectors.
1668          */
1669         md_hd_struct[mdidx(mddev)].start_sect = 0;
1670         md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
1671 
1672         read_ahead[MD_MAJOR] = 1024;
1673         return (0);
1674 }
1675 
1676 #undef TOO_BIG_CHUNKSIZE
1677 #undef BAD_CHUNKSIZE
1678 
1679 #define OUT(x) do { err = (x); goto out; } while (0)
1680 
1681 static int restart_array (mddev_t *mddev)
1682 {
1683         int err = 0;
1684 
1685         /*
1686          * Complain if it has no devices
1687          */
1688         if (!mddev->nb_dev)
1689                 OUT(-ENXIO);
1690 
1691         if (mddev->pers) {
1692                 if (!mddev->ro)
1693                         OUT(-EBUSY);
1694 
1695                 mddev->ro = 0;
1696                 set_device_ro(mddev_to_kdev(mddev), 0);
1697 
1698                 printk (KERN_INFO
1699                         "md%d switched to read-write mode.\n", mdidx(mddev));
1700                 /*
1701                  * Kick recovery or resync if necessary
1702                  */
1703                 md_recover_arrays();
1704                 if (mddev->pers->restart_resync)
1705                         mddev->pers->restart_resync(mddev);
1706         } else
1707                 err = -EINVAL;
1708 
1709 out:
1710         return err;
1711 }
1712 
1713 #define STILL_MOUNTED KERN_WARNING \
1714 "md: md%d still mounted.\n"
1715 #define STILL_IN_USE \
1716 "md: md%d still in use.\n"
1717 
1718 static int do_md_stop (mddev_t * mddev, int ro)
1719 {
1720         int err = 0, resync_interrupted = 0;
1721         kdev_t dev = mddev_to_kdev(mddev);
1722 
1723         if (atomic_read(&mddev->active)>1) {
1724                 printk(STILL_IN_USE, mdidx(mddev));
1725                 OUT(-EBUSY);
1726         }
1727  
1728         /* this shouldn't be needed as above would have fired */
1729         if (!ro && get_super(dev)) {
1730                 printk (STILL_MOUNTED, mdidx(mddev));
1731                 OUT(-EBUSY);
1732         }
1733 
1734         if (mddev->pers) {
1735                 /*
1736                  * It is safe to call stop here, it only frees private
1737                  * data. Also, it tells us if a device is unstoppable
1738                  * (eg. resyncing is in progress)
1739                  */
1740                 if (mddev->pers->stop_resync)
1741                         if (mddev->pers->stop_resync(mddev))
1742                                 resync_interrupted = 1;
1743 
1744                 if (mddev->recovery_running)
1745                         md_interrupt_thread(md_recovery_thread);
1746 
1747                 /*
1748                  * This synchronizes with signal delivery to the
1749                  * resync or reconstruction thread. It also nicely
1750                  * hangs the process if some reconstruction has not
1751                  * finished.
1752                  */
1753                 down(&mddev->recovery_sem);
1754                 up(&mddev->recovery_sem);
1755 
1756                 /*
1757                  *  sync and invalidate buffers because we cannot kill the
1758                  *  main thread with valid IO transfers still around.
1759                  *  the kernel lock protects us from new requests being
1760                  *  added after invalidate_buffers().
1761                  */
1762                 fsync_dev (mddev_to_kdev(mddev));
1763                 fsync_dev (dev);
1764                 invalidate_buffers (dev);
1765 
1766                 if (ro) {
1767                         if (mddev->ro)
1768                                 OUT(-ENXIO);
1769                         mddev->ro = 1;
1770                 } else {
1771                         if (mddev->ro)
1772                                 set_device_ro(dev, 0);
1773                         if (mddev->pers->stop(mddev)) {
1774                                 if (mddev->ro)
1775                                         set_device_ro(dev, 1);
1776                                 OUT(-EBUSY);
1777                         }
1778                         if (mddev->ro)
1779                                 mddev->ro = 0;
1780                 }
1781                 if (mddev->sb) {
1782                         /*
1783                          * mark it clean only if there was no resync
1784                          * interrupted.
1785                          */
1786                         if (!mddev->recovery_running && !resync_interrupted) {
1787                                 printk("marking sb clean...\n");
1788                                 mddev->sb->state |= 1 << MD_SB_CLEAN;
1789                         }
1790                         md_update_sb(mddev);
1791                 }
1792                 if (ro)
1793                         set_device_ro(dev, 1);
1794         }
1795 
1796         /*
1797          * Free resources if final stop
1798          */
1799         if (!ro) {
1800                 printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
1801                 free_mddev(mddev);
1802 
1803         } else
1804                 printk (KERN_INFO
1805                         "md%d switched to read-only mode.\n", mdidx(mddev));
1806 out:
1807         return err;
1808 }
1809 
1810 #undef OUT
1811 
1812 /*
1813  * We have to safely support old arrays too.
1814  */
1815 int detect_old_array (mdp_super_t *sb)
1816 {
1817         if (sb->major_version > 0)
1818                 return 0;
1819         if (sb->minor_version >= 90)
1820                 return 0;
1821 
1822         return -EINVAL;
1823 }
1824 
1825 
1826 static void autorun_array (mddev_t *mddev)
1827 {
1828         mdk_rdev_t *rdev;
1829         struct md_list_head *tmp;
1830         int err;
1831 
1832         if (mddev->disks.prev == &mddev->disks) {
1833                 MD_BUG();
1834                 return;
1835         }
1836 
1837         printk("running: ");
1838 
1839         ITERATE_RDEV(mddev,rdev,tmp) {
1840                 printk("<%s>", partition_name(rdev->dev));
1841         }
1842         printk("\nnow!\n");
1843 
1844         err = do_md_run (mddev);
1845         if (err) {
1846                 printk("do_md_run() returned %d\n", err);
1847                 /*
1848                  * prevent the writeback of an unrunnable array
1849                  */
1850                 mddev->sb_dirty = 0;
1851                 do_md_stop (mddev, 0);
1852         }
1853 }
1854 
1855 /*
1856  * lets try to run arrays based on all disks that have arrived
1857  * until now. (those are in the ->pending list)
1858  *
1859  * the method: pick the first pending disk, collect all disks with
1860  * the same UUID, remove all from the pending list and put them into
1861  * the 'same_array' list. Then order this list based on superblock
1862  * update time (freshest comes first), kick out 'old' disks and
1863  * compare superblocks. If everything's fine then run it.
1864  *
1865  * If "unit" is allocated, then bump its reference count
1866  */
1867 static void autorun_devices (kdev_t countdev)
1868 {
1869         struct md_list_head candidates;
1870         struct md_list_head *tmp;
1871         mdk_rdev_t *rdev0, *rdev;
1872         mddev_t *mddev;
1873         kdev_t md_kdev;
1874 
1875 
1876         printk("autorun ...\n");
1877         while (pending_raid_disks.next != &pending_raid_disks) {
1878                 rdev0 = md_list_entry(pending_raid_disks.next,
1879                                          mdk_rdev_t, pending);
1880 
1881                 printk("considering %s ...\n", partition_name(rdev0->dev));
1882                 MD_INIT_LIST_HEAD(&candidates);
1883                 ITERATE_RDEV_PENDING(rdev,tmp) {
1884                         if (uuid_equal(rdev0, rdev)) {
1885                                 if (!sb_equal(rdev0->sb, rdev->sb)) {
1886                                         printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
1887                                         continue;
1888                                 }
1889                                 printk("  adding %s ...\n", partition_name(rdev->dev));
1890                                 md_list_del(&rdev->pending);
1891                                 md_list_add(&rdev->pending, &candidates);
1892                         }
1893                 }
1894                 /*
1895                  * now we have a set of devices, with all of them having
1896                  * mostly sane superblocks. It's time to allocate the
1897                  * mddev.
1898                  */
1899                 md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
1900                 mddev = kdev_to_mddev(md_kdev);
1901                 if (mddev) {
1902                         printk("md%d already running, cannot run %s\n",
1903                                  mdidx(mddev), partition_name(rdev0->dev));
1904                         ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
1905                                 export_rdev(rdev);
1906                         continue;
1907                 }
1908                 mddev = alloc_mddev(md_kdev);
1909                 if (mddev == NULL) {
1910                         printk("md: cannot allocate memory for md drive.\n");
1911                         break;
1912                 }
1913                 if (md_kdev == countdev)
1914                         atomic_inc(&mddev->active);
1915                 printk("created md%d\n", mdidx(mddev));
1916                 ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
1917                         bind_rdev_to_array(rdev, mddev);
1918                         md_list_del(&rdev->pending);
1919                         MD_INIT_LIST_HEAD(&rdev->pending);
1920                 }
1921                 autorun_array(mddev);
1922         }
1923         printk("... autorun DONE.\n");
1924 }
1925 
1926 /*
1927  * import RAID devices based on one partition
1928  * if possible, the array gets run as well.
1929  */
1930 
1931 #define BAD_VERSION KERN_ERR \
1932 "md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
1933 
1934 #define OUT_OF_MEM KERN_ALERT \
1935 "md: out of memory.\n"
1936 
1937 #define NO_DEVICE KERN_ERR \
1938 "md: disabled device %s\n"
1939 
1940 #define AUTOADD_FAILED KERN_ERR \
1941 "md: auto-adding devices to md%d FAILED (error %d).\n"
1942 
1943 #define AUTOADD_FAILED_USED KERN_ERR \
1944 "md: cannot auto-add device %s to md%d, already used.\n"
1945 
1946 #define AUTORUN_FAILED KERN_ERR \
1947 "md: auto-running md%d FAILED (error %d).\n"
1948 
1949 #define MDDEV_BUSY KERN_ERR \
1950 "md: cannot auto-add to md%d, already running.\n"
1951 
1952 #define AUTOADDING KERN_INFO \
1953 "md: auto-adding devices to md%d, based on %s's superblock.\n"
1954 
1955 #define AUTORUNNING KERN_INFO \
1956 "md: auto-running md%d.\n"
1957 
1958 static int autostart_array (kdev_t startdev, kdev_t countdev)
1959 {
1960         int err = -EINVAL, i;
1961         mdp_super_t *sb = NULL;
1962         mdk_rdev_t *start_rdev = NULL, *rdev;
1963 
1964         if (md_import_device(startdev, 1)) {
1965                 printk("could not import %s!\n", partition_name(startdev));
1966                 goto abort;
1967         }
1968 
1969         start_rdev = find_rdev_all(startdev);
1970         if (!start_rdev) {
1971                 MD_BUG();
1972                 goto abort;
1973         }
1974         if (start_rdev->faulty) {
1975                 printk("can not autostart based on faulty %s!\n",
1976                                                 partition_name(startdev));
1977                 goto abort;
1978         }
1979         md_list_add(&start_rdev->pending, &pending_raid_disks);
1980 
1981         sb = start_rdev->sb;
1982 
1983         err = detect_old_array(sb);
1984         if (err) {
1985                 printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
1986                 goto abort;
1987         }
1988 
1989         for (i = 0; i < MD_SB_DISKS; i++) {
1990                 mdp_disk_t *desc;
1991                 kdev_t dev;
1992 
1993                 desc = sb->disks + i;
1994                 dev = MKDEV(desc->major, desc->minor);
1995 
1996                 if (dev == MKDEV(0,0))
1997                         continue;
1998                 if (dev == startdev)
1999                         continue;
2000                 if (md_import_device(dev, 1)) {
2001                         printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
2002                         continue;
2003                 }
2004                 rdev = find_rdev_all(dev);
2005                 if (!rdev) {
2006                         MD_BUG();
2007                         goto abort;
2008                 }
2009                 md_list_add(&rdev->pending, &pending_raid_disks);
2010         }
2011 
2012         /*
2013          * possibly return codes
2014          */
2015         autorun_devices(countdev);
2016         return 0;
2017 
2018 abort:
2019         if (start_rdev)
2020                 export_rdev(start_rdev);
2021         return err;
2022 }
2023 
2024 #undef BAD_VERSION
2025 #undef OUT_OF_MEM
2026 #undef NO_DEVICE
2027 #undef AUTOADD_FAILED_USED
2028 #undef AUTOADD_FAILED
2029 #undef AUTORUN_FAILED
2030 #undef AUTOADDING
2031 #undef AUTORUNNING
2032 
2033 struct {
2034         int set;
2035         int noautodetect;
2036 
2037 } raid_setup_args md__initdata = { 0, 0 };
2038 
2039 void md_setup_drive(void) md__init;
2040 
/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 *
 * md_autodetect_dev() records a device for the later autodetection
 * pass in md_run_setup().  Devices are silently dropped once the table
 * is full or after md_run_setup() has set dev_cnt to -1.
 */
#ifdef CONFIG_AUTODETECT_RAID
static int detected_devices[128] md__initdata = { 0, };
static int dev_cnt=0;
void md_autodetect_dev(kdev_t dev)
{
	/* derive the bound from the table so that every slot is usable */
	const int max_devs = sizeof(detected_devices) /
			     sizeof(detected_devices[0]);

	if (dev_cnt >= 0 && dev_cnt < max_devs)
		detected_devices[dev_cnt++] = dev;
}
#endif
2054 
2055 int md__init md_run_setup(void)
2056 {
2057 #ifdef CONFIG_AUTODETECT_RAID
2058         mdk_rdev_t *rdev;
2059         int i;
2060 
2061         if (raid_setup_args.noautodetect)
2062                 printk(KERN_INFO "skipping autodetection of RAID arrays\n");
2063         else {
2064 
2065                 printk(KERN_INFO "autodetecting RAID arrays\n");
2066 
2067                 for (i=0; i<dev_cnt; i++) {
2068                         kdev_t dev = detected_devices[i];
2069 
2070                         if (md_import_device(dev,1)) {
2071                                 printk(KERN_ALERT "could not import %s!\n",
2072                                        partition_name(dev));
2073                                 continue;
2074                         }
2075                         /*
2076                          * Sanity checks:
2077                          */
2078                         rdev = find_rdev_all(dev);
2079                         if (!rdev) {
2080                                 MD_BUG();
2081                                 continue;
2082                         }
2083                         if (rdev->faulty) {
2084                                 MD_BUG();
2085                                 continue;
2086                         }
2087                         md_list_add(&rdev->pending, &pending_raid_disks);
2088                 }
2089 
2090                 autorun_devices(-1);
2091         }
2092 
2093         dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */
2094 #endif
2095 #ifdef CONFIG_MD_BOOT
2096         md_setup_drive();
2097 #endif
2098         return 0;
2099 }
2100 
2101 static int get_version (void * arg)
2102 {
2103         mdu_version_t ver;
2104 
2105         ver.major = MD_MAJOR_VERSION;
2106         ver.minor = MD_MINOR_VERSION;
2107         ver.patchlevel = MD_PATCHLEVEL_VERSION;
2108 
2109         if (md_copy_to_user(arg, &ver, sizeof(ver)))
2110                 return -EFAULT;
2111 
2112         return 0;
2113 }
2114 
2115 #define SET_FROM_SB(x) info.x = mddev->sb->x
2116 static int get_array_info (mddev_t * mddev, void * arg)
2117 {
2118         mdu_array_info_t info;
2119 
2120         if (!mddev->sb)
2121                 return -EINVAL;
2122 
2123         SET_FROM_SB(major_version);
2124         SET_FROM_SB(minor_version);
2125         SET_FROM_SB(patch_version);
2126         SET_FROM_SB(ctime);
2127         SET_FROM_SB(level);
2128         SET_FROM_SB(size);
2129         SET_FROM_SB(nr_disks);
2130         SET_FROM_SB(raid_disks);
2131         SET_FROM_SB(md_minor);
2132         SET_FROM_SB(not_persistent);
2133 
2134         SET_FROM_SB(utime);
2135         SET_FROM_SB(state);
2136         SET_FROM_SB(active_disks);
2137         SET_FROM_SB(working_disks);
2138         SET_FROM_SB(failed_disks);
2139         SET_FROM_SB(spare_disks);
2140 
2141         SET_FROM_SB(layout);
2142         SET_FROM_SB(chunk_size);
2143 
2144         if (md_copy_to_user(arg, &info, sizeof(info)))
2145                 return -EFAULT;
2146 
2147         return 0;
2148 }
2149 #undef SET_FROM_SB
2150 
2151 #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2152 static int get_disk_info (mddev_t * mddev, void * arg)
2153 {
2154         mdu_disk_info_t info;
2155         unsigned int nr;
2156 
2157         if (!mddev->sb)
2158                 return -EINVAL;
2159 
2160         if (md_copy_from_user(&info, arg, sizeof(info)))
2161                 return -EFAULT;
2162 
2163         nr = info.number;
2164         if (nr >= mddev->sb->nr_disks)
2165                 return -EINVAL;
2166 
2167         SET_FROM_SB(major);
2168         SET_FROM_SB(minor);
2169         SET_FROM_SB(raid_disk);
2170         SET_FROM_SB(state);
2171 
2172         if (md_copy_to_user(arg, &info, sizeof(info)))
2173                 return -EFAULT;
2174 
2175         return 0;
2176 }
2177 #undef SET_FROM_SB
2178 
2179 #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2180 
2181 static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info)
2182 {
2183         int err, size, persistent;
2184         mdk_rdev_t *rdev;
2185         unsigned int nr;
2186         kdev_t dev;
2187         dev = MKDEV(info->major,info->minor);
2188 
2189         if (find_rdev_all(dev)) {
2190                 printk("device %s already used in a RAID array!\n",
2191                                 partition_name(dev));
2192                 return -EBUSY;
2193         }
2194         if (!mddev->sb) {
2195                 /* expecting a device which has a superblock */
2196                 err = md_import_device(dev, 1);
2197                 if (err) {
2198                         printk("md error, md_import_device returned %d\n", err);
2199                         return -EINVAL;
2200                 }
2201                 rdev = find_rdev_all(dev);
2202                 if (!rdev) {
2203                         MD_BUG();
2204                         return -EINVAL;
2205                 }
2206                 if (mddev->nb_dev) {
2207                         mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
2208                                                           mdk_rdev_t, same_set);
2209                         if (!uuid_equal(rdev0, rdev)) {
2210                                 printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2211                                 export_rdev(rdev);
2212                                 return -EINVAL;
2213                         }
2214                         if (!sb_equal(rdev0->sb, rdev->sb)) {
2215                                 printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2216                                 export_rdev(rdev);
2217                                 return -EINVAL;
2218                         }
2219                 }
2220                 bind_rdev_to_array(rdev, mddev);
2221                 return 0;
2222         }
2223 
2224         nr = info->number;
2225         if (nr >= mddev->sb->nr_disks)
2226                 return -EINVAL;
2227 
2228         SET_SB(number);
2229         SET_SB(major);
2230         SET_SB(minor);
2231         SET_SB(raid_disk);
2232         SET_SB(state);
2233 
2234         if ((info->state & (1<<MD_DISK_FAULTY))==0) {
2235                 err = md_import_device (dev, 0);
2236                 if (err) {
2237                         printk("md: error, md_import_device() returned %d\n", err);
2238                         return -EINVAL;
2239                 }
2240                 rdev = find_rdev_all(dev);
2241                 if (!rdev) {
2242                         MD_BUG();
2243                         return -EINVAL;
2244                 }
2245 
2246                 rdev->old_dev = dev;
2247                 rdev->desc_nr = info->number;
2248 
2249                 bind_rdev_to_array(rdev, mddev);
2250 
2251                 persistent = !mddev->sb->not_persistent;
2252                 if (!persistent)
2253                         printk("nonpersistent superblock ...\n");
2254                 if (!mddev->sb->chunk_size)
2255                         printk("no chunksize?\n");
2256 
2257                 size = calc_dev_size(dev, mddev, persistent);
2258                 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2259 
2260                 if (!mddev->sb->size || (mddev->sb->size > size))
2261                         mddev->sb->size = size;
2262         }
2263 
2264         /*
2265          * sync all other superblocks with the main superblock
2266          */
2267         sync_sbs(mddev);
2268 
2269         return 0;
2270 }
2271 #undef SET_SB
2272 
2273 static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
2274 {
2275         int err;
2276         mdk_rdev_t *rdev;
2277         mdp_disk_t *disk;
2278 
2279         if (!mddev->pers)
2280                 return -ENODEV;
2281 
2282         printk("trying to remove %s from md%d ... \n",
2283                 partition_name(dev), mdidx(mddev));
2284 
2285         if (!mddev->pers->diskop) {
2286                 printk("md%d: personality does not support diskops!\n",
2287                                                                  mdidx(mddev));
2288                 return -EINVAL;
2289         }
2290 
2291         rdev = find_rdev(mddev, dev);
2292         if (!rdev)
2293                 return -ENXIO;
2294 
2295         if (rdev->desc_nr == -1) {
2296                 MD_BUG();
2297                 return -EINVAL;
2298         }
2299         disk = &mddev->sb->disks[rdev->desc_nr];
2300         if (disk_active(disk))
2301                 goto busy;
2302         if (disk_removed(disk)) {
2303                 MD_BUG();
2304                 return -EINVAL;
2305         }
2306         
2307         err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
2308         if (err == -EBUSY)
2309                 goto busy;
2310         if (err) {
2311                 MD_BUG();
2312                 return -EINVAL;
2313         }
2314 
2315         remove_descriptor(disk, mddev->sb);
2316         kick_rdev_from_array(rdev);
2317         mddev->sb_dirty = 1;
2318         md_update_sb(mddev);
2319 
2320         return 0;
2321 busy:
2322         printk("cannot remove active disk %s from md%d ... \n",
2323                 partition_name(dev), mdidx(mddev));
2324         return -EBUSY;
2325 }
2326 
2327 static int hot_add_disk (mddev_t * mddev, kdev_t dev)
2328 {
2329         int i, err, persistent;
2330         unsigned int size;
2331         mdk_rdev_t *rdev;
2332         mdp_disk_t *disk;
2333 
2334         if (!mddev->pers)
2335                 return -ENODEV;
2336 
2337         printk("trying to hot-add %s to md%d ... \n",
2338                 partition_name(dev), mdidx(mddev));
2339 
2340         if (!mddev->pers->diskop) {
2341                 printk("md%d: personality does not support diskops!\n",
2342                                                                  mdidx(mddev));
2343                 return -EINVAL;
2344         }
2345 
2346         persistent = !mddev->sb->not_persistent;
2347         size = calc_dev_size(dev, mddev, persistent);
2348 
2349         if (size < mddev->sb->size) {
2350                 printk("md%d: disk size %d blocks < array size %d\n",
2351                                 mdidx(mddev), size, mddev->sb->size);
2352                 return -ENOSPC;
2353         }
2354 
2355         rdev = find_rdev(mddev, dev);
2356         if (rdev)
2357                 return -EBUSY;
2358 
2359         err = md_import_device (dev, 0);
2360         if (err) {
2361                 printk("md: error, md_import_device() returned %d\n", err);
2362                 return -EINVAL;
2363         }
2364         rdev = find_rdev_all(dev);
2365         if (!rdev) {
2366                 MD_BUG();
2367                 return -EINVAL;
2368         }
2369         if (rdev->faulty) {
2370                 printk("md: can not hot-add faulty %s disk to md%d!\n",
2371                                 partition_name(dev), mdidx(mddev));
2372                 err = -EINVAL;
2373                 goto abort_export;
2374         }
2375         bind_rdev_to_array(rdev, mddev);
2376 
2377         /*
2378          * The rest should better be atomic, we can have disk failures
2379          * noticed in interrupt contexts ...
2380          */
2381         rdev->old_dev = dev;
2382         rdev->size = size;
2383         rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2384 
2385         disk = mddev->sb->disks + mddev->sb->raid_disks;
2386         for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
2387                 disk = mddev->sb->disks + i;
2388 
2389                 if (!disk->major && !disk->minor)
2390                         break;
2391                 if (disk_removed(disk))
2392                         break;
2393         }
2394         if (i == MD_SB_DISKS) {
2395                 printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
2396                 err = -EBUSY;
2397                 goto abort_unbind_export;
2398         }
2399 
2400         if (disk_removed(disk)) {
2401                 /*
2402                  * reuse slot
2403                  */
2404                 if (disk->number != i) {
2405                         MD_BUG();
2406                         err = -EINVAL;
2407                         goto abort_unbind_export;
2408                 }
2409         } else {
2410                 disk->number = i;
2411         }
2412 
2413         disk->raid_disk = disk->number;
2414         disk->major = MAJOR(dev);
2415         disk->minor = MINOR(dev);
2416 
2417         if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
2418                 MD_BUG();
2419                 err = -EINVAL;
2420                 goto abort_unbind_export;
2421         }
2422 
2423         mark_disk_spare(disk);
2424         mddev->sb->nr_disks++;
2425         mddev->sb->spare_disks++;
2426         mddev->sb->working_disks++;
2427 
2428         mddev->sb_dirty = 1;
2429 
2430         md_update_sb(mddev);
2431 
2432         /*
2433          * Kick recovery, maybe this spare has to be added to the
2434          * array immediately.
2435          */
2436         md_recover_arrays();
2437 
2438         return 0;
2439 
2440 abort_unbind_export:
2441         unbind_rdev_from_array(rdev);
2442 
2443 abort_export:
2444         export_rdev(rdev);
2445         return err;
2446 }
2447 
2448 #define SET_SB(x) mddev->sb->x = info->x
2449 static int set_array_info (mddev_t * mddev, mdu_array_info_t *info)
2450 {
2451 
2452         if (alloc_array_sb(mddev))
2453                 return -ENOMEM;
2454 
2455         mddev->sb->major_version = MD_MAJOR_VERSION;
2456         mddev->sb->minor_version = MD_MINOR_VERSION;
2457         mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2458         mddev->sb->ctime = CURRENT_TIME;
2459 
2460         SET_SB(level);
2461         SET_SB(size);
2462         SET_SB(nr_disks);
2463         SET_SB(raid_disks);
2464         SET_SB(md_minor);
2465         SET_SB(not_persistent);
2466 
2467         SET_SB(state);
2468         SET_SB(active_disks);
2469         SET_SB(working_disks);
2470         SET_SB(failed_disks);
2471         SET_SB(spare_disks);
2472 
2473         SET_SB(layout);
2474         SET_SB(chunk_size);
2475 
2476         mddev->sb->md_magic = MD_SB_MAGIC;
2477 
2478         /*
2479          * Generate a 128 bit UUID
2480          */
2481         get_random_bytes(&mddev->sb->set_uuid0, 4);
2482         get_random_bytes(&mddev->sb->set_uuid1, 4);
2483         get_random_bytes(&mddev->sb->set_uuid2, 4);
2484         get_random_bytes(&mddev->sb->set_uuid3, 4);
2485 
2486         return 0;
2487 }
2488 #undef SET_SB
2489 
2490 static int set_disk_info (mddev_t * mddev, void * arg)
2491 {
2492         printk("not yet");
2493         return -EINVAL;
2494 }
2495 
2496 static int clear_array (mddev_t * mddev)
2497 {
2498         printk("not yet");
2499         return -EINVAL;
2500 }
2501 
2502 static int write_raid_info (mddev_t * mddev)
2503 {
2504         printk("not yet");
2505         return -EINVAL;
2506 }
2507 
2508 static int protect_array (mddev_t * mddev)
2509 {
2510         printk("not yet");
2511         return -EINVAL;
2512 }
2513 
2514 static int unprotect_array (mddev_t * mddev)
2515 {
2516         printk("not yet");
2517         return -EINVAL;
2518 }
2519 
2520 static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
2521 {
2522         int ret;
2523 
2524         fsync_dev(mddev_to_kdev(mddev));
2525         ret = md_error(mddev_to_kdev(mddev), dev);
2526         return ret;
2527 }
2528 
2529 static int md_ioctl (struct inode *inode, struct file *file,
2530                         unsigned int cmd, unsigned long arg)
2531 {
2532         unsigned int minor;
2533         int err = 0;
2534         struct hd_geometry *loc = (struct hd_geometry *) arg;
2535         mddev_t *mddev = NULL;
2536         kdev_t dev;
2537 
2538         if (!md_capable_admin())
2539                 return -EACCES;
2540 
2541         dev = inode->i_rdev;
2542         minor = MINOR(dev);
2543         if (minor >= MAX_MD_DEVS)
2544                 return -EINVAL;
2545 
2546         /*
2547          * Commands dealing with the RAID driver but not any
2548          * particular array:
2549          */
2550         switch (cmd)
2551         {
2552                 case RAID_VERSION:
2553                         err = get_version((void *)arg);
2554                         goto done;
2555 
2556                 case PRINT_RAID_DEBUG:
2557                         err = 0;
2558                         md_print_devices();
2559                         goto done_unlock;
2560 
2561                 case BLKGETSIZE:   /* Return device size */
2562                         if (!arg) {
2563                                 err = -EINVAL;
2564                                 goto abort;
2565                         }
2566                         err = md_put_user(md_hd_struct[minor].nr_sects,
2567                                                 (long *) arg);
2568                         goto done;
2569 
2570                 case BLKFLSBUF:
2571                         fsync_dev(dev);
2572                         invalidate_buffers(dev);
2573                         goto done;
2574 
2575                 case BLKRASET:
2576                         if (arg > 0xff) {
2577                                 err = -EINVAL;
2578                                 goto abort;
2579                         }
2580                         read_ahead[MAJOR(dev)] = arg;
2581                         goto done;
2582 
2583                 case BLKRAGET:
2584                         if (!arg) {
2585                                 err = -EINVAL;
2586                                 goto abort;
2587                         }
2588                         err = md_put_user (read_ahead[
2589                                 MAJOR(dev)], (long *) arg);
2590                         goto done;
2591                 default:
2592         }
2593 
2594         /*
2595          * Commands creating/starting a new array:
2596          */
2597 
2598         mddev = kdev_to_mddev(dev);
2599 
2600         switch (cmd)
2601         {
2602                 case SET_ARRAY_INFO:
2603                 case START_ARRAY:
2604                         if (mddev) {
2605                                 printk("array md%d already exists!\n",
2606                                                                 mdidx(mddev));
2607                                 err = -EEXIST;
2608                                 goto abort;
2609                         }
2610                 default:
2611         }
2612         switch (cmd)
2613         {
2614                 case SET_ARRAY_INFO:
2615                         mddev = alloc_mddev(dev);
2616                         if (!mddev) {
2617                                 err = -ENOMEM;
2618                                 goto abort;
2619                         }
2620                         atomic_inc(&mddev->active);
2621 
2622                         /*
2623                          * alloc_mddev() should possibly self-lock.
2624                          */
2625                         err = lock_mddev(mddev);
2626                         if (err) {
2627                                 printk("ioctl, reason %d, cmd %d\n", err, cmd);
2628                                 goto abort;
2629                         }
2630 
2631                         if (mddev->sb) {
2632                                 printk("array md%d already has a superblock!\n",
2633                                        mdidx(mddev));
2634                                 err = -EBUSY;
2635                                 goto abort_unlock;
2636                         }
2637                         if (arg) {
2638                                 mdu_array_info_t info;
2639                                 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2640                                         err = -EFAULT;
2641                                         goto abort_unlock;
2642                                 }
2643                                 err = set_array_info(mddev, &info);
2644                                 if (err) {
2645                                         printk("couldnt set array info. %d\n", err);
2646                                         goto abort_unlock;
2647                                 }
2648                         }
2649                         goto done_unlock;
2650 
2651                 case START_ARRAY:
2652                         /*
2653                          * possibly make it lock the array ...
2654                          */
2655                         err = autostart_array((kdev_t)arg, dev);
2656                         if (err) {
2657                                 printk("autostart %s failed!\n",
2658                                         partition_name((kdev_t)arg));
2659                                 goto abort;
2660                         }
2661                         goto done;
2662 
2663                 default:
2664         }
2665 
2666         /*
2667          * Commands querying/configuring an existing array:
2668          */
2669 
2670         if (!mddev) {
2671                 err = -ENODEV;
2672                 goto abort;
2673         }
2674         err = lock_mddev(mddev);
2675         if (err) {
2676                 printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2677                 goto abort;
2678         }
2679         /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2680         if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2681                 err = -ENODEV;
2682                 goto abort_unlock;
2683         }
2684 
2685         /*
2686          * Commands even a read-only array can execute:
2687          */
2688         switch (cmd)
2689         {
2690                 case GET_ARRAY_INFO:
2691                         err = get_array_info(mddev, (void *)arg);
2692                         goto done_unlock;
2693 
2694                 case GET_DISK_INFO:
2695                         err = get_disk_info(mddev, (void *)arg);
2696                         goto done_unlock;
2697 
2698                 case RESTART_ARRAY_RW:
2699                         err = restart_array(mddev);
2700                         goto done_unlock;
2701 
2702                 case STOP_ARRAY:
2703                         if (!(err = do_md_stop (mddev, 0)))
2704                                 mddev = NULL;
2705                         goto done_unlock;
2706 
2707                 case STOP_ARRAY_RO:
2708                         err = do_md_stop (mddev, 1);
2709                         goto done_unlock;
2710 
2711         /*
2712          * We have a problem here : there is no easy way to give a CHS
2713          * virtual geometry. We currently pretend that we have a 2 heads
2714          * 4 sectors (with a BIG number of cylinders...). This drives
2715          * dosfs just mad... ;-)
2716          */
2717                 case HDIO_GETGEO:
2718                         if (!loc) {
2719                                 err = -EINVAL;
2720                                 goto abort_unlock;
2721                         }
2722                         err = md_put_user (2, (char *) &loc->heads);
2723                         if (err)
2724                                 goto abort_unlock;
2725                         err = md_put_user (4, (char *) &loc->sectors);
2726                         if (err)
2727                                 goto abort_unlock;
2728                         err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2729                                                 (short *) &loc->cylinders);
2730                         if (err)
2731                                 goto abort_unlock;
2732                         err = md_put_user (md_hd_struct[minor].start_sect,
2733                                                 (long *) &loc->start);
2734                         goto done_unlock;
2735         }
2736 
2737         /*
2738          * The remaining ioctls are changing the state of the
2739          * superblock, so we do not allow read-only arrays
2740          * here:
2741          */
2742         if (mddev->ro) {
2743                 err = -EROFS;
2744                 goto abort_unlock;
2745         }
2746 
2747         switch (cmd)
2748         {
2749                 case CLEAR_ARRAY:
2750                         err = clear_array(mddev);
2751                         goto done_unlock;
2752 
2753                 case ADD_NEW_DISK:
2754                 {
2755                         mdu_disk_info_t info;
2756                         if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2757                                 err = -EFAULT;
2758                         else
2759                                 err = add_new_disk(mddev, &info);
2760                         goto done_unlock;
2761                 }
2762                 case HOT_REMOVE_DISK:
2763                         err = hot_remove_disk(mddev, (kdev_t)arg);
2764                         goto done_unlock;
2765 
2766                 case HOT_ADD_DISK:
2767                         err = hot_add_disk(mddev, (kdev_t)arg);
2768                         goto done_unlock;
2769 
2770                 case SET_DISK_INFO:
2771                         err = set_disk_info(mddev, (void *)arg);
2772                         goto done_unlock;
2773 
2774                 case WRITE_RAID_INFO:
2775                         err = write_raid_info(mddev);
2776                         goto done_unlock;
2777 
2778                 case UNPROTECT_ARRAY:
2779                         err = unprotect_array(mddev);
2780                         goto done_unlock;
2781 
2782                 case PROTECT_ARRAY:
2783                         err = protect_array(mddev);
2784                         goto done_unlock;
2785 
2786                 case SET_DISK_FAULTY:
2787                         err = set_disk_faulty(mddev, (kdev_t)arg);
2788                         goto done_unlock;
2789 
2790                 case RUN_ARRAY:
2791                 {
2792 /* The data is never used....
2793                         mdu_param_t param;
2794                         err = md_copy_from_user(&param, (mdu_param_t *)arg,
2795                                                          sizeof(param));
2796                         if (err)
2797                                 goto abort_unlock;
2798 */
2799                         err = do_md_run (mddev);
2800                         /*
2801                          * we have to clean up the mess if
2802                          * the array cannot be run for some
2803                          * reason ...
2804                          */
2805                         if (err) {
2806                                 mddev->sb_dirty = 0;
2807                                 if (!do_md_stop (mddev, 0))
2808                                         mddev = NULL;
2809                         }
2810                         goto done_unlock;
2811                 }
2812 
2813                 default:
2814                         printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
2815                         err = -EINVAL;
2816                         goto abort_unlock;
2817         }
2818 
2819 done_unlock:
2820 abort_unlock:
2821         if (mddev)
2822                 unlock_mddev(mddev);
2823 
2824         return err;
2825 done:
2826         if (err)
2827                 printk("huh12?\n");
2828 abort:
2829         return err;
2830 }
2831 
2832 static int md_open (struct inode *inode, struct file *file)
2833 {
2834         /*
2835          * Always succeed, but increment the usage count
2836          */
2837         mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2838         if (mddev)
2839                 atomic_inc(&mddev->active);
2840         return (0);
2841 }
2842 
2843 static int md_release (struct inode *inode, struct file * file)
2844 {
2845         mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2846         if (mddev)
2847                 atomic_dec(&mddev->active);
2848         return 0;
2849 }
2850 
2851 static struct block_device_operations md_fops=
2852 {
2853         open:           md_open,
2854         release:        md_release,
2855         ioctl:          md_ioctl,
2856 };
2857 
2858 
2859 int md_thread(void * arg)
2860 {
2861         mdk_thread_t *thread = arg;
2862 
2863         md_lock_kernel();
2864 
2865         /*
2866          * Detach thread
2867          */
2868 
2869         daemonize();
2870 
2871         sprintf(current->comm, thread->name);
2872         md_init_signals();
2873         md_flush_signals();
2874         thread->tsk = current;
2875 
2876         /*
2877          * md_thread is a 'system-thread', it's priority should be very
2878          * high. We avoid resource deadlocks individually in each
2879          * raid personality. (RAID5 does preallocation) We also use RR and
2880          * the very same RT priority as kswapd, thus we will never get
2881          * into a priority inversion deadlock.
2882          *
2883          * we definitely have to have equal or higher priority than
2884          * bdflush, otherwise bdflush will deadlock if there are too
2885          * many dirty RAID5 blocks.
2886          */
2887         current->policy = SCHED_OTHER;
2888         current->nice = -20;
2889 //      md_unlock_kernel();
2890 
2891         up(thread->sem);
2892 
2893         for (;;) {
2894                 DECLARE_WAITQUEUE(wait, current);
2895 
2896                 add_wait_queue(&thread->wqueue, &wait);
2897                 set_task_state(current, TASK_INTERRUPTIBLE);
2898                 if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2899                         dprintk("thread %p went to sleep.\n", thread);
2900                         schedule();
2901                         dprintk("thread %p woke up.\n", thread);
2902                 }
2903                 current->state = TASK_RUNNING;
2904                 remove_wait_queue(&thread->wqueue, &wait);
2905                 clear_bit(THREAD_WAKEUP, &thread->flags);
2906 
2907                 if (thread->run) {
2908                         thread->run(thread->data);
2909                         run_task_queue(&tq_disk);
2910                 } else
2911                         break;
2912                 if (md_signal_pending(current)) {
2913                         printk("%8s(%d) flushing signals.\n", current->comm,
2914                                 current->pid);
2915                         md_flush_signals();
2916                 }
2917         }
2918         up(thread->sem);
2919         return 0;
2920 }
2921 
2922 void md_wakeup_thread(mdk_thread_t *thread)
2923 {
2924         dprintk("waking up MD thread %p.\n", thread);
2925         set_bit(THREAD_WAKEUP, &thread->flags);
2926         wake_up(&thread->wqueue);
2927 }
2928 
2929 mdk_thread_t *md_register_thread (void (*run) (void *),
2930                                                 void *data, const char *name)
2931 {
2932         mdk_thread_t *thread;
2933         int ret;
2934         DECLARE_MUTEX_LOCKED(sem);
2935         
2936         thread = (mdk_thread_t *) kmalloc
2937                                 (sizeof(mdk_thread_t), GFP_KERNEL);
2938         if (!thread)
2939                 return NULL;
2940         
2941         memset(thread, 0, sizeof(mdk_thread_t));
2942         md_init_waitqueue_head(&thread->wqueue);
2943         
2944         thread->sem = &sem;
2945         thread->run = run;
2946         thread->data = data;
2947         thread->name = name;
2948         ret = kernel_thread(md_thread, thread, 0);
2949         if (ret < 0) {
2950                 kfree(thread);
2951                 return NULL;
2952         }
2953         down(&sem);
2954         return thread;
2955 }
2956 
2957 void md_interrupt_thread (mdk_thread_t *thread)
2958 {
2959         if (!thread->tsk) {
2960                 MD_BUG();
2961                 return;
2962         }
2963         printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2964         send_sig(SIGKILL, thread->tsk, 1);
2965 }
2966 
2967 void md_unregister_thread (mdk_thread_t *thread)
2968 {
2969         DECLARE_MUTEX_LOCKED(sem);
2970         
2971         thread->sem = &sem;
2972         thread->run = NULL;
2973         thread->name = NULL;
2974         if (!thread->tsk) {
2975                 MD_BUG();
2976                 return;
2977         }
2978         md_interrupt_thread(thread);
2979         down(&sem);
2980 }
2981 
2982 void md_recover_arrays (void)
2983 {
2984         if (!md_recovery_thread) {
2985                 MD_BUG();
2986                 return;
2987         }
2988         md_wakeup_thread(md_recovery_thread);
2989 }
2990 
2991 
2992 int md_error (kdev_t dev, kdev_t rdev)
2993 {
2994         mddev_t *mddev;
2995         mdk_rdev_t * rrdev;
2996         int rc;
2997 
2998         mddev = kdev_to_mddev(dev);
2999 /*      printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
3000  */
3001         if (!mddev) {
3002                 MD_BUG();
3003                 return 0;
3004         }
3005         rrdev = find_rdev(mddev, rdev);
3006         mark_rdev_faulty(rrdev);
3007         /*
3008          * if recovery was running, stop it now.
3009          */
3010         if (mddev->pers->stop_resync)
3011                 mddev->pers->stop_resync(mddev);
3012         if (mddev->recovery_running)
3013                 md_interrupt_thread(md_recovery_thread);
3014         if (mddev->pers->error_handler) {
3015                 rc = mddev->pers->error_handler(mddev, rdev);
3016                 md_recover_arrays();
3017                 return rc;
3018         }
3019         return 0;
3020 }
3021 
3022 static int status_unused (char * page)
3023 {
3024         int sz = 0, i = 0;
3025         mdk_rdev_t *rdev;
3026         struct md_list_head *tmp;
3027 
3028         sz += sprintf(page + sz, "unused devices: ");
3029 
3030         ITERATE_RDEV_ALL(rdev,tmp) {
3031                 if (!rdev->same_set.next && !rdev->same_set.prev) {
3032                         /*
3033                          * The device is not yet used by any array.
3034                          */
3035                         i++;
3036                         sz += sprintf(page + sz, "%s ",
3037                                 partition_name(rdev->dev));
3038                 }
3039         }
3040         if (!i)
3041                 sz += sprintf(page + sz, "<none>");
3042 
3043         sz += sprintf(page + sz, "\n");
3044         return sz;
3045 }
3046 
3047 
3048 static int status_resync (char * page, mddev_t * mddev)
3049 {
3050         int sz = 0;
3051         unsigned long max_blocks, resync, res, dt, db, rt;
3052 
3053         resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
3054         max_blocks = mddev->sb->size;
3055 
3056         /*
3057          * Should not happen.
3058          */             
3059         if (!max_blocks) {
3060                 MD_BUG();
3061                 return 0;
3062         }
3063         res = (resync/1024)*1000/(max_blocks/1024 + 1);
3064         {
3065                 int i, x = res/50, y = 20-x;
3066                 sz += sprintf(page + sz, "[");
3067                 for (i = 0; i < x; i++)
3068                         sz += sprintf(page + sz, "=");
3069                 sz += sprintf(page + sz, ">");
3070                 for (i = 0; i < y; i++)
3071                         sz += sprintf(page + sz, ".");
3072                 sz += sprintf(page + sz, "] ");
3073         }
3074         if (!mddev->recovery_running)
3075                 /*
3076                  * true resync
3077                  */
3078                 sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
3079                                 res/10, res % 10, resync, max_blocks);
3080         else
3081                 /*
3082                  * recovery ...
3083                  */
3084                 sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
3085                                 res/10, res % 10, resync, max_blocks);
3086 
3087         /*
3088          * We do not want to overflow, so the order of operands and
3089          * the * 100 / 100 trick are important. We do a +1 to be
3090          * safe against division by zero. We only estimate anyway.
3091          *
3092          * dt: time from mark until now
3093          * db: blocks written from mark until now
3094          * rt: remaining time
3095          */
3096         dt = ((jiffies - mddev->resync_mark) / HZ);
3097         if (!dt) dt++;
3098         db = resync - mddev->resync_mark_cnt;
3099         rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3100         
3101         sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3102 
3103         sz += sprintf(page + sz, " speed=%ldK/sec", db/dt);
3104 
3105         return sz;
3106 }
3107 
3108 static int md_status_read_proc(char *page, char **start, off_t off,
3109                         int count, int *eof, void *data)
3110 {
3111         int sz = 0, j, size;
3112         struct md_list_head *tmp, *tmp2;
3113         mdk_rdev_t *rdev;
3114         mddev_t *mddev;
3115 
3116         sz += sprintf(page + sz, "Personalities : ");
3117         for (j = 0; j < MAX_PERSONALITY; j++)
3118         if (pers[j])
3119                 sz += sprintf(page+sz, "[%s] ", pers[j]->name);
3120 
3121         sz += sprintf(page+sz, "\n");
3122 
3123 
3124         sz += sprintf(page+sz, "read_ahead ");
3125         if (read_ahead[MD_MAJOR] == INT_MAX)
3126                 sz += sprintf(page+sz, "not set\n");
3127         else
3128                 sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
3129 
3130         ITERATE_MDDEV(mddev,tmp) {
3131                 sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
3132                                                 mddev->pers ? "" : "in");
3133                 if (mddev->pers) {
3134                         if (mddev->ro)  
3135                                 sz += sprintf(page + sz, " (read-only)");
3136                         sz += sprintf(page + sz, " %s", mddev->pers->name);
3137                 }
3138 
3139                 size = 0;
3140                 ITERATE_RDEV(mddev,rdev,tmp2) {
3141                         sz += sprintf(page + sz, " %s[%d]",
3142                                 partition_name(rdev->dev), rdev->desc_nr);
3143                         if (rdev->faulty) {
3144                                 sz += sprintf(page + sz, "(F)");
3145                                 continue;
3146                         }
3147                         size += rdev->size;
3148                 }
3149 
3150                 if (mddev->nb_dev) {
3151                         if (mddev->pers)
3152                                 sz += sprintf(page + sz, "\n      %d blocks",
3153                                                  md_size[mdidx(mddev)]);
3154                         else
3155                                 sz += sprintf(page + sz, "\n      %d blocks", size);
3156                 }
3157 
3158                 if (!mddev->pers) {
3159                         sz += sprintf(page+sz, "\n");
3160                         continue;
3161                 }
3162 
3163                 sz += mddev->pers->status (page+sz, mddev);
3164 
3165                 sz += sprintf(page+sz, "\n      ");
3166                 if (mddev->curr_resync) {
3167                         sz += status_resync (page+sz, mddev);
3168                 } else {
3169                         if (md_atomic_read(&mddev->resync_sem.count) != 1)
3170                                 sz += sprintf(page + sz, "      resync=DELAYED");
3171                 }
3172                 sz += sprintf(page + sz, "\n");
3173         }
3174         sz += status_unused (page + sz);
3175 
3176         return sz;
3177 }
3178 
3179 int register_md_personality (int pnum, mdk_personality_t *p)
3180 {
3181         if (pnum >= MAX_PERSONALITY)
3182                 return -EINVAL;
3183 
3184         if (pers[pnum])
3185                 return -EBUSY;
3186 
3187         pers[pnum] = p;
3188         printk(KERN_INFO "%s personality registered\n", p->name);
3189         return 0;
3190 }
3191 
3192 int unregister_md_personality (int pnum)
3193 {
3194         if (pnum >= MAX_PERSONALITY)
3195                 return -EINVAL;
3196 
3197         printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
3198         pers[pnum] = NULL;
3199         return 0;
3200 }
3201 
3202 static mdp_disk_t *get_spare(mddev_t *mddev)
3203 {
3204         mdp_super_t *sb = mddev->sb;
3205         mdp_disk_t *disk;
3206         mdk_rdev_t *rdev;
3207         struct md_list_head *tmp;
3208 
3209         ITERATE_RDEV(mddev,rdev,tmp) {
3210                 if (rdev->faulty)
3211                         continue;
3212                 if (!rdev->sb) {
3213                         MD_BUG();
3214                         continue;
3215                 }
3216                 disk = &sb->disks[rdev->desc_nr];
3217                 if (disk_faulty(disk)) {
3218                         MD_BUG();
3219                         continue;
3220                 }
3221                 if (disk_active(disk))
3222                         continue;
3223                 return disk;
3224         }
3225         return NULL;
3226 }
3227 
3228 static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3229 void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3230 {
3231         unsigned int major = MAJOR(dev);
3232         unsigned int index;
3233 
3234         index = disk_index(dev);
3235         if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3236                 return;
3237 
3238         sync_io[major][index] += nr_sectors;
3239 }
3240 
3241 static int is_mddev_idle (mddev_t *mddev)
3242 {
3243         mdk_rdev_t * rdev;
3244         struct md_list_head *tmp;
3245         int idle;
3246         unsigned long curr_events;
3247 
3248         idle = 1;
3249         ITERATE_RDEV(mddev,rdev,tmp) {
3250                 int major = MAJOR(rdev->dev);
3251                 int idx = disk_index(rdev->dev);
3252 
3253                 if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3254                         continue;
3255 
3256                 curr_events = kstat.dk_drive_rblk[major][idx] +
3257                                                 kstat.dk_drive_wblk[major][idx] ;
3258                 curr_events -= sync_io[major][idx];
3259 //              printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3260                 if (curr_events != rdev->last_events) {
3261 //                      printk("!I(%ld)", curr_events - rdev->last_events);
3262                         rdev->last_events = curr_events;
3263                         idle = 0;
3264                 }
3265         }
3266         return idle;
3267 }
3268 
3269 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3270 
3271 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3272 {
3273         /* another "blocks" (1K) blocks have been synced */
3274         atomic_sub(blocks, &mddev->recovery_active);
3275         wake_up(&mddev->recovery_wait);
3276         if (!ok) {
3277                 // stop recovery, signal do_sync ....
3278         }
3279 }
3280 
3281 #define SYNC_MARKS      10
3282 #define SYNC_MARK_STEP  (3*HZ)
3283 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3284 {
3285         mddev_t *mddev2;
3286         unsigned int max_blocks, currspeed,
3287                 j, window, err, serialize;
3288         kdev_t read_disk = mddev_to_kdev(mddev);
3289         unsigned long mark[SYNC_MARKS];
3290         unsigned long mark_cnt[SYNC_MARKS];     
3291         int last_mark,m;
3292         struct md_list_head *tmp;
3293         unsigned long last_check;
3294 
3295 
3296         err = down_interruptible(&mddev->resync_sem);
3297         if (err)
3298                 goto out_nolock;
3299 
3300 recheck:
3301         serialize = 0;
3302         ITERATE_MDDEV(mddev2,tmp) {
3303                 if (mddev2 == mddev)
3304                         continue;
3305                 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3306                         printk(KERN_INFO "md: serializing resync, md%d shares one or more physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
3307                         serialize = 1;
3308                         break;
3309                 }
3310         }
3311         if (serialize) {
3312                 interruptible_sleep_on(&resync_wait);
3313                 if (md_signal_pending(current)) {
3314                         md_flush_signals();
3315                         err = -EINTR;
3316                         goto out;
3317                 }
3318                 goto recheck;
3319         }
3320 
3321         mddev->curr_resync = 1;
3322 
3323         max_blocks = mddev->sb->size;
3324 
3325         printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3326         printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3327                                                 sysctl_speed_limit_min);
3328         printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max);
3329 
3330         /*
3331          * Resync has low priority.
3332          */
3333         current->nice = 19;
3334 
3335         is_mddev_idle(mddev); /* this also initializes IO event counters */
3336         for (m = 0; m < SYNC_MARKS; m++) {
3337                 mark[m] = jiffies;
3338                 mark_cnt[m] = 0;
3339         }
3340         last_mark = 0;
3341         mddev->resync_mark = mark[last_mark];
3342         mddev->resync_mark_cnt = mark_cnt[last_mark];
3343 
3344         /*
3345          * Tune reconstruction:
3346          */
3347         window = MAX_READAHEAD*(PAGE_SIZE/1024);
3348         printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks);
3349 
3350         atomic_set(&mddev->recovery_active, 0);
3351         init_waitqueue_head(&mddev->recovery_wait);
3352         last_check = 0;
3353         for (j = 0; j < max_blocks;) {
3354                 int blocks;
3355 
3356                 blocks = mddev->pers->sync_request(mddev, j);
3357 
3358                 if (blocks < 0) {
3359                         err = blocks;
3360                         goto out;
3361                 }
3362                 atomic_add(blocks, &mddev->recovery_active);
3363                 j += blocks;
3364                 mddev->curr_resync = j;
3365 
3366                 if (last_check + window > j)
3367                         continue;
3368                 
3369                 run_task_queue(&tq_disk); //??
3370 
3371                 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3372                         /* step marks */
3373                         int next = (last_mark+1) % SYNC_MARKS;
3374                         
3375                         mddev->resync_mark = mark[next];
3376                         mddev->resync_mark_cnt = mark_cnt[next];
3377                         mark[next] = jiffies;
3378                         mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3379                         last_mark = next;
3380                 }
3381                         
3382 
3383                 if (md_signal_pending(current)) {
3384                         /*
3385                          * got a signal, exit.
3386                          */
3387                         mddev->curr_resync = 0;
3388                         printk("md_do_sync() got signal ... exiting\n");
3389                         md_flush_signals();
3390                         err = -EINTR;
3391                         goto out;
3392                 }
3393 
3394                 /*
3395                  * this loop exits only if either when we are slower than
3396                  * the 'hard' speed limit, or the system was IO-idle for
3397                  * a jiffy.
3398                  * the system might be non-idle CPU-wise, but we only care
3399                  * about not overloading the IO subsystem. (things like an
3400                  * e2fsck being done on the RAID array should execute fast)
3401                  */
3402 repeat:
3403                 if (md_need_resched(current))
3404                         schedule();
3405 
3406                 currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1;
3407 
3408                 if (currspeed > sysctl_speed_limit_min) {
3409                         current->nice = 19;
3410 
3411                         if ((currspeed > sysctl_speed_limit_max) ||
3412                                         !is_mddev_idle(mddev)) {
3413                                 current->state = TASK_INTERRUPTIBLE;
3414                                 md_schedule_timeout(HZ/4);
3415                                 if (!md_signal_pending(current))
3416                                         goto repeat;
3417                         }
3418                 } else
3419                         current->nice = -20;
3420         }
3421         fsync_dev(read_disk);
3422         printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3423         err = 0;
3424         /*
3425          * this also signals 'finished resyncing' to md_stop
3426          */
3427 out:
3428         wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3429         up(&mddev->resync_sem);
3430 out_nolock:
3431         mddev->curr_resync = 0;
3432         wake_up(&resync_wait);
3433         return err;
3434 }
3435 
3436 
3437 /*
3438  * This is a kernel thread which syncs a spare disk with the active array
3439  *
3440  * the amount of foolproofing might seem to be a tad excessive, but an
3441  * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3442  * of my root partition with the first 0.5 gigs of my /home partition ... so
3443  * i'm a bit nervous ;)
3444  */
3445 void md_do_recovery (void *data)
3446 {
3447         int err;
3448         mddev_t *mddev;
3449         mdp_super_t *sb;
3450         mdp_disk_t *spare;
3451         struct md_list_head *tmp;
3452 
3453         printk(KERN_INFO "md: recovery thread got woken up ...\n");
3454 restart:
3455         ITERATE_MDDEV(mddev,tmp) {
3456                 sb = mddev->sb;
3457                 if (!sb)
3458                         continue;
3459                 if (mddev->recovery_running)
3460                         continue;
3461                 if (sb->active_disks == sb->raid_disks)
3462                         continue;
3463                 if (!sb->spare_disks) {
3464                         printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
3465                         continue;
3466                 }
3467                 /*
3468                  * now here we get the spare and resync it.
3469                  */
3470                 if ((spare = get_spare(mddev)) == NULL)
3471                         continue;
3472                 printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3473                 if (!mddev->pers->diskop)
3474                         continue;
3475                 if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
3476                         continue;
3477                 down(&mddev->recovery_sem);
3478                 mddev->recovery_running = 1;
3479                 err = md_do_sync(mddev, spare);
3480                 if (err == -EIO) {
3481                         printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3482                         if (!disk_faulty(spare)) {
3483                                 mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
3484                                 mark_disk_faulty(spare);
3485                                 mark_disk_nonsync(spare);
3486                                 mark_disk_inactive(spare);
3487                                 sb->spare_disks--;
3488                                 sb->working_disks--;
3489                                 sb->failed_disks++;
3490                         }
3491                 } else
3492                         if (disk_faulty(spare))
3493                                 mddev->pers->diskop(mddev, &spare,
3494                                                 DISKOP_SPARE_INACTIVE);
3495                 if (err == -EINTR || err == -ENOMEM) {
3496                         /*
3497                          * Recovery got interrupted, or ran out of mem ...
3498                          * signal back that we have finished using the array.
3499                          */
3500                         mddev->pers->diskop(mddev, &spare,
3501                                                          DISKOP_SPARE_INACTIVE);
3502                         up(&mddev->recovery_sem);
3503                         mddev->recovery_running = 0;
3504                         continue;
3505                 } else {
3506                         mddev->recovery_running = 0;
3507                         up(&mddev->recovery_sem);
3508                 }
3509                 if (!disk_faulty(spare)) {
3510                         /*
3511                          * the SPARE_ACTIVE diskop possibly changes the
3512                          * pointer too
3513                          */
3514                         mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
3515                         mark_disk_sync(spare);
3516                         mark_disk_active(spare);
3517                         sb->active_disks++;
3518                         sb->spare_disks--;
3519                 }
3520                 mddev->sb_dirty = 1;
3521                 md_update_sb(mddev);
3522                 goto restart;
3523         }
3524         printk(KERN_INFO "md: recovery thread finished ...\n");
3525         
3526 }
3527 
3528 int md_notify_reboot(struct notifier_block *this,
3529                                         unsigned long code, void *x)
3530 {
3531         struct md_list_head *tmp;
3532         mddev_t *mddev;
3533 
3534         if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3535                                   || (code == MD_SYS_POWER_OFF)) {
3536 
3537                 printk(KERN_INFO "stopping all md devices.\n");
3538 
3539                 ITERATE_MDDEV(mddev,tmp)
3540                         do_md_stop (mddev, 1);
3541                 /*
3542                  * certain more exotic SCSI devices are known to be
3543                  * volatile wrt too early system reboots. While the
3544                  * right place to handle this issue is the given
3545                  * driver, we do want to have a safe RAID driver ...
3546                  */
3547                 md_mdelay(1000*1);
3548         }
3549         return NOTIFY_DONE;
3550 }
3551 
3552 struct notifier_block md_notifier = {
3553         md_notify_reboot,
3554         NULL,
3555         0
3556 };
3557 #ifndef MODULE
3558 static int md__init raid_setup(char *str)
3559 {
3560         int len, pos;
3561 
3562         len = strlen(str) + 1;
3563         pos = 0;
3564 
3565         while (pos < len) {
3566                 char *comma = strchr(str+pos, ',');
3567                 int wlen;
3568                 if (comma)
3569                         wlen = (comma-str)-pos;
3570                 else    wlen = (len-1)-pos;
3571 
3572                 if (strncmp(str, "noautodetect", wlen) == 0)
3573                         raid_setup_args.noautodetect = 1;
3574                 pos += wlen+1;
3575         }
3576         raid_setup_args.set = 1;
3577         return 1;
3578 }
3579 __setup("raid=", raid_setup);
3580 #endif
3581 static void md_geninit (void)
3582 {
3583         int i;
3584 
3585         for(i = 0; i < MAX_MD_DEVS; i++) {
3586                 md_blocksizes[i] = 1024;
3587                 md_size[i] = 0;
3588                 md_hardsect_sizes[i] = 512;
3589                 md_maxreadahead[i] = MD_READAHEAD;
3590                 register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0);
3591         }
3592         blksize_size[MAJOR_NR] = md_blocksizes;
3593         blk_size[MAJOR_NR] = md_size;
3594         max_readahead[MAJOR_NR] = md_maxreadahead;
3595         hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3596 
3597         printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3598 
3599 #ifdef CONFIG_PROC_FS
3600         create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
3601 #endif
3602 }
3603 
3604 int md__init md_init (void)
3605 {
3606         static char * name = "mdrecoveryd";
3607         
3608         printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
3609                         MD_MAJOR_VERSION, MD_MINOR_VERSION,
3610                         MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3611 
3612         if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3613         {
3614                 printk (KERN_ALERT "Unable to get major %d for md\n", MAJOR_NR);
3615                 return (-1);
3616         }
3617         devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3618         devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT,
3619                                 MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR,
3620                                 &md_fops, NULL);
3621 
3622         /* forward all md request to md_make_request */
3623         blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3624         
3625 
3626         read_ahead[MAJOR_NR] = INT_MAX;
3627         md_gendisk.next = gendisk_head;
3628 
3629         gendisk_head = &md_gendisk;
3630 
3631         md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3632         if (!md_recovery_thread)
3633                 printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
3634 
3635         md_register_reboot_notifier(&md_notifier);
3636         raid_table_header = register_sysctl_table(raid_root_table, 1);
3637 
3638         md_geninit();
3639         return (0);
3640 }
3641 
3642 #ifdef CONFIG_MD_BOOT
3643 #define MAX_MD_BOOT_DEVS        8
3644 struct {
3645         unsigned long set;
3646         int pers[MAX_MD_BOOT_DEVS];
3647         int chunk[MAX_MD_BOOT_DEVS];
3648         kdev_t devices[MAX_MD_BOOT_DEVS][MD_SB_DISKS];
3649 } md_setup_args md__initdata = { 0, };
3650 
3651 /*
3652  * Parse the command-line parameters given our kernel, but do not
3653  * actually try to invoke the MD device now; that is handled by
3654  * md_setup_drive after the low-level disk drivers have initialised.
3655  *
3656  * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3657  *             assigns the task of parsing integer arguments to the
3658  *             invoked program now).  Added ability to initialise all
3659  *             the MD devices (by specifying multiple "md=" lines)
3660  *             instead of just one.  -- KTK
3661  * 18May2000: Added support for persistant-superblock arrays:
3662  *             md=n,0,factor,fault,device-list   uses RAID0 for device n
3663  *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
3664  *             md=n,device-list      reads a RAID superblock from the devices
3665  *             elements in device-list are read by name_to_kdev_t so can be
3666  *             a hex number or something like /dev/hda1 /dev/sdb
3667  */
3668 extern kdev_t name_to_kdev_t(char *line) md__init;
3669 static int md__init md_setup(char *str)
3670 {
3671         int minor, level, factor, fault, i=0;
3672         kdev_t device;
3673         char *devnames, *pername = "";
3674 
3675         if(get_option(&str, &minor) != 2) {     /* MD Number */
3676                 printk("md: Too few arguments supplied to md=.\n");
3677                 return 0;
3678         }
3679         if (minor >= MAX_MD_BOOT_DEVS) {
3680                 printk ("md: Minor device number too high.\n");
3681                 return 0;
3682         } else if (md_setup_args.set & (1 << minor)) {
3683                 printk ("md: Warning - md=%d,... has been specified twice;\n"
3684                         "    will discard the first definition.\n", minor);
3685         }
3686         switch(get_option(&str, &level)) {      /* RAID Personality */
3687         case 2: /* could be 0 or -1.. */
3688                 if (level == 0 || level == -1) {
3689                         if (get_option(&str, &factor) != 2 ||   /* Chunk Size */
3690                             get_option(&str, &fault) != 2) {
3691                                 printk("md: Too few arguments supplied to md=.\n");
3692                                 return 0;
3693                         }
3694                         md_setup_args.pers[minor] = level;
3695                         md_setup_args.chunk[minor] = 1 << (factor+12);
3696                         switch(level) {
3697                         case -1:
3698                                 level = LINEAR;
3699                                 pername = "linear";
3700                                 break;
3701                         case 0:
3702                                 level = RAID0;
3703                                 pername = "raid0";
3704                                 break;
3705                         default:
3706                                 printk ("md: The kernel has not been configured for raid%d"
3707                                         " support!\n", level);
3708                                 return 0;
3709                         }
3710                         md_setup_args.pers[minor] = level;
3711                         break;
3712                 }
3713                 /* FALL THROUGH */
3714         case 1: /* the first device is numeric */
3715                 md_setup_args.devices[minor][i++] = level;
3716                 /* FALL THROUGH */
3717         case 0:
3718                 md_setup_args.pers[minor] = 0;
3719                 pername="super-block";
3720         }
3721         devnames = str;
3722         for (; i<MD_SB_DISKS && str; i++) {
3723                 if ((device = name_to_kdev_t(str))) {
3724                         md_setup_args.devices[minor][i] = device;
3725                 } else {
3726                         printk ("md: Unknown device name, %s.\n", str);
3727                         return 0;
3728                 }
3729                 if ((str = strchr(str, ',')) != NULL)
3730                         str++;
3731         }
3732         if (!i) {
3733                 printk ("md: No devices specified for md%d?\n", minor);
3734                 return 0;
3735         }
3736 
3737         printk ("md: Will configure md%d (%s) from %s, below.\n",
3738                 minor, pername, devnames);
3739         md_setup_args.devices[minor][i] = (kdev_t) 0;
3740         md_setup_args.set |= (1 << minor);
3741         return 1;
3742 }
3743 
3744 void md__init md_setup_drive(void)
3745 {
3746         int minor, i;
3747         kdev_t dev;
3748         mddev_t*mddev;
3749 
3750         for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) {
3751                 mdu_disk_info_t dinfo;
3752                 int err=0;
3753                 if (!(md_setup_args.set & (1 << minor)))
3754                         continue;
3755                 printk("md: Loading md%d.\n", minor);
3756                 if (mddev_map[minor].mddev) {
3757                         printk(".. md%d already autodetected - use raid=noautodetect\n", minor);
3758                         continue;
3759                 }
3760                 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3761                 if (md_setup_args.pers[minor]) {
3762                         /* non-persistent */
3763                         mdu_array_info_t ainfo;
3764                         ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3765                         ainfo.size = 0;
3766                         ainfo.nr_disks =0;
3767                         ainfo.raid_disks =0;
3768                         ainfo.md_minor =minor;
3769                         ainfo.not_persistent = 1;
3770 
3771                         ainfo.state = MD_SB_CLEAN;
3772                         ainfo.active_disks = 0;
3773                         ainfo.working_disks = 0;
3774                         ainfo.failed_disks = 0;
3775                         ainfo.spare_disks = 0;
3776                         ainfo.layout = 0;
3777                         ainfo.chunk_size = md_setup_args.chunk[minor];
3778                         err = set_array_info(mddev, &ainfo);
3779                         for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) {
3780                                 dinfo.number = i;
3781                                 dinfo.raid_disk = i;
3782                                 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
3783                                 dinfo.major = MAJOR(dev);
3784                                 dinfo.minor = MINOR(dev);
3785                                 mddev->sb->nr_disks++;
3786                                 mddev->sb->raid_disks++;
3787                                 mddev->sb->active_disks++;
3788                                 mddev->sb->working_disks++;
3789                                 err = add_new_disk (mddev, &dinfo);
3790                         }
3791                 } else {
3792                         /* persistent */
3793                         for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) {
3794                                 dinfo.major = MAJOR(dev);
3795                                 dinfo.minor = MINOR(dev);
3796                                 add_new_disk (mddev, &dinfo);
3797                         }
3798                 }
3799                 if (!err)
3800                         err = do_md_run(mddev);
3801                 if (err) {
3802                         mddev->sb_dirty = 0;
3803                         do_md_stop(mddev, 0);
3804                         printk("md: starting md%d failed\n", minor);
3805                 }
3806         }
3807 }
3808 
3809 __setup("md=", md_setup);
3810 #endif
3811 
3812 #ifdef MODULE
/* Module entry point: run md_init() and hand its result back. */
int init_module (void)
{
	int rc = md_init();

	return rc;
}
3817 
3818 static void free_device_names(void)
3819 {
3820         while (device_names.next != &device_names) {
3821                 struct list_head *tmp = device_names.next;
3822                 list_del(tmp);
3823                 kfree(tmp);
3824         }
3825 }
3826 
3827 
3828 void cleanup_module (void)
3829 {
3830         struct gendisk **gendisk_ptr;
3831 
3832         md_unregister_thread(md_recovery_thread);
3833         devfs_unregister(devfs_handle);
3834 
3835         devfs_unregister_blkdev(MAJOR_NR,"md");
3836         unregister_reboot_notifier(&md_notifier);
3837         unregister_sysctl_table(raid_table_header);
3838 #ifdef CONFIG_PROC_FS
3839         remove_proc_entry("mdstat", NULL);
3840 #endif
3841         
3842         gendisk_ptr = &gendisk_head;
3843         while (*gendisk_ptr) {
3844                 if (*gendisk_ptr == &md_gendisk) {
3845                         *gendisk_ptr = md_gendisk.next;
3846                         break;
3847                 }
3848                 gendisk_ptr = & (*gendisk_ptr)->next;
3849         }
3850         blk_dev[MAJOR_NR].queue = NULL;
3851         blksize_size[MAJOR_NR] = NULL;
3852         blk_size[MAJOR_NR] = NULL;
3853         max_readahead[MAJOR_NR] = NULL;
3854         hardsect_size[MAJOR_NR] = NULL;
3855         
3856         free_device_names();
3857 
3858 }
3859 #endif
3860 
/*
 * Init calls: md_init() is listed first; md_run_setup() is only listed
 * when RAID autodetection or boot-time "md=" support is configured.
 */
__initcall(md_init);
#if defined(CONFIG_AUTODETECT_RAID) || defined(CONFIG_MD_BOOT)
__initcall(md_run_setup);
#endif

/* Exported symbols, in alphabetical order. */
MD_EXPORT_SYMBOL(find_rdev_nr);
MD_EXPORT_SYMBOL(md_check_ordering);
MD_EXPORT_SYMBOL(md_do_sync);
MD_EXPORT_SYMBOL(md_done_sync);
MD_EXPORT_SYMBOL(md_error);
MD_EXPORT_SYMBOL(md_interrupt_thread);
MD_EXPORT_SYMBOL(md_print_devices);
MD_EXPORT_SYMBOL(md_recover_arrays);
MD_EXPORT_SYMBOL(md_register_thread);
MD_EXPORT_SYMBOL(md_size);
MD_EXPORT_SYMBOL(md_sync_acct);
MD_EXPORT_SYMBOL(md_unregister_thread);
MD_EXPORT_SYMBOL(md_update_sb);
MD_EXPORT_SYMBOL(md_wakeup_thread);
MD_EXPORT_SYMBOL(mddev_map);
MD_EXPORT_SYMBOL(partition_name);
MD_EXPORT_SYMBOL(register_md_personality);
MD_EXPORT_SYMBOL(unregister_md_personality);
3884 
3885 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.